From 9e4108def17e4b6ec1abe7a1beaebb9679750d89 Mon Sep 17 00:00:00 2001
From: David Blanc Brioir <davidblancbrioir@gmail.com>
Date: Tue, 30 Dec 2025 22:03:50 +0100
Subject: [PATCH] =?UTF-8?q?Int=C3=A9gration=20Word=20dans=20Flask:=20uploa?=
 =?UTF-8?q?d=20et=20traitement=20web?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Modifications:
- flask_app.py:
  * Ajout de "docx" dans ALLOWED_EXTENSIONS
  * Nouvelle fonction run_word_processing_job() avec:
    - Gestion tempfile pour python-docx (besoin d'un path)
    - Intégration du callback de progression SSE
    - Nettoyage automatique du fichier temporaire
  * Modification upload() route:
    - Détection du type de fichier (PDF/Word)
    - Routage vers le bon processeur (run_processing_job vs run_word_processing_job)
    - Messages d'erreur adaptés pour PDF et Word
  * Mise à jour des docstrings

- templates/upload.html:
  * Titre: "Parser PDF/Word/Markdown" (au lieu de PDF/Markdown)
  * Accept attribute: ".pdf,.docx,.md"
  * Tooltips: Expliquent que Word n'a pas besoin d'OCR
  * Pipeline de traitement: Section séparée pour PDF vs Word
  * Labels mis à jour pour inclure Word

Fonctionnalités:
✅ Upload de fichiers .docx via interface web
✅ Traitement en arrière-plan avec SSE
✅ Pas d'OCR nécessaire pour Word (économie ~0.003€/page)
✅ Réutilisation complète des modules LLM existants
✅ Extraction directe via python-docx
✅ Construction TOC depuis styles Heading 1-9

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 generations/library_rag/flask_app.py          | 125 ++++++++++++++++--
 generations/library_rag/templates/upload.html |  34 +++--
 2 files changed, 132 insertions(+), 27 deletions(-)

diff --git a/generations/library_rag/flask_app.py b/generations/library_rag/flask_app.py
index e13ab29..5c349f6 100644
--- a/generations/library_rag/flask_app.py
+++ b/generations/library_rag/flask_app.py
@@ -97,7 +97,7 @@ app.config["SECRET_KEY"] = os.environ.get("SECRET_KEY", "dev-secret-key-change-i
 # Configuration upload
 app.config["UPLOAD_FOLDER"] = Path(__file__).parent / "output"
 app.config["MAX_CONTENT_LENGTH"] = 50 * 1024 * 1024  # 50 MB max
-ALLOWED_EXTENSIONS = {"pdf", "md"}
+ALLOWED_EXTENSIONS = {"pdf", "md", "docx"}
 
 # Stockage des jobs de traitement en cours
 processing_jobs: Dict[str, Dict[str, Any]] = {}  # {job_id: {"status": str, "queue": Queue, "result": dict}}
@@ -1940,23 +1940,108 @@ def run_processing_job(
         q.put(exception_event)
 
 
+def run_word_processing_job(
+    job_id: str,
+    file_bytes: bytes,
+    filename: str,
+    options: ProcessingOptions,
+) -> None:
+    """Execute Word processing in background with SSE event emission.
+
+    Exceptions raised while processing are caught and reported on the job
+    queue as an "error" event instead of propagating to the caller.
+
+    Args:
+        job_id: Unique identifier for this processing job.
+        file_bytes: Raw Word file content (.docx).
+        filename: Original filename for the Word document.
+        options: Processing options (LLM settings, etc.).
+    """
+    job: Dict[str, Any] = processing_jobs[job_id]
+    q: queue.Queue[SSEEvent] = job["queue"]
+
+    try:
+        import tempfile
+        from typing import cast
+        from utils.types import LLMProvider, PipelineResult
+        from utils.word_pipeline import process_word
+
+        def progress_callback(step: str, status: str, detail: str = "") -> None:
+            # Forward each pipeline progress update to the SSE queue.
+            event: SSEEvent = {
+                "type": "step",
+                "step": step,
+                "status": status,
+                "detail": detail or None,
+            }
+            q.put(event)
+
+        # python-docx needs a file path; delete=False so it outlives the handle.
+        tmp_file = tempfile.NamedTemporaryFile(suffix=".docx", delete=False)
+        tmp_path = Path(tmp_file.name)
+        try:
+            # Write inside the try so a failed write still triggers cleanup.
+            with tmp_file:
+                tmp_file.write(file_bytes)
+
+            result: PipelineResult = process_word(
+                tmp_path,
+                use_llm=options["use_llm"],
+                llm_provider=cast(LLMProvider, options["llm_provider"]),
+                use_semantic_chunking=True,
+                ingest_to_weaviate=options["ingest_weaviate"],
+                skip_metadata_lines=5,
+                extract_images=True,
+                progress_callback=progress_callback,
+            )
+            job["result"] = result
+
+            if result.get("success"):
+                job["status"] = "complete"
+                doc_name: str = result.get("document_name", Path(filename).stem)
+                complete_event: SSEEvent = {
+                    "type": "complete",
+                    "redirect": f"/documents/{doc_name}/view",
+                }
+                q.put(complete_event)
+            else:
+                job["status"] = "error"
+                error_event: SSEEvent = {
+                    "type": "error",
+                    "message": result.get("error", "Erreur inconnue"),
+                }
+                q.put(error_event)
+        finally:
+            # Remove the temporary file whether or not processing succeeded.
+            tmp_path.unlink(missing_ok=True)
+
+    except Exception as e:
+        job["status"] = "error"
+        job["result"] = {"error": str(e)}
+        exception_event: SSEEvent = {
+            "type": "error",
+            "message": str(e),
+        }
+        q.put(exception_event)
+
+
 @app.route("/upload", methods=["GET", "POST"])
 def upload() -> str:
-    """Handle PDF upload form display and file submission.
+    """Handle PDF/Word upload form display and file submission.
 
     GET: Displays the upload form with processing options.
-    POST: Validates the uploaded PDF, starts background processing, and
-    redirects to the progress page.
+    POST: Validates the uploaded file (PDF or Word), starts background processing,
+    and redirects to the progress page.
 
     Form Parameters (POST):
-        file: PDF file to upload (required, max 50MB).
+        file: PDF (.pdf) or Word (.docx) file to upload (required, max 50MB).
         llm_provider (str): LLM provider - "mistral" or "ollama". Defaults to "mistral".
         llm_model (str): Specific model name. Defaults based on provider.
-        skip_ocr (bool): Skip OCR if markdown already exists. Defaults to False.
+        skip_ocr (bool): Skip OCR if markdown already exists (PDF only). Defaults to False.
         use_llm (bool): Enable LLM processing steps. Defaults to True.
         ingest_weaviate (bool): Ingest chunks to Weaviate. Defaults to True.
-        use_ocr_annotations (bool): Use OCR annotations for better TOC. Defaults to False.
-        max_toc_pages (int): Max pages to scan for TOC. Defaults to 8.
+        use_ocr_annotations (bool): Use OCR annotations for better TOC (PDF only). Defaults to False.
+        max_toc_pages (int): Max pages to scan for TOC (PDF only). Defaults to 8.
 
     Returns:
         GET: Rendered upload form (upload.html).
@@ -1980,7 +2065,7 @@ def upload() -> str:
         return render_template("upload.html", error="Aucun fichier sélectionné")
 
     if not allowed_file(file.filename):
-        return render_template("upload.html", error="Format non supporté. Utilisez un fichier PDF ou Markdown (.md).")
+        return render_template("upload.html", error="Format non supporté. Utilisez un fichier PDF (.pdf) ou Word (.docx).")
 
     # Options de traitement
     llm_provider: str = request.form.get("llm_provider", "mistral")
@@ -2000,6 +2085,10 @@ def upload() -> str:
     filename: str = secure_filename(file.filename)
     file_bytes: bytes = file.read()
 
+    # Déterminer le type de fichier
+    file_extension: str = filename.rsplit(".", 1)[1].lower() if "." in filename else ""
+    is_word_document: bool = file_extension == "docx"
+
     # Créer un job de traitement
     job_id: str = str(uuid.uuid4())
     processing_jobs[job_id] = {
@@ -2009,15 +2098,23 @@ def upload() -> str:
         "filename": filename,
     }
 
-    # Démarrer le traitement en background
-    thread: threading.Thread = threading.Thread(
-        target=run_processing_job,
-        args=(job_id, file_bytes, filename, options)
-    )
+    # Démarrer le traitement en background (Word ou PDF)
+    if is_word_document:
+        thread: threading.Thread = threading.Thread(
+            target=run_word_processing_job,
+            args=(job_id, file_bytes, filename, options)
+        )
+    else:
+        thread: threading.Thread = threading.Thread(
+            target=run_processing_job,
+            args=(job_id, file_bytes, filename, options)
+        )
+
     thread.daemon = True
     thread.start()
 
     # Afficher la page de progression
+    file_type_label: str = "Word" if is_word_document else "PDF"
     return render_template("upload_progress.html", job_id=job_id, filename=filename)
 
 
diff --git a/generations/library_rag/templates/upload.html b/generations/library_rag/templates/upload.html
index d0071dd..87d87b8 100644
--- a/generations/library_rag/templates/upload.html
+++ b/generations/library_rag/templates/upload.html
@@ -4,8 +4,8 @@
 
 {% block content %}
 <section class="section">
-    <h1>📄 Parser PDF/Markdown</h1>
-    <p class="lead">Uploadez un fichier PDF ou Markdown pour l'analyser et structurer son contenu</p>
+    <h1>📄 Parser PDF/Word/Markdown</h1>
+    <p class="lead">Uploadez un fichier PDF, Word (.docx) ou Markdown pour l'analyser et structurer son contenu</p>
 
     {% if error %}
         <div class="alert alert-warning">
@@ -16,18 +16,19 @@
     <div class="search-box">
         <form method="post" enctype="multipart/form-data">
             <div class="form-group">
-                <label class="form-label" for="file">Fichier PDF ou Markdown</label>
-                <input 
-                    type="file" 
-                    name="file" 
-                    id="file" 
+                <label class="form-label" for="file">Fichier PDF, Word ou Markdown</label>
+                <input
+                    type="file"
+                    name="file"
+                    id="file"
                     class="form-control"
-                    accept=".pdf,.md"
+                    accept=".pdf,.docx,.md"
                     required
                 >
                 <div class="caption mt-1">Taille maximale : 50 MB</div>
                 <div class="caption" style="color: var(--color-accent); margin-top: 0.25rem;">
-                    💡 Pour retester un document existant sans refaire l'OCR payant, cochez "Skip OCR"
+                    💡 PDF: Pour retester sans refaire l'OCR payant, cochez "Skip OCR"
+                    <br>💡 Word: Pas d'OCR nécessaire (extraction directe du contenu)
                 </div>
             </div>
 
@@ -150,10 +151,17 @@
     <div class="card">
         <h3>📋 Pipeline de traitement</h3>
         <div class="mt-2">
-            <p><strong>1. OCR Mistral</strong> — Extraction du texte et des images via l'API Mistral</p>
-            <p><strong>2. Markdown</strong> — Construction du document Markdown avec images</p>
-            <p><strong>3. Hiérarchie</strong> — Analyse des titres pour créer une structure arborescente</p>
-            <p><strong>4. LLM (optionnel)</strong> — Amélioration de la structure via Ollama</p>
+            <p><strong>PDF:</strong></p>
+            <p style="margin-left: 1rem;">1. OCR Mistral — Extraction du texte et des images via l'API Mistral</p>
+            <p style="margin-left: 1rem;">2. Markdown — Construction du document Markdown avec images</p>
+            <p style="margin-left: 1rem;">3. Hiérarchie — Analyse des titres pour créer une structure arborescente</p>
+            <p style="margin-left: 1rem;">4. LLM (optionnel) — Amélioration de la structure via Ollama/Mistral</p>
+
+            <p class="mt-3"><strong>Word (.docx):</strong></p>
+            <p style="margin-left: 1rem;">1. Extraction Word — Lecture directe du contenu (pas d'OCR)</p>
+            <p style="margin-left: 1rem;">2. Markdown — Construction du document Markdown à partir des paragraphes</p>
+            <p style="margin-left: 1rem;">3. TOC — Extraction de la hiérarchie depuis les styles Heading 1-9</p>
+            <p style="margin-left: 1rem;">4. LLM — Structuration sémantique et enrichissement</p>
         </div>
     </div>