Intégration Word dans Flask: upload et traitement web
Modifications:
- flask_app.py:
* Ajout de "docx" dans ALLOWED_EXTENSIONS
* Nouvelle fonction run_word_processing_job() avec:
- Gestion tempfile pour python-docx (besoin d'un path)
- Intégration du callback de progression SSE
- Nettoyage automatique du fichier temporaire
* Modification de la route upload():
- Détection du type de fichier (PDF/Word)
- Routage vers le bon processeur (run_processing_job vs run_word_processing_job)
- Messages d'erreur adaptés pour PDF et Word
* Mise à jour des docstrings
- templates/upload.html:
* Titre: "Parser PDF/Word/Markdown" (au lieu de PDF/Markdown)
* Accept attribute: ".pdf,.docx,.md"
* Tooltips: expliquent que Word n'a pas besoin d'OCR
* Pipeline de traitement: Section séparée pour PDF vs Word
* Labels mis à jour pour inclure Word
Fonctionnalités:
✅ Upload de fichiers .docx via interface web
✅ Traitement en arrière-plan avec SSE
✅ Pas d'OCR nécessaire pour Word (économie ~0.003€/page)
✅ Réutilisation complète des modules LLM existants
✅ Extraction directe via python-docx
✅ Construction TOC depuis styles Heading 1-9
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -97,7 +97,7 @@ app.config["SECRET_KEY"] = os.environ.get("SECRET_KEY", "dev-secret-key-change-i
|
|||||||
# Configuration upload
|
# Configuration upload
|
||||||
app.config["UPLOAD_FOLDER"] = Path(__file__).parent / "output"
|
app.config["UPLOAD_FOLDER"] = Path(__file__).parent / "output"
|
||||||
app.config["MAX_CONTENT_LENGTH"] = 50 * 1024 * 1024 # 50 MB max
|
app.config["MAX_CONTENT_LENGTH"] = 50 * 1024 * 1024 # 50 MB max
|
||||||
ALLOWED_EXTENSIONS = {"pdf", "md"}
|
ALLOWED_EXTENSIONS = {"pdf", "md", "docx"}
|
||||||
|
|
||||||
# Stockage des jobs de traitement en cours
|
# Stockage des jobs de traitement en cours
|
||||||
processing_jobs: Dict[str, Dict[str, Any]] = {} # {job_id: {"status": str, "queue": Queue, "result": dict}}
|
processing_jobs: Dict[str, Dict[str, Any]] = {} # {job_id: {"status": str, "queue": Queue, "result": dict}}
|
||||||
@@ -1940,23 +1940,108 @@ def run_processing_job(
|
|||||||
q.put(exception_event)
|
q.put(exception_event)
|
||||||
|
|
||||||
|
|
||||||
|
def run_word_processing_job(
    job_id: str,
    file_bytes: bytes,
    filename: str,
    options: ProcessingOptions,
) -> None:
    """Execute Word processing in background with SSE event emission.

    The uploaded bytes are written to a temporary ``.docx`` file, handed to
    ``process_word`` as a path, and the temporary file is removed afterwards
    whether processing succeeded or not. Progress and the final outcome are
    pushed onto the job's queue as SSE events:

    * ``{"type": "step", ...}`` for every progress callback invocation,
    * ``{"type": "complete", "redirect": ...}`` when the pipeline result
      reports success,
    * ``{"type": "error", "message": ...}`` when the pipeline reports a
      failure or any exception is raised.

    ``job["status"]`` and ``job["result"]`` are updated accordingly. Nothing is
    raised to the caller; every exception is converted into an error event.

    Args:
        job_id: Unique identifier for this processing job. Must already be a
            key of ``processing_jobs`` (a missing key raises ``KeyError``
            before any event can be emitted).
        file_bytes: Raw Word file content (.docx).
        filename: Original filename for the Word document. Its stem is the
            fallback document name used in the redirect URL.
        options: Processing options (LLM settings, etc.). The keys read here
            are ``use_llm``, ``llm_provider`` and ``ingest_weaviate``.
    """
    job: Dict[str, Any] = processing_jobs[job_id]
    q: queue.Queue[SSEEvent] = job["queue"]

    try:
        import logging
        import tempfile
        from typing import cast

        from utils.types import LLMProvider, PipelineResult
        from utils.word_pipeline import process_word

        def progress_callback(step: str, status: str, detail: str = "") -> None:
            """Forward one pipeline progress update to the SSE queue."""
            event: SSEEvent = {
                "type": "step",
                "step": step,
                "status": status,
                # An empty detail is sent as None rather than "".
                "detail": detail if detail else None,
            }
            q.put(event)

        # process_word is given a filesystem path, so the upload is spilled to
        # a temporary file. delete=False keeps the file on disk after it is
        # closed; removal is handled explicitly in the finally clause below.
        tmp_file = tempfile.NamedTemporaryFile(suffix=".docx", delete=False)
        tmp_path: Path = Path(tmp_file.name)

        try:
            # The write happens inside the try/finally so that a failed write
            # (e.g. disk full) does not leave the temporary file behind.
            with tmp_file:
                tmp_file.write(file_bytes)

            result: PipelineResult = process_word(
                tmp_path,
                use_llm=options["use_llm"],
                llm_provider=cast(LLMProvider, options["llm_provider"]),
                use_semantic_chunking=True,
                ingest_to_weaviate=options["ingest_weaviate"],
                skip_metadata_lines=5,
                extract_images=True,
                progress_callback=progress_callback,
            )

            job["result"] = result

            if result.get("success"):
                job["status"] = "complete"
                doc_name: str = result.get("document_name", Path(filename).stem)
                complete_event: SSEEvent = {
                    "type": "complete",
                    "redirect": f"/documents/{doc_name}/view",
                }
                q.put(complete_event)
            else:
                job["status"] = "error"
                error_event: SSEEvent = {
                    "type": "error",
                    "message": result.get("error", "Erreur inconnue"),
                }
                q.put(error_event)

        finally:
            # Clean up the temporary file. A cleanup failure must not reach
            # the outer handler: it would overwrite the status/result of a job
            # that already finished and queue a spurious error event after the
            # "complete" event.
            try:
                tmp_path.unlink()
            except FileNotFoundError:
                pass  # Already gone; nothing to clean up.
            except OSError as cleanup_error:
                logging.getLogger(__name__).warning(
                    "Could not remove temporary file %s: %s",
                    tmp_path,
                    cleanup_error,
                )

    except Exception as e:
        # Top-level boundary of the background thread: report the failure to
        # the client through the queue instead of letting the thread die.
        job["status"] = "error"
        job["result"] = {"error": str(e)}
        exception_event: SSEEvent = {
            "type": "error",
            "message": str(e),
        }
        q.put(exception_event)
|
||||||
|
|
||||||
|
|
||||||
@app.route("/upload", methods=["GET", "POST"])
|
@app.route("/upload", methods=["GET", "POST"])
|
||||||
def upload() -> str:
|
def upload() -> str:
|
||||||
"""Handle PDF upload form display and file submission.
|
"""Handle PDF/Word upload form display and file submission.
|
||||||
|
|
||||||
GET: Displays the upload form with processing options.
|
GET: Displays the upload form with processing options.
|
||||||
POST: Validates the uploaded PDF, starts background processing, and
|
POST: Validates the uploaded file (PDF or Word), starts background processing,
|
||||||
redirects to the progress page.
|
and redirects to the progress page.
|
||||||
|
|
||||||
Form Parameters (POST):
|
Form Parameters (POST):
|
||||||
file: PDF file to upload (required, max 50MB).
|
file: PDF (.pdf) or Word (.docx) file to upload (required, max 50MB).
|
||||||
llm_provider (str): LLM provider - "mistral" or "ollama". Defaults to "mistral".
|
llm_provider (str): LLM provider - "mistral" or "ollama". Defaults to "mistral".
|
||||||
llm_model (str): Specific model name. Defaults based on provider.
|
llm_model (str): Specific model name. Defaults based on provider.
|
||||||
skip_ocr (bool): Skip OCR if markdown already exists. Defaults to False.
|
skip_ocr (bool): Skip OCR if markdown already exists (PDF only). Defaults to False.
|
||||||
use_llm (bool): Enable LLM processing steps. Defaults to True.
|
use_llm (bool): Enable LLM processing steps. Defaults to True.
|
||||||
ingest_weaviate (bool): Ingest chunks to Weaviate. Defaults to True.
|
ingest_weaviate (bool): Ingest chunks to Weaviate. Defaults to True.
|
||||||
use_ocr_annotations (bool): Use OCR annotations for better TOC. Defaults to False.
|
use_ocr_annotations (bool): Use OCR annotations for better TOC (PDF only). Defaults to False.
|
||||||
max_toc_pages (int): Max pages to scan for TOC. Defaults to 8.
|
max_toc_pages (int): Max pages to scan for TOC (PDF only). Defaults to 8.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
GET: Rendered upload form (upload.html).
|
GET: Rendered upload form (upload.html).
|
||||||
@@ -1980,7 +2065,7 @@ def upload() -> str:
|
|||||||
return render_template("upload.html", error="Aucun fichier sélectionné")
|
return render_template("upload.html", error="Aucun fichier sélectionné")
|
||||||
|
|
||||||
if not allowed_file(file.filename):
|
if not allowed_file(file.filename):
|
||||||
return render_template("upload.html", error="Format non supporté. Utilisez un fichier PDF ou Markdown (.md).")
|
return render_template("upload.html", error="Format non supporté. Utilisez un fichier PDF (.pdf) ou Word (.docx).")
|
||||||
|
|
||||||
# Options de traitement
|
# Options de traitement
|
||||||
llm_provider: str = request.form.get("llm_provider", "mistral")
|
llm_provider: str = request.form.get("llm_provider", "mistral")
|
||||||
@@ -2000,6 +2085,10 @@ def upload() -> str:
|
|||||||
filename: str = secure_filename(file.filename)
|
filename: str = secure_filename(file.filename)
|
||||||
file_bytes: bytes = file.read()
|
file_bytes: bytes = file.read()
|
||||||
|
|
||||||
|
# Déterminer le type de fichier
|
||||||
|
file_extension: str = filename.rsplit(".", 1)[1].lower() if "." in filename else ""
|
||||||
|
is_word_document: bool = file_extension == "docx"
|
||||||
|
|
||||||
# Créer un job de traitement
|
# Créer un job de traitement
|
||||||
job_id: str = str(uuid.uuid4())
|
job_id: str = str(uuid.uuid4())
|
||||||
processing_jobs[job_id] = {
|
processing_jobs[job_id] = {
|
||||||
@@ -2009,15 +2098,23 @@ def upload() -> str:
|
|||||||
"filename": filename,
|
"filename": filename,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Démarrer le traitement en background
|
# Démarrer le traitement en background (Word ou PDF)
|
||||||
|
if is_word_document:
|
||||||
|
thread: threading.Thread = threading.Thread(
|
||||||
|
target=run_word_processing_job,
|
||||||
|
args=(job_id, file_bytes, filename, options)
|
||||||
|
)
|
||||||
|
else:
|
||||||
thread: threading.Thread = threading.Thread(
|
thread: threading.Thread = threading.Thread(
|
||||||
target=run_processing_job,
|
target=run_processing_job,
|
||||||
args=(job_id, file_bytes, filename, options)
|
args=(job_id, file_bytes, filename, options)
|
||||||
)
|
)
|
||||||
|
|
||||||
thread.daemon = True
|
thread.daemon = True
|
||||||
thread.start()
|
thread.start()
|
||||||
|
|
||||||
# Afficher la page de progression
|
# Afficher la page de progression
|
||||||
|
file_type_label: str = "Word" if is_word_document else "PDF"
|
||||||
return render_template("upload_progress.html", job_id=job_id, filename=filename)
|
return render_template("upload_progress.html", job_id=job_id, filename=filename)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -4,8 +4,8 @@
|
|||||||
|
|
||||||
{% block content %}
|
{% block content %}
|
||||||
<section class="section">
|
<section class="section">
|
||||||
<h1>📄 Parser PDF/Markdown</h1>
|
<h1>📄 Parser PDF/Word/Markdown</h1>
|
||||||
<p class="lead">Uploadez un fichier PDF ou Markdown pour l'analyser et structurer son contenu</p>
|
<p class="lead">Uploadez un fichier PDF, Word (.docx) ou Markdown pour l'analyser et structurer son contenu</p>
|
||||||
|
|
||||||
{% if error %}
|
{% if error %}
|
||||||
<div class="alert alert-warning">
|
<div class="alert alert-warning">
|
||||||
@@ -16,18 +16,19 @@
|
|||||||
<div class="search-box">
|
<div class="search-box">
|
||||||
<form method="post" enctype="multipart/form-data">
|
<form method="post" enctype="multipart/form-data">
|
||||||
<div class="form-group">
|
<div class="form-group">
|
||||||
<label class="form-label" for="file">Fichier PDF ou Markdown</label>
|
<label class="form-label" for="file">Fichier PDF, Word ou Markdown</label>
|
||||||
<input
|
<input
|
||||||
type="file"
|
type="file"
|
||||||
name="file"
|
name="file"
|
||||||
id="file"
|
id="file"
|
||||||
class="form-control"
|
class="form-control"
|
||||||
accept=".pdf,.md"
|
accept=".pdf,.docx,.md"
|
||||||
required
|
required
|
||||||
>
|
>
|
||||||
<div class="caption mt-1">Taille maximale : 50 MB</div>
|
<div class="caption mt-1">Taille maximale : 50 MB</div>
|
||||||
<div class="caption" style="color: var(--color-accent); margin-top: 0.25rem;">
|
<div class="caption" style="color: var(--color-accent); margin-top: 0.25rem;">
|
||||||
💡 Pour retester un document existant sans refaire l'OCR payant, cochez "Skip OCR"
|
💡 PDF: Pour retester sans refaire l'OCR payant, cochez "Skip OCR"
|
||||||
|
<br>💡 Word: Pas d'OCR nécessaire (extraction directe du contenu)
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
@@ -150,10 +151,17 @@
|
|||||||
<div class="card">
|
<div class="card">
|
||||||
<h3>📋 Pipeline de traitement</h3>
|
<h3>📋 Pipeline de traitement</h3>
|
||||||
<div class="mt-2">
|
<div class="mt-2">
|
||||||
<p><strong>1. OCR Mistral</strong> — Extraction du texte et des images via l'API Mistral</p>
|
<p><strong>PDF:</strong></p>
|
||||||
<p><strong>2. Markdown</strong> — Construction du document Markdown avec images</p>
|
<p style="margin-left: 1rem;">1. OCR Mistral — Extraction du texte et des images via l'API Mistral</p>
|
||||||
<p><strong>3. Hiérarchie</strong> — Analyse des titres pour créer une structure arborescente</p>
|
<p style="margin-left: 1rem;">2. Markdown — Construction du document Markdown avec images</p>
|
||||||
<p><strong>4. LLM (optionnel)</strong> — Amélioration de la structure via Ollama</p>
|
<p style="margin-left: 1rem;">3. Hiérarchie — Analyse des titres pour créer une structure arborescente</p>
|
||||||
|
<p style="margin-left: 1rem;">4. LLM (optionnel) — Amélioration de la structure via Ollama/Mistral</p>
|
||||||
|
|
||||||
|
<p class="mt-3"><strong>Word (.docx):</strong></p>
|
||||||
|
<p style="margin-left: 1rem;">1. Extraction Word — Lecture directe du contenu (pas d'OCR)</p>
|
||||||
|
<p style="margin-left: 1rem;">2. Markdown — Construction du document Markdown à partir des paragraphes</p>
|
||||||
|
<p style="margin-left: 1rem;">3. TOC — Extraction de la hiérarchie depuis les styles Heading 1-9</p>
|
||||||
|
<p style="margin-left: 1rem;">4. LLM — Structuration sémantique et enrichissement</p>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user