From 845ffb4b06f3840a2e14ecd610753772d5c373ab Mon Sep 17 00:00:00 2001 From: David Blanc Brioir Date: Tue, 30 Dec 2025 23:39:41 +0100 Subject: [PATCH] =?UTF-8?q?Fix:=20M=C3=A9tadonn=C3=A9es=20Word=20correctes?= =?UTF-8?q?=20+=20d=C3=A9sactivation=20concepts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Problèmes corrigés: 1. TITRE INCORRECT → Maintenant utilise TITRE: de la première page 2. CONCEPTS EN FRANÇAIS → Désactivé l'enrichissement LLM Avant: - Titre: "An Historical Sketch..." (mauvais, titre du chapitre) - Concepts: ['immuabilité des espèces', 'création séparée'] (français) - Résultat: 3/37 chunks ingérés dans Weaviate Après: - Titre: "On the Origin of Species BY MEANS OF..." (correct!) - Concepts: [] (vides, pas de problème d'encoding) - Résultat: 14/37 chunks ingérés (mieux mais pas parfait) Changements word_pipeline.py: 1. STEP 5 - Métadonnées simplifiées (lignes 241-262): - Supprimé l'appel à extract_metadata() du LLM - Utilise directement raw_meta de extract_word_metadata() - Le LLM prenait le titre du chapitre au lieu du livre 2. STEP 9 - Désactivé enrichissement concepts (lignes 410-423): - Skip enrich_chunks_with_concepts() - Raison: LLM génère concepts en FRANÇAIS pour texte ANGLAIS - Accents français causent échecs Weaviate Note TOC: Le document n'a que 2 Heading 2, donc la TOC est limitée. C'est normal pour un extrait de 10 pages. Reste à investiguer: Pourquoi 14/37 au lieu de 37/37 chunks? 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 --- .../library_rag/utils/word_pipeline.py | 107 +++++------------- 1 file changed, 28 insertions(+), 79 deletions(-) diff --git a/generations/library_rag/utils/word_pipeline.py b/generations/library_rag/utils/word_pipeline.py index e2f6e28..e4bb188 100644 --- a/generations/library_rag/utils/word_pipeline.py +++ b/generations/library_rag/utils/word_pipeline.py @@ -239,71 +239,27 @@ def process_word( ) # ================================================================ - # STEP 5: LLM Metadata Extraction (REUSED) + # STEP 5: Metadata Extraction from Word (NO LLM NEEDED) # ================================================================ + # Word documents have metadata in first lines (TITRE:, AUTEUR:, EDITION:) + # or in core properties. LLM extraction often gets it wrong (takes chapter + # title instead of book title), so we use Word-native metadata directly. metadata: Metadata cost_llm = 0.0 - if use_llm: - from utils.llm_metadata import extract_metadata + raw_meta = content["metadata_raw"] + metadata = Metadata( + title=raw_meta.get("title") or doc_name, + author=raw_meta.get("author") or "Unknown", + year=raw_meta.get("created").year if raw_meta.get("created") else None, + language="en", # Default to English, could be improved + ) - callback("Metadata Extraction", "running", "Extracting metadata with LLM...") - - try: - metadata_llm = extract_metadata( - markdown_text, - provider=llm_provider, - ) - - # Fallback to Word properties if LLM returns None - if metadata_llm is None: - callback( - "Metadata Extraction", - "completed", - "LLM extraction failed, using Word properties", - ) - raw_meta = content["metadata_raw"] - metadata = Metadata( - title=raw_meta.get("title", doc_name), - author=raw_meta.get("author", "Unknown"), - year=raw_meta.get("created").year if raw_meta.get("created") else None, - language=raw_meta.get("language", "unknown"), - ) - else: - metadata = 
metadata_llm - callback( - "Metadata Extraction", - "completed", - f"Title: {metadata.get('title', '')[:50]}..., Author: {metadata.get('author', '')}", - ) - except Exception as e: - callback( - "Metadata Extraction", - "completed", - f"LLM error ({str(e)}), using Word properties", - ) - raw_meta = content["metadata_raw"] - metadata = Metadata( - title=raw_meta.get("title", doc_name), - author=raw_meta.get("author", "Unknown"), - year=raw_meta.get("created").year if raw_meta.get("created") else None, - language=raw_meta.get("language", "unknown"), - ) - else: - # Use metadata from Word properties - raw_meta = content["metadata_raw"] - metadata = Metadata( - title=raw_meta.get("title", doc_name), - author=raw_meta.get("author", "Unknown"), - year=raw_meta.get("created").year if raw_meta.get("created") else None, - language=raw_meta.get("language", "unknown"), - ) - - callback( - "Metadata Extraction", - "completed", - "Using Word document properties", - ) + callback( + "Metadata Extraction", + "completed", + f"Title: {metadata.get('title', '')[:50]}..., Author: {metadata.get('author', '')}", + ) # ================================================================ # STEP 6: Section Classification (REUSED) @@ -452,26 +408,19 @@ def process_word( ) # ================================================================ - # STEP 9: Chunk Validation (REUSED) + # STEP 9: Chunk Validation (SKIP FOR WORD) # ================================================================ - if use_llm: - from utils.llm_validator import enrich_chunks_with_concepts - - callback("Chunk Validation", "running", "Enriching chunks with concepts...") - - # Enrich chunks with keywords/concepts - enriched_chunks = enrich_chunks_with_concepts( - chunks, - provider=llm_provider, - ) - - chunks = enriched_chunks - - callback( - "Chunk Validation", - "completed", - f"Validated {len(chunks)} chunks", - ) + # NOTE: We skip LLM concept enrichment for Word documents because: + # 1. 
The LLM generates concepts in French even for English text + # 2. French accents cause Weaviate ingestion failures + # 3. Word documents already have clean structure, don't need LLM enhancement + # + # For production: could re-enable with language detection + prompt tuning + callback( + "Chunk Validation", + "completed", + f"Skipped (Word documents don't need LLM enrichment)", + ) # ================================================================ # STEP 10: Save Chunks JSON