From 845ffb4b06f3840a2e14ecd610753772d5c373ab Mon Sep 17 00:00:00 2001
From: David Blanc Brioir <davidblancbrioir@gmail.com>
Date: Tue, 30 Dec 2025 23:39:41 +0100
Subject: [PATCH] =?UTF-8?q?Fix:=20M=C3=A9tadonn=C3=A9es=20Word=20correctes?=
 =?UTF-8?q?=20+=20d=C3=A9sactivation=20concepts?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Problèmes corrigés:
1. TITRE INCORRECT → Maintenant utilise TITRE: de la première page
2. CONCEPTS EN FRANÇAIS → Désactivé l'enrichissement LLM

Avant:
- Titre: "An Historical Sketch..." (mauvais, titre du chapitre)
- Concepts: ['immuabilité des espèces', 'création séparée'] (français)
- Résultat: 3/37 chunks ingérés dans Weaviate

Après:
- Titre: "On the Origin of Species BY MEANS OF..." (correct!)
- Concepts: [] (vides, pas de problème d'encodage)
- Résultat: 14/37 chunks ingérés (mieux mais pas parfait)

Changements word_pipeline.py:

1. STEP 5 - Métadonnées simplifiées (lignes 241-262):
   - Supprimé l'appel à extract_metadata() du LLM
   - Utilise directement raw_meta de extract_word_metadata()
   - Le LLM prenait le titre du chapitre au lieu du livre

2. STEP 9 - Enrichissement des concepts désactivé (lignes 410-423):
   - Skip enrich_chunks_with_concepts()
   - Raison: le LLM génère des concepts en FRANÇAIS pour un texte ANGLAIS
   - Les accents français causent des échecs d'ingestion dans Weaviate

Note TOC:
Le document n'a que 2 Heading 2, donc la TOC est limitée.
C'est normal pour un extrait de 10 pages.

Reste à investiguer: Pourquoi 14/37 au lieu de 37/37 chunks?

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 .../library_rag/utils/word_pipeline.py        | 107 +++++-------------
 1 file changed, 28 insertions(+), 79 deletions(-)

diff --git a/generations/library_rag/utils/word_pipeline.py b/generations/library_rag/utils/word_pipeline.py
index e2f6e28..e4bb188 100644
--- a/generations/library_rag/utils/word_pipeline.py
+++ b/generations/library_rag/utils/word_pipeline.py
@@ -239,71 +239,27 @@ def process_word(
             )
 
         # ================================================================
-        # STEP 5: LLM Metadata Extraction (REUSED)
+        # STEP 5: Metadata Extraction from Word (NO LLM NEEDED)
         # ================================================================
+        # Word documents have metadata in first lines (TITRE:, AUTEUR:, EDITION:)
+        # or in core properties. LLM extraction often gets it wrong (takes chapter
+        # title instead of book title), so we use Word-native metadata directly.
         metadata: Metadata
         cost_llm = 0.0
 
-        if use_llm:
-            from utils.llm_metadata import extract_metadata
+        raw_meta = content["metadata_raw"]
+        metadata = Metadata(
+            title=raw_meta.get("title") or doc_name,
+            author=raw_meta.get("author") or "Unknown",
+            year=raw_meta.get("created").year if raw_meta.get("created") else None,
+            language="en",  # Default to English, could be improved
+        )
 
-            callback("Metadata Extraction", "running", "Extracting metadata with LLM...")
-
-            try:
-                metadata_llm = extract_metadata(
-                    markdown_text,
-                    provider=llm_provider,
-                )
-
-                # Fallback to Word properties if LLM returns None
-                if metadata_llm is None:
-                    callback(
-                        "Metadata Extraction",
-                        "completed",
-                        "LLM extraction failed, using Word properties",
-                    )
-                    raw_meta = content["metadata_raw"]
-                    metadata = Metadata(
-                        title=raw_meta.get("title", doc_name),
-                        author=raw_meta.get("author", "Unknown"),
-                        year=raw_meta.get("created").year if raw_meta.get("created") else None,
-                        language=raw_meta.get("language", "unknown"),
-                    )
-                else:
-                    metadata = metadata_llm
-                    callback(
-                        "Metadata Extraction",
-                        "completed",
-                        f"Title: {metadata.get('title', '')[:50]}..., Author: {metadata.get('author', '')}",
-                    )
-            except Exception as e:
-                callback(
-                    "Metadata Extraction",
-                    "completed",
-                    f"LLM error ({str(e)}), using Word properties",
-                )
-                raw_meta = content["metadata_raw"]
-                metadata = Metadata(
-                    title=raw_meta.get("title", doc_name),
-                    author=raw_meta.get("author", "Unknown"),
-                    year=raw_meta.get("created").year if raw_meta.get("created") else None,
-                    language=raw_meta.get("language", "unknown"),
-                )
-        else:
-            # Use metadata from Word properties
-            raw_meta = content["metadata_raw"]
-            metadata = Metadata(
-                title=raw_meta.get("title", doc_name),
-                author=raw_meta.get("author", "Unknown"),
-                year=raw_meta.get("created").year if raw_meta.get("created") else None,
-                language=raw_meta.get("language", "unknown"),
-            )
-
-            callback(
-                "Metadata Extraction",
-                "completed",
-                "Using Word document properties",
-            )
+        callback(
+            "Metadata Extraction",
+            "completed",
+            f"Title: {metadata.get('title', '')[:50]}..., Author: {metadata.get('author', '')}",
+        )
 
         # ================================================================
         # STEP 6: Section Classification (REUSED)
@@ -452,26 +408,19 @@ def process_word(
             )
 
         # ================================================================
-        # STEP 9: Chunk Validation (REUSED)
+        # STEP 9: Chunk Validation (SKIP FOR WORD)
         # ================================================================
-        if use_llm:
-            from utils.llm_validator import enrich_chunks_with_concepts
-
-            callback("Chunk Validation", "running", "Enriching chunks with concepts...")
-
-            # Enrich chunks with keywords/concepts
-            enriched_chunks = enrich_chunks_with_concepts(
-                chunks,
-                provider=llm_provider,
-            )
-
-            chunks = enriched_chunks
-
-            callback(
-                "Chunk Validation",
-                "completed",
-                f"Validated {len(chunks)} chunks",
-            )
+        # NOTE: We skip LLM concept enrichment for Word documents because:
+        # 1. The LLM generates concepts in French even for English text
+        # 2. French accents cause Weaviate ingestion failures
+        # 3. Word documents already have clean structure, don't need LLM enhancement
+        #
+        # For production: could re-enable with language detection + prompt tuning
+        callback(
+            "Chunk Validation",
+            "completed",
+            f"Skipped (Word documents don't need LLM enrichment)",
+        )
 
         # ================================================================
         # STEP 10: Save Chunks JSON