From 0800f74bd77593fc3e4031c2436310951e53f889 Mon Sep 17 00:00:00 2001
From: David Blanc Brioir <davidblancbrioir@gmail.com>
Date: Tue, 30 Dec 2025 22:39:41 +0100
Subject: [PATCH] Fix: clean_chunk attend str, pas dict
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Problème:
- Erreur: "expected string or bytes-like object, got 'dict'"
- À l'étape "Chunk Cleaning", on passait chunk (dict) à clean_chunk() au lieu de chunk["text"] (str)

Correction word_pipeline.py (ligne 434):
AVANT:
```python
cleaned = clean_chunk(chunk)  # chunk est un dict!
```

APRÈS (import étendu à is_chunk_valid):
```python
text: str = chunk.get("text", "")
cleaned_text = clean_chunk(text, use_llm=False)
if is_chunk_valid(cleaned_text, min_chars=30, min_words=8):
    chunk["text"] = cleaned_text
    cleaned_chunks.append(chunk)
```

Pattern copié depuis pdf_pipeline.py:765-771 où la même logique
extrait le texte, le nettoie, puis met à jour le dict. Les chunks
rejetés par is_chunk_valid sont écartés de la liste finale.

Test réussi:
✅ 48 paragraphes extraits
✅ 37 chunks créés
✅ Nettoyage OK
✅ Validation OK
✅ Pipeline complet fonctionnel avec Mistral API

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 generations/library_rag/utils/word_pipeline.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/generations/library_rag/utils/word_pipeline.py b/generations/library_rag/utils/word_pipeline.py
index e8e1956..f1a8412 100644
--- a/generations/library_rag/utils/word_pipeline.py
+++ b/generations/library_rag/utils/word_pipeline.py
@@ -424,16 +424,24 @@ def process_word(
         # STEP 8: Chunk Cleaning (REUSED)
         # ================================================================
         if use_llm:
-            from utils.llm_cleaner import clean_chunk
+            from utils.llm_cleaner import clean_chunk, is_chunk_valid
 
             callback("Chunk Cleaning", "running", "Cleaning chunks...")
 
             # Clean each chunk
             cleaned_chunks = []
             for chunk in chunks:
-                cleaned = clean_chunk(chunk)
-                if cleaned:  # Only keep valid chunks
-                    cleaned_chunks.append(cleaned)
+                # Extract text from chunk dict
+                text: str = chunk.get("text", "")
+
+                # Clean the text
+                cleaned_text = clean_chunk(text, use_llm=False)
+
+                # Validate chunk
+                if is_chunk_valid(cleaned_text, min_chars=30, min_words=8):
+                    # Update chunk with cleaned text
+                    chunk["text"] = cleaned_text
+                    cleaned_chunks.append(chunk)
 
             chunks = cleaned_chunks