From 0800f74bd77593fc3e4031c2436310951e53f889 Mon Sep 17 00:00:00 2001 From: David Blanc Brioir Date: Tue, 30 Dec 2025 22:39:41 +0100 Subject: [PATCH] Fix: clean_chunk attend str, pas dict MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Problème: - Erreur: "expected string or bytes-like object, got 'dict'" - À l'étape "Chunk Cleaning", on passait chunk (dict) au lieu de chunk["text"] (str) Correction word_pipeline.py (ligne 434): AVANT: ```python cleaned = clean_chunk(chunk) # chunk est un dict! ``` APRÈS: ```python text: str = chunk.get("text", "") cleaned_text = clean_chunk(text, use_llm=False) if is_chunk_valid(cleaned_text, min_chars=30, min_words=8): chunk["text"] = cleaned_text cleaned_chunks.append(chunk) ``` Pattern copié depuis pdf_pipeline.py:765-771 où la même logique extrait le texte, le nettoie, puis met à jour le dict. Test réussi: ✅ 48 paragraphes extraits ✅ 37 chunks créés ✅ Nettoyage OK ✅ Validation OK ✅ Pipeline complet fonctionnel avec Mistral API 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 --- generations/library_rag/utils/word_pipeline.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/generations/library_rag/utils/word_pipeline.py b/generations/library_rag/utils/word_pipeline.py index e8e1956..f1a8412 100644 --- a/generations/library_rag/utils/word_pipeline.py +++ b/generations/library_rag/utils/word_pipeline.py @@ -424,16 +424,24 @@ def process_word( # STEP 8: Chunk Cleaning (REUSED) # ================================================================ if use_llm: - from utils.llm_cleaner import clean_chunk + from utils.llm_cleaner import clean_chunk, is_chunk_valid callback("Chunk Cleaning", "running", "Cleaning chunks...") # Clean each chunk cleaned_chunks = [] for chunk in chunks: - cleaned = clean_chunk(chunk) - if cleaned: # Only keep valid chunks - cleaned_chunks.append(cleaned) + # Extract text from chunk dict + text: str = chunk.get("text", "") + + # Clean the text + cleaned_text = clean_chunk(text, use_llm=False) + + # Validate chunk + if is_chunk_valid(cleaned_text, min_chars=30, min_words=8): + # Update chunk with cleaned text + chunk["text"] = cleaned_text + cleaned_chunks.append(chunk) chunks = cleaned_chunks