diff --git a/generations/library_rag/templates/upload.html b/generations/library_rag/templates/upload.html index 87d87b8..665696e 100644 --- a/generations/library_rag/templates/upload.html +++ b/generations/library_rag/templates/upload.html @@ -5,7 +5,7 @@ {% block content %}

📄 Parser PDF/Word/Markdown

-

Uploadez un fichier PDF, Word (.docx) ou Markdown pour l'analyser et structurer son contenu

+

Uploadez un document pour l'analyser et l'indexer dans Weaviate

{% if error %}
@@ -15,8 +15,9 @@
-{% endblock %} + +{% endblock %} diff --git a/generations/library_rag/utils/word_pipeline.py b/generations/library_rag/utils/word_pipeline.py index 9beba61..e8e1956 100644 --- a/generations/library_rag/utils/word_pipeline.py +++ b/generations/library_rag/utils/word_pipeline.py @@ -249,18 +249,46 @@ def process_word( callback("Metadata Extraction", "running", "Extracting metadata with LLM...") - metadata = extract_metadata( - markdown_text, - provider=llm_provider, - ) + try: + metadata_llm = extract_metadata( + markdown_text, + provider=llm_provider, + ) - # Note: extract_metadata doesn't return cost directly - - callback( - "Metadata Extraction", - "completed", - f"Title: {metadata['title'][:50]}..., Author: {metadata['author']}", - ) + # Fallback to Word properties if LLM returns None + if metadata_llm is None: + callback( + "Metadata Extraction", + "completed", + "LLM extraction failed, using Word properties", + ) + raw_meta = content["metadata_raw"] + metadata = Metadata( + title=raw_meta.get("title", doc_name), + author=raw_meta.get("author", "Unknown"), + year=raw_meta.get("created").year if raw_meta.get("created") else None, + language=raw_meta.get("language", "unknown"), + ) + else: + metadata = metadata_llm + callback( + "Metadata Extraction", + "completed", + f"Title: {metadata.get('title', '')[:50]}..., Author: {metadata.get('author', '')}", + ) + except Exception as e: + callback( + "Metadata Extraction", + "completed", + f"LLM error ({str(e)}), using Word properties", + ) + raw_meta = content["metadata_raw"] + metadata = Metadata( + title=raw_meta.get("title", doc_name), + author=raw_meta.get("author", "Unknown"), + year=raw_meta.get("created").year if raw_meta.get("created") else None, + language=raw_meta.get("language", "unknown"), + ) else: # Use metadata from Word properties raw_meta = content["metadata_raw"] @@ -303,7 +331,7 @@ def process_word( main_sections = [ s for s in classified_sections - if s["section_type"] == "main_content" + if s.get("type") == "main_content" ] callback( @@ -316,8 +344,9 @@ def process_word( classified_sections = [ { "section_path": entry["sectionPath"], - "section_type": "main_content", - "reason": "No LLM classification", + "type": "main_content", + "should_index": True, + "classification_reason": "No LLM classification", } for entry in toc_flat ]