feat: Add data quality verification & cleanup scripts

## Data Quality & Cleanup (Priorities 1-6) Added comprehensive data quality verification and cleanup system: **Scripts créés**: - verify_data_quality.py: Analyse qualité complète œuvre par œuvre - clean_duplicate_documents.py: Nettoyage doublons Documents - populate_work_collection.py/clean.py: Peuplement Work collection - fix_chunks_count.py: Correction chunksCount incohérents - manage_orphan_chunks.py: Gestion chunks orphelins (3 options) - clean_orphan_works.py: Suppression Works sans chunks - add_missing_work.py: Création Work manquant - generate_schema_stats.py: Génération stats auto - migrate_add_work_collection.py: Migration sûre Work collection **Documentation**: - WEAVIATE_GUIDE_COMPLET.md: Guide consolidé complet (600+ lignes) - WEAVIATE_SCHEMA.md: Référence schéma rapide - NETTOYAGE_COMPLETE_RAPPORT.md: Rapport nettoyage session - ANALYSE_QUALITE_DONNEES.md: Analyse qualité initiale - rapport_qualite_donnees.txt: Output brut vérification **Résultats nettoyage**: - Documents: 16 → 9 (7 doublons supprimés) - Works: 0 → 9 (peuplé + nettoyé) - Chunks: 5,404 → 5,230 (174 orphelins supprimés) - chunksCount: Corrigés (231 → 5,230 déclaré = réel) - Cohérence parfaite: 9 Works = 9 Documents = 9 œuvres **Modifications code**: - schema.py: Ajout Work collection avec vectorisation - utils/weaviate_ingest.py: Support Work ingestion - utils/word_pipeline.py: Désactivation concepts (problème .lower()) - utils/word_toc_extractor.py: Métadonnées Word correctes - .gitignore: Exclusion fichiers temporaires (*.wav, output/*, NUL) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-01 11:57:26 +01:00
parent 845ffb4b06
commit 04ee3f9e39
26 changed files with 6945 additions and 16 deletions
--- a/generations/library_rag/tests/test_validation_stricte.py
+++ b/generations/library_rag/tests/test_validation_stricte.py
@@ -0,0 +1,356 @@
+#!/usr/bin/env python3
+"""Unit tests for strict validation of metadata and nested objects.
+
+This module tests the validation functions added to weaviate_ingest.py
+to prevent silent errors caused by invalid metadata.
+
+Run:
+    pytest tests/test_validation_stricte.py -v
+"""
+
+import pytest
+from typing import Any, Dict
+
+from utils.weaviate_ingest import (
+    validate_document_metadata,
+    validate_chunk_nested_objects,
+)
+
+
+# =============================================================================
+# Tests for validate_document_metadata()
+# =============================================================================
+
+
+def test_validate_document_metadata_valid() -> None:
+    """Metadata carrying both 'title' and 'author' must be accepted without raising."""
+    # Should not raise: the test passes only if no exception escapes the call
+    validate_document_metadata(
+        doc_name="platon_republique",
+        metadata={"title": "La République", "author": "Platon"},
+        language="fr",
+    )
+
+
+def test_validate_document_metadata_valid_with_work_key() -> None:
+    """Metadata that uses the 'work' key instead of 'title' must be accepted without raising."""
+    # Should not raise: there is no 'title' key here, only 'work'
+    validate_document_metadata(
+        doc_name="test_doc",
+        metadata={"work": "Test Work", "author": "Test Author"},
+        language="en",
+    )
+
+
+def test_validate_document_metadata_empty_doc_name() -> None:
+    """Expect ValueError matching 'Invalid doc_name: empty' when doc_name is an empty string."""
+    with pytest.raises(ValueError, match="Invalid doc_name: empty"):
+        validate_document_metadata(
+            doc_name="",
+            metadata={"title": "Title", "author": "Author"},
+            language="fr",
+        )
+
+
+def test_validate_document_metadata_whitespace_doc_name() -> None:
+    """Expect ValueError matching 'Invalid doc_name: empty' when doc_name is whitespace-only."""
+    with pytest.raises(ValueError, match="Invalid doc_name: empty"):
+        validate_document_metadata(
+            doc_name="   ",
+            metadata={"title": "Title", "author": "Author"},
+            language="fr",
+        )
+
+
+def test_validate_document_metadata_missing_title() -> None:
+    """Expect ValueError ("'title' is missing or empty") when metadata has no 'title' key."""
+    with pytest.raises(ValueError, match="'title' is missing or empty"):
+        validate_document_metadata(
+            doc_name="test_doc",
+            metadata={"author": "Author"},
+            language="fr",
+        )
+
+
+def test_validate_document_metadata_empty_title() -> None:
+    """Expect ValueError ("'title' is missing or empty") when 'title' is an empty string."""
+    with pytest.raises(ValueError, match="'title' is missing or empty"):
+        validate_document_metadata(
+            doc_name="test_doc",
+            metadata={"title": "", "author": "Author"},
+            language="fr",
+        )
+
+
+def test_validate_document_metadata_whitespace_title() -> None:
+    """Expect ValueError ("'title' is missing or empty") when 'title' is whitespace-only."""
+    with pytest.raises(ValueError, match="'title' is missing or empty"):
+        validate_document_metadata(
+            doc_name="test_doc",
+            metadata={"title": "   ", "author": "Author"},
+            language="fr",
+        )
+
+
+def test_validate_document_metadata_missing_author() -> None:
+    """Expect ValueError ("'author' is missing or empty") when metadata has no 'author' key."""
+    with pytest.raises(ValueError, match="'author' is missing or empty"):
+        validate_document_metadata(
+            doc_name="test_doc",
+            metadata={"title": "Title"},
+            language="fr",
+        )
+
+
+def test_validate_document_metadata_empty_author() -> None:
+    """Expect ValueError ("'author' is missing or empty") when 'author' is an empty string."""
+    with pytest.raises(ValueError, match="'author' is missing or empty"):
+        validate_document_metadata(
+            doc_name="test_doc",
+            metadata={"title": "Title", "author": ""},
+            language="fr",
+        )
+
+
+def test_validate_document_metadata_none_author() -> None:
+    """Expect ValueError ("'author' is missing or empty") when 'author' is None."""
+    with pytest.raises(ValueError, match="'author' is missing or empty"):
+        validate_document_metadata(
+            doc_name="test_doc",
+            metadata={"title": "Title", "author": None},
+            language="fr",
+        )
+
+
+def test_validate_document_metadata_empty_language() -> None:
+    """Expect ValueError matching the regex 'Invalid language.*empty' when language is ''."""
+    with pytest.raises(ValueError, match="Invalid language.*empty"):
+        validate_document_metadata(
+            doc_name="test_doc",
+            metadata={"title": "Title", "author": "Author"},
+            language="",
+        )
+
+
+def test_validate_document_metadata_optional_edition() -> None:
+    """An empty 'edition' value must be accepted: the field is optional."""
+    # Should not raise - edition is optional
+    validate_document_metadata(
+        doc_name="test_doc",
+        metadata={"title": "Title", "author": "Author", "edition": ""},
+        language="fr",
+    )
+
+
+# =============================================================================
+# Tests for validate_chunk_nested_objects()
+# =============================================================================
+
+
+def test_validate_chunk_nested_objects_valid() -> None:
+    """A chunk with fully populated 'work' and 'document' dicts must be accepted."""
+    chunk = {
+        "text": "Some text",
+        "work": {"title": "La République", "author": "Platon"},
+        "document": {"sourceId": "platon_republique", "edition": "GF"},
+    }
+    # Should not raise (arguments: chunk, chunk index, doc_name)
+    validate_chunk_nested_objects(chunk, 0, "platon_republique")
+
+
+def test_validate_chunk_nested_objects_empty_edition_ok() -> None:
+    """An empty document.edition must be accepted: the field is optional."""
+    chunk = {
+        "text": "Some text",
+        "work": {"title": "Title", "author": "Author"},
+        "document": {"sourceId": "doc_id", "edition": ""},
+    }
+    # Should not raise
+    validate_chunk_nested_objects(chunk, 0, "doc_id")
+
+
+def test_validate_chunk_nested_objects_work_not_dict() -> None:
+    """Expect ValueError matching 'work is not a dict' when chunk['work'] is a plain string."""
+    chunk = {
+        "text": "Some text",
+        "work": "not a dict",
+        "document": {"sourceId": "doc_id", "edition": ""},
+    }
+    with pytest.raises(ValueError, match="work is not a dict"):
+        validate_chunk_nested_objects(chunk, 5, "doc_id")
+
+
+def test_validate_chunk_nested_objects_empty_work_title() -> None:
+    """Test que work.title vide lève ValueError."""
+    chunk = {
+        "text": "Some text",
+        "work": {"title": "", "author": "Author"},
+        "document": {"sourceId": "doc_id", "edition": ""},
+    }
+    with pytest.raises(ValueError, match="work.title is empty"):
+        validate_chunk_nested_objects(chunk, 10, "doc_id")
+
+
+def test_validate_chunk_nested_objects_none_work_title() -> None:
+    """Test que work.title=None lève ValueError."""
+    chunk = {
+        "text": "Some text",
+        "work": {"title": None, "author": "Author"},
+        "document": {"sourceId": "doc_id", "edition": ""},
+    }
+    with pytest.raises(ValueError, match="work.title is empty"):
+        validate_chunk_nested_objects(chunk, 3, "doc_id")
+
+
+def test_validate_chunk_nested_objects_whitespace_work_title() -> None:
+    """Test que work.title whitespace-only lève ValueError."""
+    chunk = {
+        "text": "Some text",
+        "work": {"title": "   ", "author": "Author"},
+        "document": {"sourceId": "doc_id", "edition": ""},
+    }
+    with pytest.raises(ValueError, match="work.title is empty"):
+        validate_chunk_nested_objects(chunk, 7, "doc_id")
+
+
+def test_validate_chunk_nested_objects_empty_work_author() -> None:
+    """Test que work.author vide lève ValueError."""
+    chunk = {
+        "text": "Some text",
+        "work": {"title": "Title", "author": ""},
+        "document": {"sourceId": "doc_id", "edition": ""},
+    }
+    with pytest.raises(ValueError, match="work.author is empty"):
+        validate_chunk_nested_objects(chunk, 2, "doc_id")
+
+
+def test_validate_chunk_nested_objects_document_not_dict() -> None:
+    """Expect ValueError matching 'document is not a dict' when chunk['document'] is a list."""
+    chunk = {
+        "text": "Some text",
+        "work": {"title": "Title", "author": "Author"},
+        "document": ["not", "a", "dict"],
+    }
+    with pytest.raises(ValueError, match="document is not a dict"):
+        validate_chunk_nested_objects(chunk, 15, "doc_id")
+
+
+def test_validate_chunk_nested_objects_empty_source_id() -> None:
+    """Test que document.sourceId vide lève ValueError."""
+    chunk = {
+        "text": "Some text",
+        "work": {"title": "Title", "author": "Author"},
+        "document": {"sourceId": "", "edition": "Ed"},
+    }
+    with pytest.raises(ValueError, match="document.sourceId is empty"):
+        validate_chunk_nested_objects(chunk, 20, "doc_id")
+
+
+def test_validate_chunk_nested_objects_none_source_id() -> None:
+    """Test que document.sourceId=None lève ValueError."""
+    chunk = {
+        "text": "Some text",
+        "work": {"title": "Title", "author": "Author"},
+        "document": {"sourceId": None, "edition": "Ed"},
+    }
+    with pytest.raises(ValueError, match="document.sourceId is empty"):
+        validate_chunk_nested_objects(chunk, 25, "doc_id")
+
+
+def test_validate_chunk_nested_objects_error_message_includes_index() -> None:
+    """The ValueError raised for an invalid chunk at index 42 must contain 'Chunk 42'."""
+    chunk = {
+        "text": "Some text",
+        "work": {"title": "", "author": "Author"},
+        "document": {"sourceId": "doc_id", "edition": ""},
+    }
+    with pytest.raises(ValueError, match="Chunk 42"):
+        validate_chunk_nested_objects(chunk, 42, "my_doc")
+
+
+def test_validate_chunk_nested_objects_error_message_includes_doc_name() -> None:
+    """The ValueError raised for an invalid chunk must contain the single-quoted doc_name."""
+    chunk = {
+        "text": "Some text",
+        "work": {"title": "", "author": "Author"},
+        "document": {"sourceId": "doc_id", "edition": ""},
+    }
+    with pytest.raises(ValueError, match="'my_special_doc'"):
+        validate_chunk_nested_objects(chunk, 5, "my_special_doc")
+
+
+# =============================================================================
+# Integration tests (real-world scenarios)
+# =============================================================================
+
+
+def test_integration_scenario_peirce_collected_papers() -> None:
+    """Run both validators on Peirce 'Collected Papers' metadata; neither call may raise."""
+    # Valid metadata
+    validate_document_metadata(
+        doc_name="peirce_collected_papers_fixed",
+        metadata={
+            "title": "Collected Papers of Charles Sanders Peirce",
+            "author": "Charles Sanders PEIRCE",
+        },
+        language="en",
+    )
+
+    # Valid chunk
+    chunk = {
+        "text": "Logic is the science of the necessary laws of thought...",
+        "work": {
+            "title": "Collected Papers of Charles Sanders Peirce",
+            "author": "Charles Sanders PEIRCE",
+        },
+        "document": {
+            "sourceId": "peirce_collected_papers_fixed",
+            "edition": "Harvard University Press",
+        },
+    }
+    validate_chunk_nested_objects(chunk, 0, "peirce_collected_papers_fixed")
+
+
+def test_integration_scenario_platon_menon() -> None:
+    """Run both validators on Plato's 'Ménon' (Cousin translation) data; neither call may raise."""
+    validate_document_metadata(
+        doc_name="Platon_-_Menon_trad._Cousin",
+        metadata={
+            "title": "Ménon",
+            "author": "Platon",
+            "edition": "trad. Cousin",
+        },
+        language="gr",  # NOTE(review): chunk text below is French; confirm "gr" is the intended code
+    )
+
+    chunk = {
+        "text": "Peux-tu me dire, Socrate...",
+        "work": {"title": "Ménon", "author": "Platon"},
+        "document": {
+            "sourceId": "Platon_-_Menon_trad._Cousin",
+            "edition": "trad. Cousin",
+        },
+    }
+    validate_chunk_nested_objects(chunk, 0, "Platon_-_Menon_trad._Cousin")
+
+
+def test_integration_scenario_malformed_metadata_caught() -> None:
+    """Metadata lacking 'author' must raise ValueError ("'author' is missing") when validated."""
+    # Real-world scenario: metadata dict without an author
+    with pytest.raises(ValueError, match="'author' is missing"):
+        validate_document_metadata(
+            doc_name="broken_doc",
+            metadata={"title": "Some Title"},  # author is missing!
+            language="fr",
+        )
+
+
+def test_integration_scenario_none_values_caught() -> None:
+    """An 'author' of None must raise ValueError ("'author' is missing") when validated."""
+    # Real-world scenario: LLM extraction fails and returns None
+    with pytest.raises(ValueError, match="'author' is missing"):
+        validate_document_metadata(
+            doc_name="llm_failed_extraction",
+            metadata={"title": "Title", "author": None},  # LLM failed
+            language="fr",
+        )