feat: Add data quality verification & cleanup scripts

## Data Quality & Cleanup (Priorities 1-6)

Added a comprehensive data quality verification and cleanup system:

**Scripts créés**:
- verify_data_quality.py: Analyse qualité complète œuvre par œuvre
- clean_duplicate_documents.py: Nettoyage doublons Documents
- populate_work_collection.py/clean.py: Peuplement Work collection
- fix_chunks_count.py: Correction chunksCount incohérents
- manage_orphan_chunks.py: Gestion chunks orphelins (3 options)
- clean_orphan_works.py: Suppression Works sans chunks
- add_missing_work.py: Création Work manquant
- generate_schema_stats.py: Génération stats auto
- migrate_add_work_collection.py: Migration sûre Work collection

**Documentation**:
- WEAVIATE_GUIDE_COMPLET.md: Guide consolidé complet (600+ lignes)
- WEAVIATE_SCHEMA.md: Référence schéma rapide
- NETTOYAGE_COMPLETE_RAPPORT.md: Rapport nettoyage session
- ANALYSE_QUALITE_DONNEES.md: Analyse qualité initiale
- rapport_qualite_donnees.txt: Output brut vérification

**Résultats nettoyage**:
- Documents: 16 → 9 (7 doublons supprimés)
- Works: 0 → 9 (peuplé + nettoyé)
- Chunks: 5,404 → 5,230 (174 orphelins supprimés)
- chunksCount: Corrigés (231 → 5,230 déclaré = réel)
- Cohérence parfaite: 9 Works = 9 Documents = 9 œuvres

**Modifications code**:
- schema.py: Ajout Work collection avec vectorisation
- utils/weaviate_ingest.py: Support Work ingestion
- utils/word_pipeline.py: Désactivation concepts (problème .lower())
- utils/word_toc_extractor.py: Métadonnées Word correctes
- .gitignore: Exclusion fichiers temporaires (*.wav, output/*, NUL)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-01 11:57:26 +01:00
parent 845ffb4b06
commit 04ee3f9e39
26 changed files with 6945 additions and 16 deletions

View File

@@ -0,0 +1,356 @@
#!/usr/bin/env python3
"""Tests unitaires pour la validation stricte des métadonnées et nested objects.
Ce module teste les fonctions de validation ajoutées dans weaviate_ingest.py
pour prévenir les erreurs silencieuses causées par des métadonnées invalides.
Run:
pytest tests/test_validation_stricte.py -v
"""
import pytest
from typing import Any, Dict
from utils.weaviate_ingest import (
validate_document_metadata,
validate_chunk_nested_objects,
)
# =============================================================================
# Tests pour validate_document_metadata()
# =============================================================================
def test_validate_document_metadata_valid() -> None:
    """Accept a document whose name, title, author and language are all set."""
    valid_metadata = {"title": "La République", "author": "Platon"}

    # Completing without an exception is the success criterion.
    validate_document_metadata(
        doc_name="platon_republique",
        metadata=valid_metadata,
        language="fr",
    )
def test_validate_document_metadata_valid_with_work_key() -> None:
    """Accept metadata that uses the 'work' key in place of 'title'."""
    metadata_with_work = {"work": "Test Work", "author": "Test Author"}

    # Completing without an exception is the success criterion.
    validate_document_metadata(
        doc_name="test_doc",
        metadata=metadata_with_work,
        language="en",
    )
def test_validate_document_metadata_empty_doc_name() -> None:
    """Expect a ValueError when ``doc_name`` is the empty string."""
    complete_metadata = {"title": "Title", "author": "Author"}
    expected_failure = pytest.raises(ValueError, match="Invalid doc_name: empty")

    with expected_failure:
        validate_document_metadata(
            doc_name="",
            metadata=complete_metadata,
            language="fr",
        )
def test_validate_document_metadata_whitespace_doc_name() -> None:
    """Expect a ValueError when ``doc_name`` contains only whitespace."""
    blank_name = " "
    complete_metadata = {"title": "Title", "author": "Author"}

    # The expected message is the same one used for a truly empty name.
    with pytest.raises(ValueError, match="Invalid doc_name: empty"):
        validate_document_metadata(
            doc_name=blank_name,
            metadata=complete_metadata,
            language="fr",
        )
def test_validate_document_metadata_missing_title() -> None:
    """Expect a ValueError when the metadata has no title entry at all."""
    metadata_without_title = {"author": "Author"}
    expected_failure = pytest.raises(
        ValueError, match="'title' is missing or empty"
    )

    with expected_failure:
        validate_document_metadata(
            doc_name="test_doc",
            metadata=metadata_without_title,
            language="fr",
        )
def test_validate_document_metadata_empty_title() -> None:
    """Expect a ValueError when the title is present but empty."""
    metadata_empty_title = {"title": "", "author": "Author"}

    with pytest.raises(ValueError, match="'title' is missing or empty"):
        validate_document_metadata(
            doc_name="test_doc",
            metadata=metadata_empty_title,
            language="fr",
        )
def test_validate_document_metadata_whitespace_title() -> None:
    """Expect a ValueError when the title contains only whitespace."""
    blank_title = " "
    metadata_blank_title = {"title": blank_title, "author": "Author"}
    expected_failure = pytest.raises(
        ValueError, match="'title' is missing or empty"
    )

    with expected_failure:
        validate_document_metadata(
            doc_name="test_doc",
            metadata=metadata_blank_title,
            language="fr",
        )
def test_validate_document_metadata_missing_author() -> None:
    """Expect a ValueError when the metadata has no author entry at all."""
    metadata_without_author = {"title": "Title"}

    with pytest.raises(ValueError, match="'author' is missing or empty"):
        validate_document_metadata(
            doc_name="test_doc",
            metadata=metadata_without_author,
            language="fr",
        )
def test_validate_document_metadata_empty_author() -> None:
    """Expect a ValueError when the author is present but empty."""
    metadata_empty_author = {"title": "Title", "author": ""}
    expected_failure = pytest.raises(
        ValueError, match="'author' is missing or empty"
    )

    with expected_failure:
        validate_document_metadata(
            doc_name="test_doc",
            metadata=metadata_empty_author,
            language="fr",
        )
def test_validate_document_metadata_none_author() -> None:
    """Expect a ValueError when the author value is ``None``."""
    metadata_none_author = {"title": "Title", "author": None}

    # None must be reported with the same message as a missing/empty author.
    with pytest.raises(ValueError, match="'author' is missing or empty"):
        validate_document_metadata(
            doc_name="test_doc",
            metadata=metadata_none_author,
            language="fr",
        )
def test_validate_document_metadata_empty_language() -> None:
    """Expect a ValueError when ``language`` is the empty string."""
    complete_metadata = {"title": "Title", "author": "Author"}
    # The pattern is a regex: "Invalid language", anything, then "empty".
    expected_failure = pytest.raises(ValueError, match="Invalid language.*empty")

    with expected_failure:
        validate_document_metadata(
            doc_name="test_doc",
            metadata=complete_metadata,
            language="",
        )
def test_validate_document_metadata_optional_edition() -> None:
    """Accept an empty 'edition' value, since edition is optional."""
    metadata_empty_edition = {"title": "Title", "author": "Author", "edition": ""}

    # Completing without an exception is the success criterion.
    validate_document_metadata(
        doc_name="test_doc",
        metadata=metadata_empty_edition,
        language="fr",
    )
# =============================================================================
# Tests pour validate_chunk_nested_objects()
# =============================================================================
def test_validate_chunk_nested_objects_valid() -> None:
    """Accept a chunk whose nested 'work' and 'document' objects are filled in."""
    work_info = {"title": "La République", "author": "Platon"}
    document_info = {"sourceId": "platon_republique", "edition": "GF"}
    payload = {
        "text": "Some text",
        "work": work_info,
        "document": document_info,
    }

    # Completing without an exception is the success criterion.
    validate_chunk_nested_objects(payload, 0, "platon_republique")
def test_validate_chunk_nested_objects_empty_edition_ok() -> None:
    """Accept a chunk whose document.edition is empty (edition is optional)."""
    work_info = {"title": "Title", "author": "Author"}
    document_info = {"sourceId": "doc_id", "edition": ""}
    payload = {
        "text": "Some text",
        "work": work_info,
        "document": document_info,
    }

    # Completing without an exception is the success criterion.
    validate_chunk_nested_objects(payload, 0, "doc_id")
def test_validate_chunk_nested_objects_work_not_dict() -> None:
    """Expect a ValueError when the chunk's 'work' value is not a dict."""
    payload = {
        "text": "Some text",
        "work": "not a dict",  # a plain string instead of a nested object
        "document": {"sourceId": "doc_id", "edition": ""},
    }
    expected_failure = pytest.raises(ValueError, match="work is not a dict")

    with expected_failure:
        validate_chunk_nested_objects(payload, 5, "doc_id")
def test_validate_chunk_nested_objects_empty_work_title() -> None:
    """Expect a ValueError when work.title is the empty string."""
    work_info = {"title": "", "author": "Author"}
    payload = {
        "text": "Some text",
        "work": work_info,
        "document": {"sourceId": "doc_id", "edition": ""},
    }

    with pytest.raises(ValueError, match="work.title is empty"):
        validate_chunk_nested_objects(payload, 10, "doc_id")
def test_validate_chunk_nested_objects_none_work_title() -> None:
    """Expect a ValueError when work.title is ``None``."""
    work_info = {"title": None, "author": "Author"}
    payload = {
        "text": "Some text",
        "work": work_info,
        "document": {"sourceId": "doc_id", "edition": ""},
    }
    # None must be reported with the same message as an empty title.
    expected_failure = pytest.raises(ValueError, match="work.title is empty")

    with expected_failure:
        validate_chunk_nested_objects(payload, 3, "doc_id")
def test_validate_chunk_nested_objects_whitespace_work_title() -> None:
    """Expect a ValueError when work.title contains only whitespace."""
    blank_title = " "
    payload = {
        "text": "Some text",
        "work": {"title": blank_title, "author": "Author"},
        "document": {"sourceId": "doc_id", "edition": ""},
    }

    with pytest.raises(ValueError, match="work.title is empty"):
        validate_chunk_nested_objects(payload, 7, "doc_id")
def test_validate_chunk_nested_objects_empty_work_author() -> None:
    """Expect a ValueError when work.author is the empty string."""
    work_info = {"title": "Title", "author": ""}
    payload = {
        "text": "Some text",
        "work": work_info,
        "document": {"sourceId": "doc_id", "edition": ""},
    }
    expected_failure = pytest.raises(ValueError, match="work.author is empty")

    with expected_failure:
        validate_chunk_nested_objects(payload, 2, "doc_id")
def test_validate_chunk_nested_objects_document_not_dict() -> None:
    """Expect a ValueError when the chunk's 'document' value is not a dict."""
    not_a_mapping = ["not", "a", "dict"]
    payload = {
        "text": "Some text",
        "work": {"title": "Title", "author": "Author"},
        "document": not_a_mapping,
    }

    with pytest.raises(ValueError, match="document is not a dict"):
        validate_chunk_nested_objects(payload, 15, "doc_id")
def test_validate_chunk_nested_objects_empty_source_id() -> None:
    """Expect a ValueError when document.sourceId is the empty string."""
    document_info = {"sourceId": "", "edition": "Ed"}
    payload = {
        "text": "Some text",
        "work": {"title": "Title", "author": "Author"},
        "document": document_info,
    }
    expected_failure = pytest.raises(
        ValueError, match="document.sourceId is empty"
    )

    with expected_failure:
        validate_chunk_nested_objects(payload, 20, "doc_id")
def test_validate_chunk_nested_objects_none_source_id() -> None:
    """Expect a ValueError when document.sourceId is ``None``."""
    document_info = {"sourceId": None, "edition": "Ed"}
    payload = {
        "text": "Some text",
        "work": {"title": "Title", "author": "Author"},
        "document": document_info,
    }

    # None must be reported with the same message as an empty sourceId.
    with pytest.raises(ValueError, match="document.sourceId is empty"):
        validate_chunk_nested_objects(payload, 25, "doc_id")
def test_validate_chunk_nested_objects_error_message_includes_index() -> None:
    """Check that the error message mentions the index of the failing chunk."""
    # An empty work.title is used only as a convenient way to trigger an error.
    failing_payload = {
        "text": "Some text",
        "work": {"title": "", "author": "Author"},
        "document": {"sourceId": "doc_id", "edition": ""},
    }
    expected_failure = pytest.raises(ValueError, match="Chunk 42")

    with expected_failure:
        validate_chunk_nested_objects(failing_payload, 42, "my_doc")
def test_validate_chunk_nested_objects_error_message_includes_doc_name() -> None:
    """Check that the error message mentions the quoted document name."""
    # An empty work.title is used only as a convenient way to trigger an error.
    failing_payload = {
        "text": "Some text",
        "work": {"title": "", "author": "Author"},
        "document": {"sourceId": "doc_id", "edition": ""},
    }

    with pytest.raises(ValueError, match="'my_special_doc'"):
        validate_chunk_nested_objects(failing_payload, 5, "my_special_doc")
# =============================================================================
# Tests d'intégration (scénarios réels)
# =============================================================================
def test_integration_scenario_peirce_collected_papers() -> None:
    """Run both validators on Peirce "Collected Papers" sample data.

    Neither the document-level metadata nor the chunk built from the same
    values may raise.
    """
    source_id = "peirce_collected_papers_fixed"
    work_title = "Collected Papers of Charles Sanders Peirce"
    work_author = "Charles Sanders PEIRCE"

    # Document-level metadata must be accepted.
    validate_document_metadata(
        doc_name=source_id,
        metadata={"title": work_title, "author": work_author},
        language="en",
    )

    # A chunk carrying the same work/document information must be accepted too.
    payload = {
        "text": "Logic is the science of the necessary laws of thought...",
        "work": {"title": work_title, "author": work_author},
        "document": {
            "sourceId": source_id,
            "edition": "Harvard University Press",
        },
    }
    validate_chunk_nested_objects(payload, 0, source_id)
def test_integration_scenario_platon_menon() -> None:
    """Run both validators on Plato "Ménon" (Cousin translation) sample data.

    Neither the document-level metadata nor the chunk built from the same
    values may raise.
    """
    source_id = "Platon_-_Menon_trad._Cousin"
    edition = "trad. Cousin"

    # NOTE(review): "gr" is not the ISO 639-1 code for Greek ("el"), and the
    # sample text below is French — confirm which language value is intended.
    validate_document_metadata(
        doc_name=source_id,
        metadata={"title": "Ménon", "author": "Platon", "edition": edition},
        language="gr",
    )

    payload = {
        "text": "Peux-tu me dire, Socrate...",
        "work": {"title": "Ménon", "author": "Platon"},
        "document": {"sourceId": source_id, "edition": edition},
    }
    validate_chunk_nested_objects(payload, 0, source_id)
def test_integration_scenario_malformed_metadata_caught() -> None:
    """Check that malformed metadata is rejected before ingestion.

    Scenario: the metadata dict has a title but no author entry.
    """
    metadata_without_author = {"title": "Some Title"}  # author is missing
    expected_failure = pytest.raises(ValueError, match="'author' is missing")

    with expected_failure:
        validate_document_metadata(
            doc_name="broken_doc",
            metadata=metadata_without_author,
            language="fr",
        )
def test_integration_scenario_none_values_caught() -> None:
    """Check that ``None`` metadata values are rejected.

    Scenario: an LLM extraction step failed and returned ``None`` as author.
    """
    metadata_from_failed_llm = {"title": "Title", "author": None}

    with pytest.raises(ValueError, match="'author' is missing"):
        validate_document_metadata(
            doc_name="llm_failed_extraction",
            metadata=metadata_from_failed_llm,
            language="fr",
        )