## Data Quality & Cleanup (Priorities 1-6)

Added comprehensive data quality verification and cleanup system:

**Scripts créés**:
- verify_data_quality.py: Analyse qualité complète œuvre par œuvre
- clean_duplicate_documents.py: Nettoyage doublons Documents
- populate_work_collection.py/clean.py: Peuplement Work collection
- fix_chunks_count.py: Correction chunksCount incohérents
- manage_orphan_chunks.py: Gestion chunks orphelins (3 options)
- clean_orphan_works.py: Suppression Works sans chunks
- add_missing_work.py: Création Work manquant
- generate_schema_stats.py: Génération stats auto
- migrate_add_work_collection.py: Migration sûre Work collection

**Documentation**:
- WEAVIATE_GUIDE_COMPLET.md: Guide consolidé complet (600+ lignes)
- WEAVIATE_SCHEMA.md: Référence schéma rapide
- NETTOYAGE_COMPLETE_RAPPORT.md: Rapport nettoyage session
- ANALYSE_QUALITE_DONNEES.md: Analyse qualité initiale
- rapport_qualite_donnees.txt: Output brut vérification

**Résultats nettoyage**:
- Documents: 16 → 9 (7 doublons supprimés)
- Works: 0 → 9 (peuplé + nettoyé)
- Chunks: 5,404 → 5,230 (174 orphelins supprimés)
- chunksCount: Corrigés (231 → 5,230 déclaré = réel)
- Cohérence parfaite: 9 Works = 9 Documents = 9 œuvres

**Modifications code**:
- schema.py: Ajout Work collection avec vectorisation
- utils/weaviate_ingest.py: Support Work ingestion
- utils/word_pipeline.py: Désactivation concepts (problème .lower())
- utils/word_toc_extractor.py: Métadonnées Word correctes
- .gitignore: Exclusion fichiers temporaires (*.wav, output/*, NUL)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
357 lines
12 KiB
Python
357 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""Tests unitaires pour la validation stricte des métadonnées et nested objects.
|
|
|
|
Ce module teste les fonctions de validation ajoutées dans weaviate_ingest.py
|
|
pour prévenir les erreurs silencieuses causées par des métadonnées invalides.
|
|
|
|
Run:
|
|
pytest tests/test_validation_stricte.py -v
|
|
"""
|
|
|
|
import pytest
|
|
from typing import Any, Dict
|
|
|
|
from utils.weaviate_ingest import (
|
|
validate_document_metadata,
|
|
validate_chunk_nested_objects,
|
|
)
|
|
|
|
|
|
# =============================================================================
|
|
# Tests pour validate_document_metadata()
|
|
# =============================================================================
|
|
|
|
|
|
def test_validate_document_metadata_valid() -> None:
    """Accept a document whose name, title, author and language are all set."""
    well_formed = {"title": "La République", "author": "Platon"}

    # The test passes when the call returns without raising.
    validate_document_metadata(
        doc_name="platon_republique",
        metadata=well_formed,
        language="fr",
    )
|
|
|
|
|
|
def test_validate_document_metadata_valid_with_work_key() -> None:
    """Accept metadata that carries the title under 'work' instead of 'title'."""
    keyed_by_work = {"work": "Test Work", "author": "Test Author"}

    # The test passes when the call returns without raising.
    validate_document_metadata(
        doc_name="test_doc",
        metadata=keyed_by_work,
        language="en",
    )
|
|
|
|
|
|
def test_validate_document_metadata_empty_doc_name() -> None:
    """An empty doc_name must raise ValueError."""
    payload = {"title": "Title", "author": "Author"}

    with pytest.raises(ValueError, match="Invalid doc_name: empty"):
        validate_document_metadata(doc_name="", metadata=payload, language="fr")
|
|
|
|
|
|
def test_validate_document_metadata_whitespace_doc_name() -> None:
    """A whitespace-only doc_name must raise ValueError."""
    payload = {"title": "Title", "author": "Author"}

    with pytest.raises(ValueError, match="Invalid doc_name: empty"):
        validate_document_metadata(doc_name=" ", metadata=payload, language="fr")
|
|
|
|
|
|
def test_validate_document_metadata_missing_title() -> None:
    """Metadata without a title must raise ValueError."""
    payload = {"author": "Author"}

    with pytest.raises(ValueError, match="'title' is missing or empty"):
        validate_document_metadata(
            doc_name="test_doc", metadata=payload, language="fr"
        )
|
|
|
|
|
|
def test_validate_document_metadata_empty_title() -> None:
    """An empty title must raise ValueError."""
    payload = {"title": "", "author": "Author"}

    with pytest.raises(ValueError, match="'title' is missing or empty"):
        validate_document_metadata(
            doc_name="test_doc", metadata=payload, language="fr"
        )
|
|
|
|
|
|
def test_validate_document_metadata_whitespace_title() -> None:
    """A whitespace-only title must raise ValueError."""
    payload = {"title": " ", "author": "Author"}

    with pytest.raises(ValueError, match="'title' is missing or empty"):
        validate_document_metadata(
            doc_name="test_doc", metadata=payload, language="fr"
        )
|
|
|
|
|
|
def test_validate_document_metadata_missing_author() -> None:
    """Metadata without an author must raise ValueError."""
    payload = {"title": "Title"}

    with pytest.raises(ValueError, match="'author' is missing or empty"):
        validate_document_metadata(
            doc_name="test_doc", metadata=payload, language="fr"
        )
|
|
|
|
|
|
def test_validate_document_metadata_empty_author() -> None:
    """An empty author must raise ValueError."""
    payload = {"title": "Title", "author": ""}

    with pytest.raises(ValueError, match="'author' is missing or empty"):
        validate_document_metadata(
            doc_name="test_doc", metadata=payload, language="fr"
        )
|
|
|
|
|
|
def test_validate_document_metadata_none_author() -> None:
    """An author set to None must raise ValueError."""
    payload = {"title": "Title", "author": None}

    with pytest.raises(ValueError, match="'author' is missing or empty"):
        validate_document_metadata(
            doc_name="test_doc", metadata=payload, language="fr"
        )
|
|
|
|
|
|
def test_validate_document_metadata_empty_language() -> None:
    """An empty language must raise ValueError."""
    payload = {"title": "Title", "author": "Author"}

    # The pattern is a regex: any text may sit between "Invalid language" and "empty".
    with pytest.raises(ValueError, match="Invalid language.*empty"):
        validate_document_metadata(
            doc_name="test_doc", metadata=payload, language=""
        )
|
|
|
|
|
|
def test_validate_document_metadata_optional_edition() -> None:
    """An empty 'edition' is tolerated because the field is optional."""
    with_blank_edition = {"title": "Title", "author": "Author", "edition": ""}

    # The test passes when the call returns without raising.
    validate_document_metadata(
        doc_name="test_doc",
        metadata=with_blank_edition,
        language="fr",
    )
|
|
|
|
|
|
# =============================================================================
|
|
# Tests pour validate_chunk_nested_objects()
|
|
# =============================================================================
|
|
|
|
|
|
def test_validate_chunk_nested_objects_valid() -> None:
    """Accept a chunk whose nested work and document objects are populated."""
    work = {"title": "La République", "author": "Platon"}
    document = {"sourceId": "platon_republique", "edition": "GF"}
    chunk = {"text": "Some text", "work": work, "document": document}

    # The test passes when the call returns without raising.
    validate_chunk_nested_objects(chunk, 0, "platon_republique")
|
|
|
|
|
|
def test_validate_chunk_nested_objects_empty_edition_ok() -> None:
    """An empty document.edition is accepted because the field is optional."""
    work = {"title": "Title", "author": "Author"}
    document = {"sourceId": "doc_id", "edition": ""}
    chunk = {"text": "Some text", "work": work, "document": document}

    # The test passes when the call returns without raising.
    validate_chunk_nested_objects(chunk, 0, "doc_id")
|
|
|
|
|
|
def test_validate_chunk_nested_objects_work_not_dict() -> None:
    """A 'work' value that is not a dict must raise ValueError."""
    document = {"sourceId": "doc_id", "edition": ""}
    chunk = {"text": "Some text", "work": "not a dict", "document": document}

    with pytest.raises(ValueError, match="work is not a dict"):
        validate_chunk_nested_objects(chunk, 5, "doc_id")
|
|
|
|
|
|
def test_validate_chunk_nested_objects_empty_work_title() -> None:
    """An empty work.title must raise ValueError.

    The expected message is written as a raw string with an escaped dot:
    ``pytest.raises(match=...)`` treats the pattern as a regular expression,
    so a bare ``.`` would match any character instead of a literal dot.
    """
    chunk = {
        "text": "Some text",
        "work": {"title": "", "author": "Author"},
        "document": {"sourceId": "doc_id", "edition": ""},
    }

    with pytest.raises(ValueError, match=r"work\.title is empty"):
        validate_chunk_nested_objects(chunk, 10, "doc_id")
|
|
|
|
|
|
def test_validate_chunk_nested_objects_none_work_title() -> None:
    """A work.title set to None must raise ValueError.

    The expected message is written as a raw string with an escaped dot:
    ``pytest.raises(match=...)`` treats the pattern as a regular expression,
    so a bare ``.`` would match any character instead of a literal dot.
    """
    chunk = {
        "text": "Some text",
        "work": {"title": None, "author": "Author"},
        "document": {"sourceId": "doc_id", "edition": ""},
    }

    with pytest.raises(ValueError, match=r"work\.title is empty"):
        validate_chunk_nested_objects(chunk, 3, "doc_id")
|
|
|
|
|
|
def test_validate_chunk_nested_objects_whitespace_work_title() -> None:
    """A whitespace-only work.title must raise ValueError.

    The expected message is written as a raw string with an escaped dot:
    ``pytest.raises(match=...)`` treats the pattern as a regular expression,
    so a bare ``.`` would match any character instead of a literal dot.
    """
    chunk = {
        "text": "Some text",
        "work": {"title": " ", "author": "Author"},
        "document": {"sourceId": "doc_id", "edition": ""},
    }

    with pytest.raises(ValueError, match=r"work\.title is empty"):
        validate_chunk_nested_objects(chunk, 7, "doc_id")
|
|
|
|
|
|
def test_validate_chunk_nested_objects_empty_work_author() -> None:
    """An empty work.author must raise ValueError.

    The expected message is written as a raw string with an escaped dot:
    ``pytest.raises(match=...)`` treats the pattern as a regular expression,
    so a bare ``.`` would match any character instead of a literal dot.
    """
    chunk = {
        "text": "Some text",
        "work": {"title": "Title", "author": ""},
        "document": {"sourceId": "doc_id", "edition": ""},
    }

    with pytest.raises(ValueError, match=r"work\.author is empty"):
        validate_chunk_nested_objects(chunk, 2, "doc_id")
|
|
|
|
|
|
def test_validate_chunk_nested_objects_document_not_dict() -> None:
    """A 'document' value that is not a dict must raise ValueError."""
    work = {"title": "Title", "author": "Author"}
    chunk = {"text": "Some text", "work": work, "document": ["not", "a", "dict"]}

    with pytest.raises(ValueError, match="document is not a dict"):
        validate_chunk_nested_objects(chunk, 15, "doc_id")
|
|
|
|
|
|
def test_validate_chunk_nested_objects_empty_source_id() -> None:
    """An empty document.sourceId must raise ValueError.

    The expected message is written as a raw string with an escaped dot:
    ``pytest.raises(match=...)`` treats the pattern as a regular expression,
    so a bare ``.`` would match any character instead of a literal dot.
    """
    chunk = {
        "text": "Some text",
        "work": {"title": "Title", "author": "Author"},
        "document": {"sourceId": "", "edition": "Ed"},
    }

    with pytest.raises(ValueError, match=r"document\.sourceId is empty"):
        validate_chunk_nested_objects(chunk, 20, "doc_id")
|
|
|
|
|
|
def test_validate_chunk_nested_objects_none_source_id() -> None:
    """A document.sourceId set to None must raise ValueError.

    The expected message is written as a raw string with an escaped dot:
    ``pytest.raises(match=...)`` treats the pattern as a regular expression,
    so a bare ``.`` would match any character instead of a literal dot.
    """
    chunk = {
        "text": "Some text",
        "work": {"title": "Title", "author": "Author"},
        "document": {"sourceId": None, "edition": "Ed"},
    }

    with pytest.raises(ValueError, match=r"document\.sourceId is empty"):
        validate_chunk_nested_objects(chunk, 25, "doc_id")
|
|
|
|
|
|
def test_validate_chunk_nested_objects_error_message_includes_index() -> None:
    """The error message must mention the index of the offending chunk."""
    work = {"title": "", "author": "Author"}
    document = {"sourceId": "doc_id", "edition": ""}
    chunk = {"text": "Some text", "work": work, "document": document}

    # Index 42 is passed in, so "Chunk 42" has to appear in the message.
    with pytest.raises(ValueError, match="Chunk 42"):
        validate_chunk_nested_objects(chunk, 42, "my_doc")
|
|
|
|
|
|
def test_validate_chunk_nested_objects_error_message_includes_doc_name() -> None:
    """The error message must mention the quoted doc_name."""
    work = {"title": "", "author": "Author"}
    document = {"sourceId": "doc_id", "edition": ""}
    chunk = {"text": "Some text", "work": work, "document": document}

    # The doc_name is expected to appear wrapped in single quotes.
    with pytest.raises(ValueError, match="'my_special_doc'"):
        validate_chunk_nested_objects(chunk, 5, "my_special_doc")
|
|
|
|
|
|
# =============================================================================
|
|
# Tests d'intégration (scénarios réels)
|
|
# =============================================================================
|
|
|
|
|
|
def test_integration_scenario_peirce_collected_papers() -> None:
    """Run both validators on metadata for the Peirce Collected Papers document."""
    source_id = "peirce_collected_papers_fixed"
    title = "Collected Papers of Charles Sanders Peirce"
    author = "Charles Sanders PEIRCE"

    # Document-level metadata: expected to be accepted.
    validate_document_metadata(
        doc_name=source_id,
        metadata={"title": title, "author": author},
        language="en",
    )

    # Chunk-level nested objects: expected to be accepted.
    chunk = {
        "text": "Logic is the science of the necessary laws of thought...",
        "work": {"title": title, "author": author},
        "document": {
            "sourceId": source_id,
            "edition": "Harvard University Press",
        },
    }
    validate_chunk_nested_objects(chunk, 0, source_id)
|
|
|
|
|
|
def test_integration_scenario_platon_menon() -> None:
    """Run both validators on metadata for Plato's Meno (Cousin translation)."""
    source_id = "Platon_-_Menon_trad._Cousin"
    edition = "trad. Cousin"

    # NOTE(review): "gr" is not the ISO 639-1 code for Greek ("el"), and the
    # sample chunk text below is French — confirm the intended language code.
    validate_document_metadata(
        doc_name=source_id,
        metadata={"title": "Ménon", "author": "Platon", "edition": edition},
        language="gr",
    )

    chunk = {
        "text": "Peux-tu me dire, Socrate...",
        "work": {"title": "Ménon", "author": "Platon"},
        "document": {"sourceId": source_id, "edition": edition},
    }
    validate_chunk_nested_objects(chunk, 0, source_id)
|
|
|
|
|
|
def test_integration_scenario_malformed_metadata_caught() -> None:
    """Malformed metadata must be rejected by validation, before any ingestion."""
    # Scenario: the metadata dict has a title but no author key at all.
    authorless = {"title": "Some Title"}

    with pytest.raises(ValueError, match="'author' is missing"):
        validate_document_metadata(
            doc_name="broken_doc", metadata=authorless, language="fr"
        )
|
|
|
|
|
|
def test_integration_scenario_none_values_caught() -> None:
    """None values in metadata must be rejected (noted as a frequent bug)."""
    # Scenario: an LLM extraction step failed and returned None for the author.
    failed_extraction = {"title": "Title", "author": None}

    with pytest.raises(ValueError, match="'author' is missing"):
        validate_document_metadata(
            doc_name="llm_failed_extraction",
            metadata=failed_extraction,
            language="fr",
        )
|