feat: Add data quality verification & cleanup scripts
## Data Quality & Cleanup (Priorities 1-6) Added comprehensive data quality verification and cleanup system: **Scripts créés**: - verify_data_quality.py: Analyse qualité complète œuvre par œuvre - clean_duplicate_documents.py: Nettoyage doublons Documents - populate_work_collection.py/clean.py: Peuplement Work collection - fix_chunks_count.py: Correction chunksCount incohérents - manage_orphan_chunks.py: Gestion chunks orphelins (3 options) - clean_orphan_works.py: Suppression Works sans chunks - add_missing_work.py: Création Work manquant - generate_schema_stats.py: Génération stats auto - migrate_add_work_collection.py: Migration sûre Work collection **Documentation**: - WEAVIATE_GUIDE_COMPLET.md: Guide consolidé complet (600+ lignes) - WEAVIATE_SCHEMA.md: Référence schéma rapide - NETTOYAGE_COMPLETE_RAPPORT.md: Rapport nettoyage session - ANALYSE_QUALITE_DONNEES.md: Analyse qualité initiale - rapport_qualite_donnees.txt: Output brut vérification **Résultats nettoyage**: - Documents: 16 → 9 (7 doublons supprimés) - Works: 0 → 9 (peuplé + nettoyé) - Chunks: 5,404 → 5,230 (174 orphelins supprimés) - chunksCount: Corrigés (231 → 5,230 déclaré = réel) - Cohérence parfaite: 9 Works = 9 Documents = 9 œuvres **Modifications code**: - schema.py: Ajout Work collection avec vectorisation - utils/weaviate_ingest.py: Support Work ingestion - utils/word_pipeline.py: Désactivation concepts (problème .lower()) - utils/word_toc_extractor.py: Métadonnées Word correctes - .gitignore: Exclusion fichiers temporaires (*.wav, output/*, NUL) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
91
generations/library_rag/show_works.py
Normal file
91
generations/library_rag/show_works.py
Normal file
@@ -0,0 +1,91 @@
|
||||
"""Script to display all documents from the Weaviate Document collection in table format.
|
||||
|
||||
Usage:
|
||||
python show_works.py
|
||||
"""
|
||||
|
||||
import weaviate
|
||||
from typing import Any
|
||||
from tabulate import tabulate
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def format_date(date_val: Any) -> str:
|
||||
"""Format date for display.
|
||||
|
||||
Args:
|
||||
date_val: Date value (string or datetime).
|
||||
|
||||
Returns:
|
||||
Formatted date string.
|
||||
"""
|
||||
if date_val is None:
|
||||
return "-"
|
||||
if isinstance(date_val, str):
|
||||
try:
|
||||
dt = datetime.fromisoformat(date_val.replace('Z', '+00:00'))
|
||||
return dt.strftime("%Y-%m-%d %H:%M")
|
||||
except:
|
||||
return date_val
|
||||
return str(date_val)
|
||||
|
||||
|
||||
def display_documents() -> None:
|
||||
"""Connect to Weaviate and display all Document objects in table format."""
|
||||
try:
|
||||
# Connect to local Weaviate instance
|
||||
client = weaviate.connect_to_local()
|
||||
|
||||
try:
|
||||
# Get Document collection
|
||||
document_collection = client.collections.get("Document")
|
||||
|
||||
# Fetch all documents
|
||||
response = document_collection.query.fetch_objects(limit=1000)
|
||||
|
||||
if not response.objects:
|
||||
print("No documents found in the collection.")
|
||||
return
|
||||
|
||||
# Prepare data for table
|
||||
table_data = []
|
||||
for obj in response.objects:
|
||||
props = obj.properties
|
||||
|
||||
# Extract nested work object
|
||||
work = props.get("work", {})
|
||||
work_title = work.get("title", "N/A") if isinstance(work, dict) else "N/A"
|
||||
work_author = work.get("author", "N/A") if isinstance(work, dict) else "N/A"
|
||||
|
||||
table_data.append([
|
||||
props.get("sourceId", "N/A"),
|
||||
work_title,
|
||||
work_author,
|
||||
props.get("edition", "-"),
|
||||
props.get("pages", "-"),
|
||||
props.get("chunksCount", "-"),
|
||||
props.get("language", "-"),
|
||||
format_date(props.get("createdAt")),
|
||||
])
|
||||
|
||||
# Display header
|
||||
print(f"\n{'='*120}")
|
||||
print(f"Collection Document - {len(response.objects)} document(s) trouvé(s)")
|
||||
print(f"{'='*120}\n")
|
||||
|
||||
# Display table
|
||||
headers = ["Source ID", "Work Title", "Author", "Edition", "Pages", "Chunks", "Lang", "Created At"]
|
||||
print(tabulate(table_data, headers=headers, tablefmt="grid"))
|
||||
print()
|
||||
|
||||
finally:
|
||||
client.close()
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error connecting to Weaviate: {e}")
|
||||
print("\nMake sure Weaviate is running:")
|
||||
print(" docker compose up -d")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
display_documents()
|
||||
Reference in New Issue
Block a user