## Data Quality & Cleanup (Priorities 1-6) Added comprehensive data quality verification and cleanup system: **Scripts créés**: - verify_data_quality.py: Analyse qualité complète œuvre par œuvre - clean_duplicate_documents.py: Nettoyage doublons Documents - populate_work_collection.py/clean.py: Peuplement Work collection - fix_chunks_count.py: Correction chunksCount incohérents - manage_orphan_chunks.py: Gestion chunks orphelins (3 options) - clean_orphan_works.py: Suppression Works sans chunks - add_missing_work.py: Création Work manquant - generate_schema_stats.py: Génération stats auto - migrate_add_work_collection.py: Migration sûre Work collection **Documentation**: - WEAVIATE_GUIDE_COMPLET.md: Guide consolidé complet (600+ lignes) - WEAVIATE_SCHEMA.md: Référence schéma rapide - NETTOYAGE_COMPLETE_RAPPORT.md: Rapport nettoyage session - ANALYSE_QUALITE_DONNEES.md: Analyse qualité initiale - rapport_qualite_donnees.txt: Output brut vérification **Résultats nettoyage**: - Documents: 16 → 9 (7 doublons supprimés) - Works: 0 → 9 (peuplé + nettoyé) - Chunks: 5,404 → 5,230 (174 orphelins supprimés) - chunksCount: Corrigés (231 → 5,230 déclaré = réel) - Cohérence parfaite: 9 Works = 9 Documents = 9 œuvres **Modifications code**: - schema.py: Ajout Work collection avec vectorisation - utils/weaviate_ingest.py: Support Work ingestion - utils/word_pipeline.py: Désactivation concepts (problème .lower()) - utils/word_toc_extractor.py: Métadonnées Word correctes - .gitignore: Exclusion fichiers temporaires (*.wav, output/*, NUL) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
92 lines
2.7 KiB
Python
92 lines
2.7 KiB
Python
"""Script to display all documents from the Weaviate Document collection in table format.
|
|
|
|
Usage:
|
|
python show_works.py
|
|
"""
|
|
|
|
import weaviate
|
|
from typing import Any
|
|
from tabulate import tabulate
|
|
from datetime import datetime
|
|
|
|
|
|
def format_date(date_val: Any) -> str:
    """Format a date value for display in the table.

    Args:
        date_val: Value to render. May be ``None``, an ISO-8601 string,
            a ``datetime`` instance, or any other object.

    Returns:
        ``"-"`` when ``date_val`` is ``None``; ``YYYY-MM-DD HH:MM`` for
        ``datetime`` instances and for strings that parse as ISO-8601; the
        string unchanged when it cannot be parsed; ``str(date_val)`` for
        any other type.
    """
    display_format = "%Y-%m-%d %H:%M"

    if date_val is None:
        return "-"

    # Render datetime objects the same way as parsed strings so the column
    # has a single, consistent format.
    if isinstance(date_val, datetime):
        return date_val.strftime(display_format)

    if isinstance(date_val, str):
        try:
            # A trailing "Z" is only understood by fromisoformat() on
            # Python 3.11+, so normalise it to an explicit UTC offset.
            parsed = datetime.fromisoformat(date_val.replace("Z", "+00:00"))
        except ValueError:
            # Not an ISO-8601 string: show it as-is rather than failing.
            return date_val
        return parsed.strftime(display_format)

    return str(date_val)
|
|
|
|
|
|
def _document_row(props: Any) -> list:
    """Build one table row from the properties mapping of a Document object."""
    # The nested "work" value is only trusted when it is actually a dict;
    # anything else (missing, None, unexpected type) falls back to "N/A".
    work = props.get("work", {})
    if isinstance(work, dict):
        work_title = work.get("title", "N/A")
        work_author = work.get("author", "N/A")
    else:
        work_title = work_author = "N/A"

    return [
        props.get("sourceId", "N/A"),
        work_title,
        work_author,
        props.get("edition", "-"),
        props.get("pages", "-"),
        props.get("chunksCount", "-"),
        props.get("language", "-"),
        format_date(props.get("createdAt")),
    ]


def display_documents(limit: int = 1000) -> None:
    """Connect to Weaviate and display Document objects in table format.

    Connects to the local Weaviate instance, fetches up to ``limit`` objects
    from the "Document" collection and prints them as a grid table. Errors
    are reported on stdout instead of being raised.

    Args:
        limit: Maximum number of objects requested from the collection.
            Collections larger than this are shown truncated.

    Returns:
        None. All output is printed.
    """
    # Connection failures are handled on their own so that the "is Weaviate
    # running?" hint is only shown when connecting is what actually failed.
    try:
        client = weaviate.connect_to_local()
    except Exception as e:
        print(f"Error connecting to Weaviate: {e}")
        print("\nMake sure Weaviate is running:")
        print(" docker compose up -d")
        return

    try:
        document_collection = client.collections.get("Document")
        response = document_collection.query.fetch_objects(limit=limit)

        if not response.objects:
            print("No documents found in the collection.")
            return

        table_data = [_document_row(obj.properties) for obj in response.objects]

        # Header banner
        rule = "=" * 120
        print(f"\n{rule}")
        print(f"Collection Document - {len(response.objects)} document(s) trouvé(s)")
        print(f"{rule}\n")

        # Table
        headers = ["Source ID", "Work Title", "Author", "Edition", "Pages", "Chunks", "Lang", "Created At"]
        print(tabulate(table_data, headers=headers, tablefmt="grid"))
        print()
    except Exception as e:
        # Top-level boundary of a CLI script: report the failure and exit
        # normally rather than dumping a traceback.
        print(f"Error reading Document collection: {e}")
    finally:
        # Always release the connection, including on the early return above.
        client.close()
|
|
|
|
|
|
# Script entry point: print the Document table only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    display_documents()
|