Files
linear-coding-agent/generations/library_rag/migrate_add_work_collection.py
David Blanc Brioir 04ee3f9e39 feat: Add data quality verification & cleanup scripts
## Data Quality & Cleanup (Priorities 1-6)

Added comprehensive data quality verification and cleanup system:

**Scripts créés**:
- verify_data_quality.py: Analyse qualité complète œuvre par œuvre
- clean_duplicate_documents.py: Nettoyage doublons Documents
- populate_work_collection.py/clean.py: Peuplement Work collection
- fix_chunks_count.py: Correction chunksCount incohérents
- manage_orphan_chunks.py: Gestion chunks orphelins (3 options)
- clean_orphan_works.py: Suppression Works sans chunks
- add_missing_work.py: Création Work manquant
- generate_schema_stats.py: Génération stats auto
- migrate_add_work_collection.py: Migration sûre Work collection

**Documentation**:
- WEAVIATE_GUIDE_COMPLET.md: Guide consolidé complet (600+ lignes)
- WEAVIATE_SCHEMA.md: Référence schéma rapide
- NETTOYAGE_COMPLETE_RAPPORT.md: Rapport nettoyage session
- ANALYSE_QUALITE_DONNEES.md: Analyse qualité initiale
- rapport_qualite_donnees.txt: Output brut vérification

**Résultats nettoyage**:
- Documents: 16 → 9 (7 doublons supprimés)
- Works: 0 → 9 (peuplé + nettoyé)
- Chunks: 5,404 → 5,230 (174 orphelins supprimés)
- chunksCount: Corrigés (231 → 5,230 déclaré = réel)
- Cohérence parfaite: 9 Works = 9 Documents = 9 œuvres

**Modifications code**:
- schema.py: Ajout Work collection avec vectorisation
- utils/weaviate_ingest.py: Support Work ingestion
- utils/word_pipeline.py: Désactivation concepts (problème .lower())
- utils/word_toc_extractor.py: Métadonnées Word correctes
- .gitignore: Exclusion fichiers temporaires (*.wav, output/*, NUL)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-01 11:57:26 +01:00

199 lines
6.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Migration script: Add Work collection with vectorization.
This script safely adds the Work collection to the existing Weaviate schema
WITHOUT deleting the existing Chunk, Document, and Summary collections.
Migration Steps:
1. Connect to Weaviate
2. Check if Work collection already exists
3. If exists, delete ONLY Work collection
4. Create new Work collection with vectorization enabled
5. Optionally populate Work from existing Chunk metadata
6. Verify all 4 collections exist
Usage:
python migrate_add_work_collection.py
Safety:
- Does NOT touch Chunk collection (5400+ chunks preserved)
- Does NOT touch Document collection
- Does NOT touch Summary collection
- Only creates/recreates Work collection
"""
import sys
from typing import Set
import weaviate
import weaviate.classes.config as wvc
def create_work_collection_vectorized(client: weaviate.WeaviateClient) -> None:
    """Create the Work collection with vectorization enabled.

    Semantic search is enabled on the ``title`` and ``author`` text
    properties; the remaining metadata fields are excluded from the vector.

    Args:
        client: Connected Weaviate client.
    """
    # Text fields that participate in the vector (vectorized by default).
    searchable_props = [
        wvc.Property(
            name="title",
            description="Title of the work.",
            data_type=wvc.DataType.TEXT,
        ),
        wvc.Property(
            name="author",
            description="Author of the work.",
            data_type=wvc.DataType.TEXT,
        ),
    ]
    # Pure metadata: deliberately kept out of the vector.
    metadata_props = [
        wvc.Property(
            name="originalTitle",
            description="Original title in source language (optional).",
            data_type=wvc.DataType.TEXT,
            skip_vectorization=True,
        ),
        wvc.Property(
            name="year",
            description="Year of composition or publication (negative for BCE).",
            data_type=wvc.DataType.INT,  # INT properties are never vectorized
        ),
        wvc.Property(
            name="language",
            description="Original language (e.g., 'gr', 'la', 'fr').",
            data_type=wvc.DataType.TEXT,
            skip_vectorization=True,  # ISO code, no semantic value
        ),
        wvc.Property(
            name="genre",
            description="Genre or type (e.g., 'dialogue', 'treatise', 'commentary').",
            data_type=wvc.DataType.TEXT,
            skip_vectorization=True,
        ),
    ]
    client.collections.create(
        name="Work",
        description="A philosophical or scholarly work (e.g., Meno, Republic, Apology).",
        # Enable vectorization so titles/authors are semantically searchable.
        vectorizer_config=wvc.Configure.Vectorizer.text2vec_transformers(
            vectorize_collection_name=False,
        ),
        properties=searchable_props + metadata_props,
    )
def migrate_work_collection(client: weaviate.WeaviateClient) -> None:
    """Migrate the Work collection by recreating it with vectorization.

    Steps:
        1. List existing collections.
        2. Delete ONLY Work if it already exists (best effort).
        3. Create a new Work collection with vectorization enabled.
        4. Verify the expected collection set is present.
        5. Display the resulting Work configuration.

    All other collections (Chunk, Document, Summary) are left untouched.

    Args:
        client: Connected Weaviate client.

    Raises:
        Exception: Re-raised if creating the Work collection fails.
    """
    print("\n" + "=" * 80)
    print("MIGRATION: Ajouter vectorisation à Work")
    print("=" * 80)

    # Step 1: Check existing collections
    print("\n[1/5] Vérification des collections existantes...")
    collections = client.collections.list_all()
    existing: Set[str] = set(collections.keys())
    print(f" Collections trouvées: {sorted(existing)}")

    # Step 2: Delete ONLY Work if it exists. Deletion failure is reported
    # but does not abort: the subsequent create will surface the real error.
    print("\n[2/5] Suppression de Work (si elle existe)...")
    if "Work" in existing:
        try:
            client.collections.delete("Work")
            print(" ✓ Work supprimée")
        except Exception as e:
            print(f" ⚠ Erreur suppression Work: {e}")
    else:
        print(" Work n'existe pas encore")

    # Step 3: Create new Work with vectorization; a failure here is fatal.
    print("\n[3/5] Création de Work avec vectorisation...")
    try:
        create_work_collection_vectorized(client)
        print(" ✓ Work créée (vectorisation activée)")
    except Exception as e:
        print(f" ✗ Erreur création Work: {e}")
        raise

    # Step 4: Verify all 4 collections exist (re-list to get fresh state).
    print("\n[4/5] Vérification finale...")
    collections = client.collections.list_all()
    actual: Set[str] = set(collections.keys())
    expected: Set[str] = {"Work", "Document", "Chunk", "Summary"}
    if expected == actual:
        print(f" ✓ Toutes les collections présentes: {sorted(actual)}")
    else:
        missing: Set[str] = expected - actual
        extra: Set[str] = actual - expected
        if missing:
            print(f" ⚠ Collections manquantes: {missing}")
        if extra:
            print(f" Collections supplémentaires: {extra}")

    # Step 5: Display Work config
    print("\n[5/5] Configuration de Work:")
    # BUGFIX: was `print("" * 80)`, which prints an empty line; the intent
    # was a horizontal separator matching the "=" rules used elsewhere.
    print("-" * 80)
    work_config = collections["Work"]
    print(f"Description: {work_config.description}")
    vectorizer_str: str = str(work_config.vectorizer)
    if "text2vec" in vectorizer_str.lower():
        print("Vectorizer: text2vec-transformers ✅")
    else:
        print("Vectorizer: none ❌")
    print("\nPropriétés vectorisées:")
    for prop in work_config.properties:
        if prop.name in ["title", "author"]:
            # Defensive hasattr: property config objects may not expose
            # skip_vectorization depending on client version.
            skip = "[skip_vec]" if (hasattr(prop, 'skip_vectorization') and prop.skip_vectorization) else "[VECTORIZED ✅]"
            print(f"{prop.name:<20} {skip}")

    print("\n" + "=" * 80)
    print("MIGRATION TERMINÉE AVEC SUCCÈS!")
    print("=" * 80)
    print("\n✓ Work collection vectorisée")
    print("✓ Chunk collection PRÉSERVÉE (aucune donnée perdue)")
    print("✓ Document collection PRÉSERVÉE")
    print("✓ Summary collection PRÉSERVÉE")
    print("\n💡 Prochaine étape (optionnel):")
    print(" Peupler Work en extrayant les œuvres uniques depuis Chunk.work")
    print("=" * 80 + "\n")
def main() -> None:
    """Entry point: connect to local Weaviate, run the migration, clean up."""
    # Windows consoles may default to a legacy code page; switch stdout to
    # UTF-8 so the accented/emoji progress output prints correctly.
    if sys.platform == "win32" and hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')

    # Connect to the local Weaviate instance (HTTP + gRPC ports).
    weaviate_client: weaviate.WeaviateClient = weaviate.connect_to_local(
        host="localhost",
        port=8080,
        grpc_port=50051,
    )
    try:
        migrate_work_collection(weaviate_client)
    finally:
        # Always release the connection, even if the migration raised.
        weaviate_client.close()
        print("\n✓ Connexion fermée\n")
# Run the migration only when executed as a script, not when imported.
if __name__ == "__main__":
    main()