feat: Remove Document collection from schema
BREAKING CHANGE: Document collection removed from Weaviate schema

Architecture simplification:
- Removed Document collection (unused by Flask app)
- All metadata now in Work collection or file-based (chunks.json)
- Simplified from 4 collections to 3 (Work, Chunk_v2, Summary_v2)

Schema changes (schema.py):
- Removed create_document_collection() function
- Updated verify_schema() to expect 3 collections
- Updated display_schema() and print_summary()
- Updated documentation to reflect Chunk_v2/Summary_v2

Ingestion changes (weaviate_ingest.py):
- Removed ingest_document_metadata() function
- Removed ingest_document_collection parameter
- Updated IngestResult to use work_uuid instead of document_uuid
- Removed Document deletion from delete_document_chunks()
- Updated DeleteResult TypedDict

Type changes (types.py):
- WeaviateIngestResult: document_uuid → work_uuid

Documentation updates (.claude/CLAUDE.md):
- Updated schema diagram (4 → 3 collections)
- Removed Document references
- Updated to reflect manual GPU vectorization

Database changes:
- Deleted Document collection (13 objects)
- Deleted Chunk collection (0 objects, old schema)

Benefits:
- Simpler architecture (3 collections vs 4)
- No redundant data storage
- All metadata available via Work or file-based storage
- Reduced Weaviate memory footprint

Migration:
- See DOCUMENT_COLLECTION_ANALYSIS.md for detailed analysis
- See migrate_chunk_v2_to_none_vectorizer.py for vectorizer migration

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -190,9 +190,8 @@ class DeleteResult(TypedDict, total=False):
|
||||
Attributes:
|
||||
success: Whether deletion succeeded.
|
||||
error: Error message if deletion failed.
|
||||
deleted_chunks: Number of chunks deleted from Chunk collection.
|
||||
deleted_summaries: Number of summaries deleted from Summary collection.
|
||||
deleted_document: Whether the Document object was deleted.
|
||||
deleted_chunks: Number of chunks deleted from Chunk_v2 collection.
|
||||
deleted_summaries: Number of summaries deleted from Summary_v2 collection.
|
||||
|
||||
Example:
|
||||
>>> result = delete_document_chunks("platon_republique")
|
||||
@@ -203,7 +202,6 @@ class DeleteResult(TypedDict, total=False):
|
||||
error: str
|
||||
deleted_chunks: int
|
||||
deleted_summaries: int
|
||||
deleted_document: bool
|
||||
|
||||
|
||||
# =============================================================================
|
||||
@@ -379,7 +377,8 @@ def validate_document_metadata(
|
||||
)
|
||||
|
||||
# Validate title (required for work.title nested object)
|
||||
title = metadata.get("title") or metadata.get("work")
|
||||
# Priority: work > original_title > title (to avoid LLM prompt instructions)
|
||||
title = metadata.get("work") or metadata.get("original_title") or metadata.get("title")
|
||||
if not title or not str(title).strip():
|
||||
raise ValueError(
|
||||
f"Invalid metadata for '{doc_name}': 'title' is missing or empty. "
|
||||
@@ -388,7 +387,8 @@ def validate_document_metadata(
|
||||
)
|
||||
|
||||
# Validate author (required for work.author nested object)
|
||||
author = metadata.get("author")
|
||||
# Priority: original_author > author (to avoid LLM prompt instructions)
|
||||
author = metadata.get("original_author") or metadata.get("author")
|
||||
if not author or not str(author).strip():
|
||||
raise ValueError(
|
||||
f"Invalid metadata for '{doc_name}': 'author' is missing or empty. "
|
||||
@@ -649,8 +649,10 @@ def create_or_get_work(
|
||||
logger.warning(f"Collection Work non trouvée: {e}")
|
||||
return None
|
||||
|
||||
title = metadata.get("title") or doc_name
|
||||
author = metadata.get("author") or "Inconnu"
|
||||
# Priority: work > original_title > title (to avoid LLM prompt instructions)
|
||||
title = metadata.get("work") or metadata.get("original_title") or metadata.get("title") or doc_name
|
||||
# Priority: original_author > author (to avoid LLM prompt instructions)
|
||||
author = metadata.get("original_author") or metadata.get("author") or "Inconnu"
|
||||
year = metadata.get("year", 0) if metadata.get("year") else 0
|
||||
|
||||
try:
|
||||
@@ -686,76 +688,6 @@ def create_or_get_work(
|
||||
return None
|
||||
|
||||
|
||||
def ingest_document_metadata(
|
||||
client: WeaviateClient,
|
||||
doc_name: str,
|
||||
metadata: Dict[str, Any],
|
||||
toc: List[Dict[str, Any]],
|
||||
hierarchy: Dict[str, Any],
|
||||
chunks_count: int,
|
||||
pages: int,
|
||||
) -> Optional[str]:
|
||||
"""Insert document metadata into the Document collection.
|
||||
|
||||
Creates a Document object containing metadata about a processed document,
|
||||
including its table of contents, hierarchy structure, and statistics.
|
||||
|
||||
Args:
|
||||
client: Active Weaviate client connection.
|
||||
doc_name: Unique document identifier (sourceId).
|
||||
metadata: Extracted metadata dict with keys: title, author, language.
|
||||
toc: Table of contents as a hierarchical list of dicts.
|
||||
hierarchy: Complete document hierarchy structure.
|
||||
chunks_count: Total number of chunks in the document.
|
||||
pages: Number of pages in the source PDF.
|
||||
|
||||
Returns:
|
||||
UUID string of the created Document object, or None if insertion failed.
|
||||
|
||||
Example:
|
||||
>>> with get_weaviate_client() as client:
|
||||
... uuid = ingest_document_metadata(
|
||||
... client,
|
||||
... doc_name="platon_republique",
|
||||
... metadata={"title": "La Republique", "author": "Platon"},
|
||||
... toc=[{"title": "Livre I", "level": 1}],
|
||||
... hierarchy={},
|
||||
... chunks_count=150,
|
||||
... pages=300,
|
||||
... )
|
||||
|
||||
Note:
|
||||
The TOC and hierarchy are serialized to JSON strings for storage.
|
||||
The createdAt field is set to the current timestamp.
|
||||
"""
|
||||
try:
|
||||
doc_collection: Collection[Any, Any] = client.collections.get("Document")
|
||||
except Exception as e:
|
||||
logger.warning(f"Collection Document non trouvée: {e}")
|
||||
return None
|
||||
|
||||
try:
|
||||
doc_obj: Dict[str, Any] = {
|
||||
"sourceId": doc_name,
|
||||
"title": metadata.get("title") or doc_name,
|
||||
"author": metadata.get("author") or "Inconnu",
|
||||
"toc": json.dumps(toc, ensure_ascii=False) if toc else "[]",
|
||||
"hierarchy": json.dumps(hierarchy, ensure_ascii=False) if hierarchy else "{}",
|
||||
"pages": pages,
|
||||
"chunksCount": chunks_count,
|
||||
"language": metadata.get("language", "fr"),
|
||||
"createdAt": datetime.now(timezone.utc).isoformat(),
|
||||
}
|
||||
|
||||
result = doc_collection.data.insert(doc_obj)
|
||||
logger.info(f"Document metadata ingéré: {doc_name}")
|
||||
return str(result)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Erreur ingestion document metadata: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def ingest_summaries(
|
||||
client: WeaviateClient,
|
||||
doc_name: str,
|
||||
@@ -897,14 +829,13 @@ def ingest_document(
|
||||
toc: Optional[List[Dict[str, Any]]] = None,
|
||||
hierarchy: Optional[Dict[str, Any]] = None,
|
||||
pages: int = 0,
|
||||
ingest_document_collection: bool = True,
|
||||
ingest_summary_collection: bool = False,
|
||||
) -> IngestResult:
|
||||
"""Ingest document chunks into Weaviate with nested objects.
|
||||
|
||||
Main ingestion function that inserts chunks into the Chunk collection
|
||||
with nested Work and Document references. Optionally also creates
|
||||
entries in the Document and Summary collections.
|
||||
Main ingestion function that inserts chunks into the Chunk_v2 collection
|
||||
with nested Work references. Optionally also creates entries in the
|
||||
Summary_v2 collection.
|
||||
|
||||
This function uses batch insertion for optimal performance and
|
||||
constructs proper nested objects for filtering capabilities.
|
||||
@@ -922,12 +853,10 @@ def ingest_document(
|
||||
- author: Author name
|
||||
- edition (optional): Edition identifier
|
||||
language: ISO language code. Defaults to "fr".
|
||||
toc: Optional table of contents for Document/Summary collections.
|
||||
toc: Optional table of contents for Summary collection.
|
||||
hierarchy: Optional complete document hierarchy structure.
|
||||
pages: Number of pages in source document. Defaults to 0.
|
||||
ingest_document_collection: If True, also insert into Document
|
||||
collection. Defaults to True.
|
||||
ingest_summary_collection: If True, also insert into Summary
|
||||
ingest_summary_collection: If True, also insert into Summary_v2
|
||||
collection (requires toc). Defaults to False.
|
||||
|
||||
Returns:
|
||||
@@ -937,7 +866,7 @@ def ingest_document(
|
||||
- inserted: Preview of first 10 inserted chunks
|
||||
- work: Work title
|
||||
- author: Author name
|
||||
- document_uuid: UUID of Document object (if created)
|
||||
- work_uuid: UUID of Work object (if created)
|
||||
- all_objects: Complete list of inserted ChunkObjects
|
||||
- error: Error message (if failed)
|
||||
|
||||
@@ -995,14 +924,6 @@ def ingest_document(
|
||||
client, doc_name, metadata, pages
|
||||
)
|
||||
|
||||
# Insérer les métadonnées du document (optionnel)
|
||||
doc_uuid: Optional[str] = None
|
||||
if ingest_document_collection:
|
||||
doc_uuid = ingest_document_metadata(
|
||||
client, doc_name, metadata, toc or [], hierarchy or {},
|
||||
len(chunks), pages
|
||||
)
|
||||
|
||||
# Insérer les résumés (optionnel)
|
||||
if ingest_summary_collection and toc:
|
||||
ingest_summaries(client, doc_name, toc, {})
|
||||
@@ -1018,8 +939,10 @@ def ingest_document(
|
||||
objects_to_insert: List[ChunkObject] = []
|
||||
|
||||
# Extraire et valider les métadonnées (validation déjà faite, juste extraction)
|
||||
title: str = metadata.get("title") or metadata.get("work") or doc_name
|
||||
author: str = metadata.get("author") or "Inconnu"
|
||||
# Priority: work > original_title > title (to avoid LLM prompt instructions)
|
||||
title: str = metadata.get("work") or metadata.get("original_title") or metadata.get("title") or doc_name
|
||||
# Priority: original_author > author (to avoid LLM prompt instructions)
|
||||
author: str = metadata.get("original_author") or metadata.get("author") or "Inconnu"
|
||||
edition: str = metadata.get("edition", "")
|
||||
|
||||
for idx, chunk in enumerate(chunks):
|
||||
@@ -1153,7 +1076,7 @@ def ingest_document(
|
||||
inserted=inserted_summary,
|
||||
work=title,
|
||||
author=author,
|
||||
document_uuid=doc_uuid,
|
||||
work_uuid=work_uuid,
|
||||
all_objects=objects_to_insert,
|
||||
)
|
||||
|
||||
@@ -1169,9 +1092,8 @@ def ingest_document(
|
||||
def delete_document_chunks(doc_name: str) -> DeleteResult:
|
||||
"""Delete all data for a document from Weaviate collections.
|
||||
|
||||
Removes chunks, summaries, and the document metadata from their
|
||||
respective collections. Uses nested object filtering to find
|
||||
related objects.
|
||||
Removes chunks and summaries from their respective collections.
|
||||
Uses nested object filtering to find related objects.
|
||||
|
||||
This function is useful for re-processing a document after changes
|
||||
to the processing pipeline or to clean up test data.
|
||||
@@ -1184,7 +1106,6 @@ def delete_document_chunks(doc_name: str) -> DeleteResult:
|
||||
- success: True if deletion succeeded (even if no objects found)
|
||||
- deleted_chunks: Number of Chunk objects deleted
|
||||
- deleted_summaries: Number of Summary objects deleted
|
||||
- deleted_document: True if Document object was deleted
|
||||
- error: Error message (if failed)
|
||||
|
||||
Example:
|
||||
@@ -1227,23 +1148,12 @@ def delete_document_chunks(doc_name: str) -> DeleteResult:
|
||||
except Exception as e:
|
||||
logger.warning(f"Erreur suppression summaries: {e}")
|
||||
|
||||
# Supprimer le document
|
||||
try:
|
||||
doc_collection: Collection[Any, Any] = client.collections.get("Document")
|
||||
result = doc_collection.data.delete_many(
|
||||
where=wvq.Filter.by_property("sourceId").equal(doc_name)
|
||||
)
|
||||
deleted_document = result.successful > 0
|
||||
except Exception as e:
|
||||
logger.warning(f"Erreur suppression document: {e}")
|
||||
|
||||
logger.info(f"Suppression: {deleted_chunks} chunks, {deleted_summaries} summaries pour {doc_name}")
|
||||
|
||||
return DeleteResult(
|
||||
success=True,
|
||||
deleted_chunks=deleted_chunks,
|
||||
deleted_summaries=deleted_summaries,
|
||||
deleted_document=deleted_document,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user