## Data Quality & Cleanup (Priorities 1-6) Added comprehensive data quality verification and cleanup system: **Scripts créés**: - verify_data_quality.py: Analyse qualité complète œuvre par œuvre - clean_duplicate_documents.py: Nettoyage doublons Documents - populate_work_collection.py/clean.py: Peuplement Work collection - fix_chunks_count.py: Correction chunksCount incohérents - manage_orphan_chunks.py: Gestion chunks orphelins (3 options) - clean_orphan_works.py: Suppression Works sans chunks - add_missing_work.py: Création Work manquant - generate_schema_stats.py: Génération stats auto - migrate_add_work_collection.py: Migration sûre Work collection **Documentation**: - WEAVIATE_GUIDE_COMPLET.md: Guide consolidé complet (600+ lignes) - WEAVIATE_SCHEMA.md: Référence schéma rapide - NETTOYAGE_COMPLETE_RAPPORT.md: Rapport nettoyage session - ANALYSE_QUALITE_DONNEES.md: Analyse qualité initiale - rapport_qualite_donnees.txt: Output brut vérification **Résultats nettoyage**: - Documents: 16 → 9 (7 doublons supprimés) - Works: 0 → 9 (peuplé + nettoyé) - Chunks: 5,404 → 5,230 (174 orphelins supprimés) - chunksCount: Corrigés (231 → 5,230 déclaré = réel) - Cohérence parfaite: 9 Works = 9 Documents = 9 œuvres **Modifications code**: - schema.py: Ajout Work collection avec vectorisation - utils/weaviate_ingest.py: Support Work ingestion - utils/word_pipeline.py: Désactivation concepts (problème .lower()) - utils/word_toc_extractor.py: Métadonnées Word correctes - .gitignore: Exclusion fichiers temporaires (*.wav, output/*, NUL) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
345 lines
11 KiB
Python
345 lines
11 KiB
Python
"""Extract hierarchical table of contents from Word document headings.
|
|
|
|
This module builds a structured TOC from Word heading styles (Heading 1-9),
generating section paths compatible with the existing RAG pipeline and Weaviate
schema (e.g., "1.2.3" for chapter 1, section 2, subsection 3).

Example:
    Build TOC from Word headings:

        from pathlib import Path
        from utils.word_processor import extract_word_content
        from utils.word_toc_extractor import build_toc_from_headings, flatten_toc

        content = extract_word_content(Path("doc.docx"))
        toc = build_toc_from_headings(content["headings"])

        # build_toc_from_headings() returns only the root entries (children
        # are nested), so flatten the tree to list every section.
        for entry in flatten_toc(toc):
            print(f"{entry['sectionPath']}: {entry['title']}")

    Output:
        1: Introduction
        1.1: Background
        1.2: Methodology
        2: Results
        2.1: Analysis

Note:
    Compatible with existing TOCEntry TypedDict from utils.types
|
|
"""
|
|
|
|
from typing import List, Dict, Any, Optional
|
|
from utils.types import TOCEntry
|
|
|
|
|
|
def _generate_section_path(
    level: int,
    counters: List[int],
) -> str:
    """Render the dotted section path for a heading at the given level.

    Only the first ``level`` counters are considered, and counters that are
    not strictly positive are left out of the path instead of being printed.

    Args:
        level: Heading level of the entry, i.e. how many leading counters
            take part in the path.
        counters: Running counters per level, index 0 holding level 1.

    Returns:
        The retained counters joined with "." (e.g., "1.2.3"), or "1" when
        no positive counter is found.

    Example:
        >>> _generate_section_path(3, [1, 2, 3, 0, 0])
        '1.2.3'
        >>> _generate_section_path(1, [2, 0, 0])
        '2'

    Note:
        NOTE(review): because zero counters are dropped, a heading that
        skips a level (counters [1, 0, 1] at level 3) and a regular one
        (counters [1, 1] at level 2) both render as "1.1". Confirm that
        downstream consumers tolerate such duplicate paths.
    """
    segments = []
    for count in counters[:level]:
        if count > 0:
            segments.append(str(count))

    if not segments:
        return "1"
    return ".".join(segments)
|
|
|
|
|
|
def build_toc_from_headings(
    headings: List[Dict[str, Any]],
    max_level: int = 9,
) -> List[TOCEntry]:
    """Build hierarchical table of contents from Word headings.

    Walks the headings in document order, numbers them per level
    (1, 1.1, 1.2, 2, 2.1, ...) and nests each heading under the closest
    preceding heading of a shallower level.

    Args:
        headings: List of heading dicts from
            word_processor.extract_word_content(). Only two keys are read:
            - text (str): Heading text
            - level (int): Heading level (1..max_level)
        max_level: Maximum heading level to process (default: 9).

    Returns:
        List of root TOCEntry dicts. Each entry holds:
        - title (str): Stripped heading text
        - level (int): Heading level
        - sectionPath (str): Section path (e.g., "1.2.3")
        - pageRange (str): Always "" (not applicable for Word)
        - children (List[TOCEntry]): Nested sub-headings

    Example:
        >>> headings = [
        ...     {"text": "Chapter 1", "level": 1, "index": 0},
        ...     {"text": "Section 1.1", "level": 2, "index": 1},
        ...     {"text": "Section 1.2", "level": 2, "index": 2},
        ...     {"text": "Chapter 2", "level": 1, "index": 3},
        ... ]
        >>> toc = build_toc_from_headings(headings)
        >>> toc[0]["title"]
        'Chapter 1'
        >>> toc[0]["sectionPath"]
        '1'
        >>> toc[0]["children"][0]["sectionPath"]
        '1.1'

    Note:
        - Headings whose text is missing, not a string, or blank are skipped.
        - Headings whose level is missing, not an integer, or outside
          1..max_level are skipped instead of raising.
        - A heading with no shallower predecessor (e.g., an H2 before any
          H1) is attached to the root list.
        - Section paths are 1-indexed (start from 1, not 0).
    """
    if not headings:
        return []

    toc: List[TOCEntry] = []
    counters = [0] * max_level  # Running count per level: [h1, h2, h3, ...]
    parent_stack: List[TOCEntry] = []  # Open ancestors, shallowest first

    for heading in headings:
        # "text" may be absent or explicitly None; treat both as empty
        # rather than crashing on .strip().
        raw_text = heading.get("text")
        text = raw_text.strip() if isinstance(raw_text, str) else ""
        level = heading.get("level")

        # Skip empty headings and unusable levels (missing, non-integer,
        # out of range) so one malformed heading cannot abort the build.
        if not text or not isinstance(level, int) or not 1 <= level <= max_level:
            continue

        # Count this heading and restart numbering of every deeper level.
        counters[level - 1] += 1
        for deeper_idx in range(level, max_level):
            counters[deeper_idx] = 0

        entry: TOCEntry = {
            "title": text,
            "level": level,
            "sectionPath": _generate_section_path(level, counters),
            "pageRange": "",  # Not applicable for Word documents
            "children": [],
        }

        # Close every open heading that is not strictly shallower than the
        # current one. For level 1 this always empties the stack.
        while parent_stack and parent_stack[-1]["level"] >= level:
            parent_stack.pop()

        if parent_stack:
            parent_stack[-1]["children"].append(entry)
        else:
            # Top-level heading, or no shallower ancestor exists: add to root.
            toc.append(entry)

        # The current heading may become the parent of what follows.
        parent_stack.append(entry)

    return toc
|
|
|
|
|
|
def flatten_toc(toc: List[TOCEntry]) -> List[TOCEntry]:
    """Return every entry of a nested TOC as one flat, ordered list.

    Entries are emitted parent first, then its descendants, then the next
    sibling (depth-first, pre-order). Each emitted entry is a new dict that
    copies title, level, sectionPath and pageRange; its ``children`` list is
    always empty, and the input structure is not modified.

    Args:
        toc: Hierarchical TOC from build_toc_from_headings().

    Returns:
        Flat list of copied TOC entries in depth-first order.

    Example:
        >>> toc = build_toc_from_headings(headings)
        >>> for entry in flatten_toc(toc):
        ...     indent = " " * (entry["level"] - 1)
        ...     print(f"{indent}{entry['sectionPath']}: {entry['title']}")
    """

    def _walk(nodes):
        for node in nodes:
            # Emit a detached copy so callers cannot mutate the tree
            # through the flat list.
            yield {
                "title": node["title"],
                "level": node["level"],
                "sectionPath": node["sectionPath"],
                "pageRange": node["pageRange"],
                "children": [],  # Flat entries never carry children
            }
            descendants = node["children"]
            if descendants:
                yield from _walk(descendants)

    return list(_walk(toc))
|
|
|
|
|
|
def print_toc_tree(
    toc: List[TOCEntry],
    indent: str = "",
) -> None:
    """Write the TOC to stdout, one "<sectionPath>: <title>" line per entry.

    Debug helper. Entries are printed parent first, followed by their
    children; each nesting level extends the prefix by one space.

    Args:
        toc: Hierarchical TOC from build_toc_from_headings().
        indent: Prefix printed before each line at this depth (used by the
            recursive calls; callers normally leave the default).

    Example:
        >>> toc = build_toc_from_headings(headings)
        >>> print_toc_tree(toc)
        1: Introduction
         1.1: Background
         1.2: Methodology
        2: Results
         2.1: Analysis
    """
    for node in toc:
        line = "{}{}: {}".format(indent, node["sectionPath"], node["title"])
        print(line)

        nested = node["children"]
        if not nested:
            continue
        # Children are printed one space deeper than their parent.
        print_toc_tree(nested, indent + " ")
|
|
|
|
|
|
def _roman_to_int(roman: str) -> int:
    """Convert a Roman numeral string to its integer value.

    The input is upper-cased first, so lowercase numerals are accepted.
    A symbol smaller than the symbol immediately to its right is subtracted
    (the "IV" = 4 rule); every other symbol is added. Characters that are
    not Roman symbols count as 0, so no validation error is ever raised.

    Args:
        roman: Roman numeral string (I, II, III, IV, V, VI, VII, etc.).

    Returns:
        Integer value; 0 for an empty string.

    Example:
        >>> _roman_to_int("I")
        1
        >>> _roman_to_int("IV")
        4
        >>> _roman_to_int("VII")
        7
    """
    symbol_values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    values = [symbol_values.get(symbol, 0) for symbol in roman.upper()]

    total = 0
    # Pair each value with its right-hand neighbour; the last symbol is
    # paired with 0 and is therefore always added.
    for current, right_neighbour in zip(values, values[1:] + [0]):
        if current < right_neighbour:
            total -= current
        else:
            total += current

    return total
|
|
|
|
|
|
def extract_toc_from_chapter_summaries(paragraphs: List[Dict[str, Any]]) -> List[TOCEntry]:
    """Extract TOC from chapter summary paragraphs (CHAPTER I, CHAPTER II, etc.).

    Many Word documents have a "RESUME DES CHAPITRES" or "TABLE OF CONTENTS"
    section with paragraphs like:

        CHAPTER I.
        VARIATION UNDER DOMESTICATION.
        Description...

    This function extracts those into a flat list of top-level TOC entries.

    Parsing rules:
        - Nothing is extracted until a paragraph containing one of the TOC
          markers (case-insensitive substring match) has been seen; that
          marker paragraph itself is skipped.
        - Once the TOC has started, markers are no longer checked, so a
          chapter whose title or description happens to contain a word
          such as "CONTENTS" is still extracted.
        - A chapter paragraph must start with uppercase "CHAPTER" and have
          at least two non-empty lines: the chapter line and the title line.
        - The chapter number may be Arabic digits or a Roman numeral that
          ends at a word boundary (so "CHAPTER Dividing" is not read as a
          numeral).

    Args:
        paragraphs: List of paragraph dicts from
            word_processor.extract_word_content(). Only the "text" key is
            read; a missing or non-string value is treated as empty text.

    Returns:
        List of TOCEntry dicts, one per detected chapter, each with
        level 1, sectionPath set to the chapter number, the title with
        trailing dots removed, an empty pageRange and no children.

    Example:
        >>> paragraphs = [...]
        >>> toc = extract_toc_from_chapter_summaries(paragraphs)
        >>> toc[0]["title"]
        'VARIATION UNDER DOMESTICATION'
        >>> toc[0]["sectionPath"]
        '1'
    """
    import re

    toc_markers = (
        'RESUME DES CHAPITRES',
        'TABLE OF CONTENTS',
        'CONTENTS',
        'CHAPITRES',
    )
    # Compiled once for the whole scan. The \b after the Roman alternative
    # keeps ordinary words that merely begin with numeral letters from
    # being parsed as a chapter number.
    chapter_pattern = re.compile(r'CHAPTER\s+([IVXLCDM]+\b|\d+)', re.IGNORECASE)

    toc: List[TOCEntry] = []
    toc_started = False

    for para in paragraphs:
        raw_text = para.get("text")
        text = raw_text.strip() if isinstance(raw_text, str) else ""

        if not toc_started:
            # Still looking for the TOC start (multiple possible markers).
            # Whether or not this paragraph is the marker, it holds no chapter.
            upper_text = text.upper()
            toc_started = any(marker in upper_text for marker in toc_markers)
            continue

        if not text.startswith('CHAPTER'):
            continue

        # Split by newlines to get the chapter line and the title line
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        if len(lines) < 2:
            continue

        match = chapter_pattern.match(lines[0])
        if match is None:
            continue

        # Chapter number is either Arabic or Roman
        numeral = match.group(1)
        if numeral.isdigit():
            chapter_num = int(numeral)
        else:
            chapter_num = _roman_to_int(numeral)

        entry: TOCEntry = {
            "title": lines[1].rstrip('.'),  # Remove trailing dots
            "level": 1,  # All chapters are top-level
            "sectionPath": str(chapter_num),
            "pageRange": "",
            "children": [],
        }
        toc.append(entry)

    return toc
|