Ajout pipeline Word (.docx) pour ingestion RAG
Nouveaux modules (3 fichiers, ~850 lignes): - word_processor.py: Extraction contenu Word (texte, headings, images, métadonnées) - word_toc_extractor.py: Construction TOC hiérarchique depuis styles Heading - word_pipeline.py: Orchestrateur complet réutilisant modules LLM existants Fonctionnalités: - Extraction native Word (pas d'OCR, économie ~0.003€/page) - Support Heading 1-9 pour TOC hiérarchique - Section paths compatibles Weaviate (1, 1.1, 1.2, etc.) - Métadonnées depuis propriétés Word + extraction paragraphes - Markdown compatible avec pipeline existant - Extraction images inline - Réutilise 100% des modules LLM (metadata, classifier, chunker, cleaner, validator) Pipeline testé: - Fichier exemple: "On the origin - 10 pages.docx" - 48 paragraphes, 2 headings extraits - 37 chunks créés - Output: markdown + JSON chunks Architecture: 1. Extraction Word → 2. Markdown → 3. TOC → 4-9. Modules LLM réutilisés → 10. Weaviate Prochaine étape: Intégration Flask (route upload Word) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
229
generations/library_rag/utils/word_toc_extractor.py
Normal file
229
generations/library_rag/utils/word_toc_extractor.py
Normal file
@@ -0,0 +1,229 @@
|
||||
"""Extract hierarchical table of contents from Word document headings.
|
||||
|
||||
This module builds a structured TOC from Word heading styles (Heading 1-9),
|
||||
generating section paths compatible with the existing RAG pipeline and Weaviate
|
||||
schema (e.g., "1.2.3" for chapter 1, section 2, subsection 3).
|
||||
|
||||
Example:
|
||||
Build TOC from Word headings:
|
||||
|
||||
from pathlib import Path
|
||||
from utils.word_processor import extract_word_content
|
||||
from utils.word_toc_extractor import build_toc_from_headings
|
||||
|
||||
content = extract_word_content(Path("doc.docx"))
|
||||
toc = build_toc_from_headings(content["headings"])
|
||||
|
||||
for entry in toc:
|
||||
print(f"{entry['sectionPath']}: {entry['title']}")
|
||||
|
||||
Output:
|
||||
1: Introduction
|
||||
1.1: Background
|
||||
1.2: Methodology
|
||||
2: Results
|
||||
2.1: Analysis
|
||||
|
||||
Note:
|
||||
Compatible with existing TOCEntry TypedDict from utils.types
|
||||
"""
|
||||
|
||||
from typing import List, Dict, Any, Optional
|
||||
from utils.types import TOCEntry
|
||||
|
||||
|
||||
def _generate_section_path(
|
||||
level: int,
|
||||
counters: List[int],
|
||||
) -> str:
|
||||
"""Generate section path string from level counters.
|
||||
|
||||
Args:
|
||||
level: Current heading level (1-9).
|
||||
counters: List of counters for each level [c1, c2, c3, ...].
|
||||
|
||||
Returns:
|
||||
Section path string (e.g., "1.2.3").
|
||||
|
||||
Example:
|
||||
>>> _generate_section_path(3, [1, 2, 3, 0, 0])
|
||||
'1.2.3'
|
||||
>>> _generate_section_path(1, [2, 0, 0])
|
||||
'2'
|
||||
"""
|
||||
# Take counters up to current level
|
||||
path_parts = [str(c) for c in counters[:level] if c > 0]
|
||||
return ".".join(path_parts) if path_parts else "1"
|
||||
|
||||
|
||||
def build_toc_from_headings(
    headings: List[Dict[str, Any]],
    max_level: int = 9,
) -> List[TOCEntry]:
    """Turn a flat, document-ordered list of headings into a nested TOC.

    Headings are consumed in order. A running counter is kept per heading
    level: seeing a heading bumps the counter of its level and zeroes every
    deeper counter, and the resulting counters are handed to
    ``_generate_section_path`` to obtain the entry's section path. Each entry
    is attached to the nearest preceding heading whose level is strictly
    smaller; when there is none, it is attached to the root list.

    Args:
        headings: Heading dicts. The keys read here are:
            - text (str): Heading text (surrounding whitespace is stripped).
            - level (int): Heading level, expected in 1..max_level.
            Any other key (such as ``index``) is ignored by this function.
        max_level: Deepest heading level to keep (default: 9). Headings with
            a larger level are skipped.

    Returns:
        Root-level TOC entries. Every entry is a dict with:
            - title (str): Stripped heading text.
            - level (int): Heading level as given.
            - sectionPath (str): Path returned by ``_generate_section_path``.
            - pageRange (str): Always the empty string.
            - children (List[TOCEntry]): Nested entries, in document order.

    Example:
        >>> headings = [
        ...     {"text": "Chapter 1", "level": 1, "index": 0},
        ...     {"text": "Section 1.1", "level": 2, "index": 1},
        ...     {"text": "Chapter 2", "level": 1, "index": 2},
        ... ]
        >>> toc = build_toc_from_headings(headings)
        >>> toc[0]["children"][0]["sectionPath"]
        '1.1'

    Note:
        - Headings whose text is empty after stripping are skipped.
        - Headings with a missing level, or a level outside 1..max_level,
          are skipped.
    """
    if not headings:
        return []

    roots: List[TOCEntry] = []
    # level_counts[i] is the running number of the current heading at level i+1.
    level_counts = [0] * max_level
    # Chain of open entries from shallowest to deepest; the last one is the
    # candidate parent for the next heading.
    ancestors: List[TOCEntry] = []

    for item in headings:
        title = item.get("text", "").strip()
        depth = item.get("level")

        if not title or depth is None:
            continue
        if depth < 1 or depth > max_level:
            continue

        # Advance this level and restart numbering of everything below it.
        level_counts[depth - 1] += 1
        level_counts[depth:] = [0] * (max_level - depth)

        node: TOCEntry = {
            "title": title,
            "level": depth,
            "sectionPath": _generate_section_path(depth, level_counts),
            "pageRange": "",  # Not applicable for Word documents
            "children": [],
        }

        # Close every open entry that is not shallower than the new heading.
        # For a level-1 heading this empties the chain, so it lands in the
        # root list like any other heading without a shallower predecessor.
        while ancestors and ancestors[-1]["level"] >= depth:
            ancestors.pop()

        siblings = ancestors[-1]["children"] if ancestors else roots
        siblings.append(node)
        ancestors.append(node)

    return roots
|
||||
|
||||
|
||||
def flatten_toc(toc: List[TOCEntry]) -> List[TOCEntry]:
    """Return every TOC entry as one flat list, parents before their children.

    The nested structure is walked depth-first in document order. Each entry
    in the result is a new dict carrying the same ``title``, ``level``,
    ``sectionPath`` and ``pageRange`` as the source entry, with ``children``
    set to an empty list. The input entries are not modified.

    Args:
        toc: Hierarchical TOC (entries with a ``children`` list).

    Returns:
        Flat list of copied entries in depth-first (pre-order) order.

    Example:
        >>> flat = flatten_toc(toc)
        >>> for entry in flat:
        ...     print(f"{entry['sectionPath']}: {entry['title']}")
    """
    ordered: List[TOCEntry] = []

    # Explicit stack instead of recursion; items are pushed in reverse so
    # that popping yields them in their original order.
    pending = list(toc)[::-1]
    while pending:
        node = pending.pop()

        snapshot: TOCEntry = {
            "title": node["title"],
            "level": node["level"],
            "sectionPath": node["sectionPath"],
            "pageRange": node["pageRange"],
            "children": [],  # the flat view carries no nesting
        }
        ordered.append(snapshot)

        nested = node["children"]
        if nested:
            pending.extend(list(nested)[::-1])

    return ordered
|
||||
|
||||
|
||||
def print_toc_tree(
    toc: List[TOCEntry],
    indent: str = "",
) -> None:
    """Print the TOC as an indented outline on standard output (debug helper).

    Every entry is written on its own line as ``<sectionPath>: <title>``,
    parents before their children. The entries passed in are prefixed with
    ``indent``; each nesting step below them extends the prefix by one space.

    Args:
        toc: Hierarchical TOC (entries with a ``children`` list).
        indent: Prefix for the entries of ``toc`` themselves (default: none).

    Example:
        >>> print_toc_tree(toc)
        1: Introduction
         1.1: Background
         1.2: Methodology
        2: Results
         2.1: Analysis
    """
    # Stack of (entry, prefix) pairs; pushed in reverse so that popping
    # visits siblings in their original order.
    pending = [(node, indent) for node in list(toc)[::-1]]
    while pending:
        node, prefix = pending.pop()
        print(f"{prefix}{node['sectionPath']}: {node['title']}")

        nested = node["children"]
        if nested:
            deeper = prefix + " "
            pending.extend((child, deeper) for child in list(nested)[::-1])
|
||||
Reference in New Issue
Block a user