"""Extract hierarchical table of contents from Word document headings. This module builds a structured TOC from Word heading styles (Heading 1-9), generating section paths compatible with the existing RAG pipeline and Weaviate schema (e.g., "1.2.3" for chapter 1, section 2, subsection 3). Example: Build TOC from Word headings: from pathlib import Path from utils.word_processor import extract_word_content from utils.word_toc_extractor import build_toc_from_headings content = extract_word_content(Path("doc.docx")) toc = build_toc_from_headings(content["headings"]) for entry in toc: print(f"{entry['sectionPath']}: {entry['title']}") Output: 1: Introduction 1.1: Background 1.2: Methodology 2: Results 2.1: Analysis Note: Compatible with existing TOCEntry TypedDict from utils.types """ from typing import List, Dict, Any, Optional from utils.types import TOCEntry def _generate_section_path( level: int, counters: List[int], ) -> str: """Generate section path string from level counters. Args: level: Current heading level (1-9). counters: List of counters for each level [c1, c2, c3, ...]. Returns: Section path string (e.g., "1.2.3"). Example: >>> _generate_section_path(3, [1, 2, 3, 0, 0]) '1.2.3' >>> _generate_section_path(1, [2, 0, 0]) '2' """ # Take counters up to current level path_parts = [str(c) for c in counters[:level] if c > 0] return ".".join(path_parts) if path_parts else "1" def build_toc_from_headings( headings: List[Dict[str, Any]], max_level: int = 9, ) -> List[TOCEntry]: """Build hierarchical table of contents from Word headings. Processes a list of heading paragraphs (with level attribute) and constructs a hierarchical TOC structure with section paths (1, 1.1, 1.2, 2, 2.1, etc.). Handles nested headings and missing intermediate levels gracefully. Args: headings: List of heading dicts from word_processor.extract_word_content(). Each dict must have: - text (str): Heading text - level (int): Heading level (1-9) - index (int): Paragraph index in document max_level: Maximum heading level to process (default: 9). Returns: List of TOCEntry dicts with hierarchical structure: - title (str): Heading text - level (int): Heading level (1-9) - sectionPath (str): Section path (e.g., "1.2.3") - pageRange (str): Empty string (not applicable for Word) - children (List[TOCEntry]): Nested sub-headings Example: >>> headings = [ ... {"text": "Chapter 1", "level": 1, "index": 0}, ... {"text": "Section 1.1", "level": 2, "index": 1}, ... {"text": "Section 1.2", "level": 2, "index": 2}, ... {"text": "Chapter 2", "level": 1, "index": 3}, ... ] >>> toc = build_toc_from_headings(headings) >>> print(toc[0]["title"]) 'Chapter 1' >>> print(toc[0]["sectionPath"]) '1' >>> print(toc[0]["children"][0]["sectionPath"]) '1.1' Note: - Empty headings are skipped - Handles missing intermediate levels (e.g., H1 → H3 without H2) - Section paths are 1-indexed (start from 1, not 0) """ if not headings: return [] toc: List[TOCEntry] = [] counters = [0] * max_level # Track counters for each level [h1, h2, h3, ...] parent_stack: List[TOCEntry] = [] # Stack to track parent headings for heading in headings: text = heading.get("text", "").strip() level = heading.get("level") # Skip empty headings or invalid levels if not text or level is None or level < 1 or level > max_level: continue level_idx = level - 1 # Convert to 0-indexed # Increment counter for this level counters[level_idx] += 1 # Reset all deeper level counters for i in range(level_idx + 1, max_level): counters[i] = 0 # Generate section path section_path = _generate_section_path(level, counters) # Create TOC entry entry: TOCEntry = { "title": text, "level": level, "sectionPath": section_path, "pageRange": "", # Not applicable for Word documents "children": [], } # Determine parent and add to appropriate location if level == 1: # Top-level heading - add to root toc.append(entry) parent_stack = [entry] # Reset parent stack else: # Find appropriate parent in stack # Pop stack until we find a parent at level < current level while parent_stack and parent_stack[-1]["level"] >= level: parent_stack.pop() if parent_stack: # Add to parent's children parent_stack[-1]["children"].append(entry) else: # No valid parent found (missing intermediate levels) # Add to root as a fallback toc.append(entry) # Add current entry to parent stack parent_stack.append(entry) return toc def flatten_toc(toc: List[TOCEntry]) -> List[TOCEntry]: """Flatten hierarchical TOC into a flat list. Converts nested TOC structure to a flat list while preserving section paths and hierarchy information. Useful for iteration and database ingestion. Args: toc: Hierarchical TOC from build_toc_from_headings(). Returns: Flat list of all TOC entries (depth-first traversal). Example: >>> toc = build_toc_from_headings(headings) >>> flat = flatten_toc(toc) >>> for entry in flat: ... indent = " " * (entry["level"] - 1) ... print(f"{indent}{entry['sectionPath']}: {entry['title']}") """ flat: List[TOCEntry] = [] def _traverse(entries: List[TOCEntry]) -> None: for entry in entries: # Add current entry (create a copy to avoid mutation) flat_entry: TOCEntry = { "title": entry["title"], "level": entry["level"], "sectionPath": entry["sectionPath"], "pageRange": entry["pageRange"], "children": [], # Don't include children in flat list } flat.append(flat_entry) # Recursively traverse children if entry["children"]: _traverse(entry["children"]) _traverse(toc) return flat def print_toc_tree( toc: List[TOCEntry], indent: str = "", ) -> None: """Print TOC tree structure to console (debug helper). Args: toc: Hierarchical TOC from build_toc_from_headings(). indent: Indentation string for nested levels (internal use). Example: >>> toc = build_toc_from_headings(headings) >>> print_toc_tree(toc) 1: Introduction 1.1: Background 1.2: Methodology 2: Results 2.1: Analysis """ for entry in toc: print(f"{indent}{entry['sectionPath']}: {entry['title']}") if entry["children"]: print_toc_tree(entry["children"], indent + " ") def _roman_to_int(roman: str) -> int: """Convert Roman numeral to integer. Args: roman: Roman numeral string (I, II, III, IV, V, VI, VII, etc.). Returns: Integer value. Example: >>> _roman_to_int("I") 1 >>> _roman_to_int("IV") 4 >>> _roman_to_int("VII") 7 """ roman_values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000} result = 0 prev_value = 0 for char in reversed(roman.upper()): value = roman_values.get(char, 0) if value < prev_value: result -= value else: result += value prev_value = value return result def extract_toc_from_chapter_summaries(paragraphs: List[Dict[str, Any]]) -> List[TOCEntry]: """Extract TOC from chapter summary paragraphs (CHAPTER I, CHAPTER II, etc.). Many Word documents have a "RESUME DES CHAPITRES" or "TABLE OF CONTENTS" section with paragraphs like: CHAPTER I. VARIATION UNDER DOMESTICATION. Description... This function extracts those into a proper TOC structure. Args: paragraphs: List of paragraph dicts from word_processor.extract_word_content(). Each dict must have: - text (str): Paragraph text - is_heading (bool): Whether it's a heading - index (int): Paragraph index Returns: List of TOCEntry dicts with hierarchical structure. Example: >>> paragraphs = [...] >>> toc = extract_toc_from_chapter_summaries(paragraphs) >>> print(toc[0]["title"]) 'VARIATION UNDER DOMESTICATION' >>> print(toc[0]["sectionPath"]) '1' """ import re toc: List[TOCEntry] = [] toc_started = False for para in paragraphs: text = para.get("text", "").strip() # Detect TOC start (multiple possible markers) if any(marker in text.upper() for marker in [ 'RESUME DES CHAPITRES', 'TABLE OF CONTENTS', 'CONTENTS', 'CHAPITRES', ]): toc_started = True continue # Extract chapters if toc_started and text.startswith('CHAPTER'): # Split by newlines to get chapter number and title lines = [line.strip() for line in text.split('\n') if line.strip()] if len(lines) >= 2: chapter_line = lines[0] title_line = lines[1] # Extract chapter number (roman or arabic) match = re.match(r'CHAPTER\s+([IVXLCDM]+|\d+)', chapter_line, re.IGNORECASE) if match: chapter_num_str = match.group(1) # Convert to integer if chapter_num_str.isdigit(): chapter_num = int(chapter_num_str) else: chapter_num = _roman_to_int(chapter_num_str) # Remove trailing dots title_clean = title_line.rstrip('.') entry: TOCEntry = { "title": title_clean, "level": 1, # All chapters are top-level "sectionPath": str(chapter_num), "pageRange": "", "children": [], } toc.append(entry) return toc