Ajout pipeline Word (.docx) pour ingestion RAG
Nouveaux modules (3 fichiers, ~850 lignes): - word_processor.py: Extraction contenu Word (texte, headings, images, métadonnées) - word_toc_extractor.py: Construction TOC hiérarchique depuis styles Heading - word_pipeline.py: Orchestrateur complet réutilisant modules LLM existants Fonctionnalités: - Extraction native Word (pas d'OCR, économie ~0.003€/page) - Support Heading 1-9 pour TOC hiérarchique - Section paths compatibles Weaviate (1, 1.1, 1.2, etc.) - Métadonnées depuis propriétés Word + extraction paragraphes - Markdown compatible avec pipeline existant - Extraction images inline - Réutilise 100% des modules LLM (metadata, classifier, chunker, cleaner, validator) Pipeline testé: - Fichier exemple: "On the origin - 10 pages.docx" - 48 paragraphes, 2 headings extraits - 37 chunks créés - Output: markdown + JSON chunks Architecture: 1. Extraction Word → 2. Markdown → 3. TOC → 4-9. Modules LLM réutilisés → 10. Weaviate Prochaine étape: Intégration Flask (route upload Word) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
329
generations/library_rag/utils/word_processor.py
Normal file
329
generations/library_rag/utils/word_processor.py
Normal file
@@ -0,0 +1,329 @@
|
||||
"""Extract structured content from Microsoft Word documents (.docx).
|
||||
|
||||
This module provides functionality to extract text, headings, images, and metadata
|
||||
from Word documents using python-docx. The extracted content is structured to be
|
||||
compatible with the existing RAG pipeline (LLM processing and Weaviate ingestion).
|
||||
|
||||
Example:
|
||||
Extract content from a Word document:
|
||||
|
||||
from pathlib import Path
|
||||
from utils.word_processor import extract_word_content
|
||||
|
||||
result = extract_word_content(Path("document.docx"))
|
||||
print(f"Extracted {len(result['paragraphs'])} paragraphs")
|
||||
print(f"Found {len(result['headings'])} headings")
|
||||
|
||||
Extract only metadata:
|
||||
|
||||
metadata = extract_word_metadata(Path("document.docx"))
|
||||
print(f"Title: {metadata['title']}")
|
||||
print(f"Author: {metadata['author']}")
|
||||
|
||||
Note:
|
||||
Requires python-docx library: pip install python-docx>=0.8.11
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from datetime import datetime
|
||||
import io
|
||||
import re
|
||||
|
||||
try:
|
||||
from docx import Document
|
||||
from docx.oxml.text.paragraph import CT_P
|
||||
from docx.oxml.table import CT_Tbl
|
||||
from docx.table import _Cell, Table
|
||||
from docx.text.paragraph import Paragraph
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"python-docx library is required for Word processing. "
|
||||
"Install with: pip install python-docx>=0.8.11"
|
||||
)
|
||||
|
||||
from utils.types import TOCEntry
|
||||
|
||||
|
||||
def extract_word_metadata(docx_path: Path) -> Dict[str, Any]:
    """Extract metadata from Word document core properties.

    Reads the document's core properties (title, author, created date, etc.)
    and falls back to scanning the first few paragraphs for labelled lines
    ("TITRE: ...", "AUTEUR: ...", "EDITION: ...") when core properties are
    missing. The fallback only fills fields that are still empty, so values
    coming from core properties are never overwritten.

    Args:
        docx_path: Path to the .docx file.

    Returns:
        Dictionary containing metadata fields:
        - title (str): Document title
        - author (str): Document author
        - created (datetime): Creation date
        - modified (datetime): Last modified date
        - language (str): Document language (if available)
        - edition (str): Edition info (if found in content)

    Example:
        >>> metadata = extract_word_metadata(Path("doc.docx"))
        >>> print(metadata["title"])
        'On the Origin of Species'
    """
    doc = Document(docx_path)
    core_props = doc.core_properties

    metadata = {
        "title": core_props.title or "",
        "author": core_props.author or "",
        "created": core_props.created,
        "modified": core_props.modified,
        # python-docx exposes the authoring language as a core property too;
        # use it instead of always returning an empty string.
        "language": core_props.language or "",
        "edition": "",
    }

    # Fallback: scan the first paragraphs for labelled metadata lines.
    # Common pattern: "TITRE: ...", "AUTEUR: ...", "EDITION: ...".
    # Each branch is guarded by "field still empty" so a present core-props
    # value (e.g. title) is not clobbered while we look for another field.
    if not metadata["title"] or not metadata["author"]:
        for para in doc.paragraphs[:10]:  # Check first 10 paragraphs
            text = para.text.strip()
            upper = text.upper()

            # Match patterns like "TITRE : On the Origin..."
            if not metadata["title"] and upper.startswith("TITRE") and ":" in text:
                metadata["title"] = text.split(":", 1)[1].strip()

            # Match patterns like "AUTEUR : Charles DARWIN"
            elif not metadata["author"] and upper.startswith("AUTEUR") and ":" in text:
                metadata["author"] = text.split(":", 1)[1].strip()
            # Match patterns like "AUTEUR Charles DARWIN" (no colon)
            elif not metadata["author"] and upper.startswith("AUTEUR "):
                metadata["author"] = text[len("AUTEUR "):].strip()

            # Match patterns like "EDITION : Sixth London Edition..."
            elif not metadata["edition"] and upper.startswith("EDITION") and ":" in text:
                metadata["edition"] = text.split(":", 1)[1].strip()

    return metadata
|
||||
|
||||
|
||||
def _get_heading_level(style_name: str) -> Optional[int]:
|
||||
"""Extract heading level from Word style name.
|
||||
|
||||
Args:
|
||||
style_name: Word paragraph style name (e.g., "Heading 1", "Heading 2").
|
||||
|
||||
Returns:
|
||||
Heading level (1-9) if it's a heading style, None otherwise.
|
||||
|
||||
Example:
|
||||
>>> _get_heading_level("Heading 1")
|
||||
1
|
||||
>>> _get_heading_level("Heading 3")
|
||||
3
|
||||
>>> _get_heading_level("Normal")
|
||||
None
|
||||
"""
|
||||
# Match patterns: "Heading 1", "Heading 2", etc.
|
||||
match = re.match(r"Heading (\d)", style_name)
|
||||
if match:
|
||||
level = int(match.group(1))
|
||||
return level if 1 <= level <= 9 else None
|
||||
return None
|
||||
|
||||
|
||||
def extract_word_images(
    doc: Document,
    output_dir: Path,
    doc_name: str,
) -> List[Path]:
    """Extract inline images from Word document.

    Walks the document part's relationships, saving every image target to
    *output_dir* with a sequential, zero-based index in the filename.
    Extraction is best-effort: a failure on one image is reported on stdout
    and does not abort the remaining ones.

    Args:
        doc: python-docx Document object.
        output_dir: Directory to save extracted images.
        doc_name: Document name for image filename prefix.

    Returns:
        List of paths to extracted image files.

    Example:
        >>> doc = Document("doc.docx")
        >>> images = extract_word_images(doc, Path("output"), "darwin")
        >>> print(f"Extracted {len(images)} images")
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    saved_paths: List[Path] = []
    image_counter = 0

    # Images live behind relationships on the main document part.
    for relationship in doc.part.rels.values():
        if "image" not in relationship.target_ref:
            continue

        try:
            part = relationship.target_part
            image_data = part.blob
            mime = part.content_type

            # Pick a file extension from the MIME type; PNG is the fallback.
            if "jpeg" in mime or "jpg" in mime:
                ext = "jpg"
            elif "png" in mime:
                ext = "png"
            elif "gif" in mime:
                ext = "gif"
            else:
                ext = "png"

            destination = output_dir / f"{doc_name}_image_{image_counter}.{ext}"
            with open(destination, "wb") as handle:
                handle.write(image_data)

            saved_paths.append(destination)
            # Only advance the index on success, mirroring the warning below
            # which reports the index of the image that failed.
            image_counter += 1

        except Exception as e:
            print(f"Warning: Failed to extract image {image_counter}: {e}")

    return saved_paths
|
||||
|
||||
|
||||
def extract_word_content(docx_path: Path) -> Dict[str, Any]:
    """Extract complete structured content from Word document.

    Main extraction function that processes a Word document and extracts:
    - Full text content
    - Paragraph structure with styles
    - Heading hierarchy
    - Images (if any)
    - Raw metadata

    Args:
        docx_path: Path to the .docx file.

    Returns:
        Dictionary containing:
        - raw_text (str): Complete document text
        - paragraphs (List[Dict]): List of paragraph dicts with:
            - index (int): Paragraph index
            - style (str): Word style name
            - text (str): Paragraph text content
            - level (Optional[int]): Heading level (1-9) if heading
            - is_heading (bool): True if paragraph is a heading
        - headings (List[Dict]): List of heading paragraphs only
        - metadata_raw (Dict): Raw metadata from core properties
        - total_paragraphs (int): Total paragraph count
        - has_images (bool): Whether document contains images

    Raises:
        FileNotFoundError: If docx_path does not exist.
        ValueError: If file is not a valid .docx document.

    Example:
        >>> content = extract_word_content(Path("darwin.docx"))
        >>> print(f"Document has {content['total_paragraphs']} paragraphs")
        >>> print(f"Found {len(content['headings'])} headings")
        >>> for h in content['headings']:
        ...     print(f"H{h['level']}: {h['text'][:50]}")
    """
    if not docx_path.exists():
        raise FileNotFoundError(f"Word document not found: {docx_path}")

    if docx_path.suffix.lower() != ".docx":
        raise ValueError(f"File must be .docx format: {docx_path}")

    # Load document
    doc = Document(docx_path)

    # Extract metadata (re-opens the file via extract_word_metadata; cheap
    # enough for typical documents, and keeps that helper's interface simple)
    metadata_raw = extract_word_metadata(docx_path)

    # Process paragraphs
    paragraphs: List[Dict[str, Any]] = []
    headings: List[Dict[str, Any]] = []
    full_text_parts: List[str] = []

    for idx, para in enumerate(doc.paragraphs):
        text = para.text.strip()
        style_name = para.style.name

        # Determine if this is a heading and its level
        heading_level = _get_heading_level(style_name)
        is_heading = heading_level is not None

        para_dict = {
            "index": idx,
            "style": style_name,
            "text": text,
            "level": heading_level,
            "is_heading": is_heading,
        }

        paragraphs.append(para_dict)

        # Empty headings (style applied to a blank line) are kept in
        # `paragraphs` but excluded from the heading list.
        if is_heading and text:
            headings.append(para_dict)

        # Add to full text (skip empty paragraphs)
        if text:
            full_text_parts.append(text)

    raw_text = "\n\n".join(full_text_parts)

    # Detect images by looking for actual image relationships. Counting
    # relationships (> 1) is not a valid signal: every .docx carries several
    # structural relationships (styles, settings, fonts), so that test was
    # effectively always True.
    has_images = any(
        "image" in rel.target_ref for rel in doc.part.rels.values()
    )

    return {
        "raw_text": raw_text,
        "paragraphs": paragraphs,
        "headings": headings,
        "metadata_raw": metadata_raw,
        "total_paragraphs": len(paragraphs),
        "has_images": has_images,
    }
|
||||
|
||||
|
||||
def build_markdown_from_word(
    paragraphs: List[Dict[str, Any]],
    skip_metadata_lines: int = 5,
) -> str:
    """Build Markdown text from Word document paragraphs.

    Renders the paragraph dicts produced by extract_word_content() as
    Markdown suitable for the existing RAG pipeline: a paragraph with
    heading level N becomes an ATX header with N leading '#' characters,
    everything else is emitted verbatim, and each rendered paragraph is
    followed by a blank line. Empty paragraphs are dropped.

    Args:
        paragraphs: List of paragraph dicts from extract_word_content().
        skip_metadata_lines: Number of initial paragraphs to skip (metadata).
            Default: 5 (skip TITRE, AUTEUR, EDITION lines).

    Returns:
        Markdown-formatted text.

    Example:
        >>> content = extract_word_content(Path("doc.docx"))
        >>> markdown = build_markdown_from_word(content["paragraphs"])
        >>> with open("output.md", "w") as f:
        ...     f.write(markdown)
    """
    rendered: List[str] = []

    for entry in paragraphs[skip_metadata_lines:]:
        content = entry["text"]
        if not content:
            continue

        depth = entry["level"]
        if entry["is_heading"] and depth:
            # Heading N -> N '#' characters, then the heading text.
            line = f"{'#' * depth} {content}"
        else:
            line = content

        # Trailing empty string yields the blank line separating paragraphs.
        rendered.extend((line, ""))

    return "\n".join(rendered).strip()
|
||||
Reference in New Issue
Block a user