Add Library RAG project and cleanup root directory

- Add complete Library RAG application (Flask + MCP server)
  - PDF processing pipeline with OCR and LLM extraction
  - Weaviate vector database integration (BGE-M3 embeddings)
  - Flask web interface with search and document management
  - MCP server for Claude Desktop integration
  - Comprehensive test suite (134 tests)
- Clean up root directory
  - Remove obsolete documentation files
  - Remove backup and temporary files
  - Update autonomous agent configuration
- Update prompts
  - Enhance initializer bis prompt with better instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-30 11:57:12 +01:00
parent 48470236da
commit d2f7165120
84 changed files with 26517 additions and 2 deletions
--- a/generations/library_rag/templates/upload.html
+++ b/generations/library_rag/templates/upload.html
@@ -0,0 +1,178 @@
+{% extends "base.html" %}
+
+{% block title %}Upload Document{% endblock %}
+
+{% block content %}
+<section class="section">
+    <h1>📄 Parser PDF/Markdown</h1>
+    <p class="lead">Uploadez un fichier PDF ou Markdown pour l'analyser et structurer son contenu</p>
+
+    {% if error %}
+        <!-- Rendered only when the view passes an `error` value; role="alert" exposes it to assistive technology as an alert -->
+        <div class="alert alert-warning" role="alert">
+            <strong>Erreur :</strong> {{ error }}
+        </div>
+    {% endif %}
+
+    <div class="search-box">
+        <form method="post" enctype="multipart/form-data">
+            <!-- File picker. The two hints below are referenced by aria-describedby so they are announced together with the field. -->
+            <div class="form-group">
+                <label class="form-label" for="file">Fichier PDF ou Markdown</label>
+                <input
+                    type="file"
+                    name="file"
+                    id="file"
+                    class="form-control"
+                    accept=".pdf,.md"
+                    required
+                    aria-describedby="file-size-hint file-skip-ocr-hint"
+                >
+                <div class="caption mt-1" id="file-size-hint">Taille maximale : 50 MB</div>
+                <div class="caption" id="file-skip-ocr-hint" style="color: var(--color-accent); margin-top: 0.25rem;">
+                    💡 Pour retester un document existant sans refaire l'OCR payant, cochez "Skip OCR"
+                </div>
+            </div>
+
+            <div class="form-row mt-3">
+                <div class="form-group">
+                    <!-- Caption for the checkbox group. It labels no single control, so it is tied to the
+                         group container through aria-labelledby; the element itself is left unchanged. -->
+                    <label class="form-label" id="processing-options-label">Options</label>
+                    <div role="group" aria-labelledby="processing-options-label" style="display: flex; flex-direction: column; gap: 0.5rem;">
+                        <!-- skip_ocr: unchecked by default -->
+                        <div style="display: flex; align-items: center; gap: 0.5rem;">
+                            <input
+                                type="checkbox"
+                                name="skip_ocr"
+                                id="skip_ocr"
+                                style="width: auto;"
+                            >
+                            <label for="skip_ocr" style="margin: 0; font-size: 0.95rem; text-transform: none; letter-spacing: 0;">
+                                ⚡ Skip OCR (réutiliser markdown existant)
+                            </label>
+                        </div>
+                        <!-- use_llm: checked by default. The label names both providers offered by the
+                             llm_provider select instead of Ollama only. -->
+                        <div style="display: flex; align-items: center; gap: 0.5rem;">
+                            <input
+                                type="checkbox"
+                                name="use_llm"
+                                id="use_llm"
+                                checked
+                                style="width: auto;"
+                            >
+                            <label for="use_llm" style="margin: 0; font-size: 0.95rem; text-transform: none; letter-spacing: 0;">
+                                Activer la structuration LLM (Mistral ou Ollama)
+                            </label>
+                        </div>
+                        <!-- ingest_weaviate: checked by default -->
+                        <div style="display: flex; align-items: center; gap: 0.5rem;">
+                            <input
+                                type="checkbox"
+                                name="ingest_weaviate"
+                                id="ingest_weaviate"
+                                checked
+                                style="width: auto;"
+                            >
+                            <label for="ingest_weaviate" style="margin: 0; font-size: 0.95rem; text-transform: none; letter-spacing: 0;">
+                                Insérer dans Weaviate (vectorisation)
+                            </label>
+                        </div>
+                    </div>
+                </div>
+                <!-- LLM provider selector; a change calls updateModelOptions() to swap the model list -->
+                <div class="form-group">
+                    <label for="llm_provider" class="form-label">Provider LLM</label>
+                    <select
+                        class="form-control"
+                        id="llm_provider"
+                        name="llm_provider"
+                        onchange="updateModelOptions()"
+                    >
+                        <option selected value="mistral">⚡ Mistral API (rapide)</option>
+                        <option value="ollama">🖥️ Ollama (local, lent)</option>
+                    </select>
+                </div>
+                <!-- Model selector; the initial options match the default "mistral" provider and are rewritten by updateModelOptions() -->
+                <div class="form-group">
+                    <label for="llm_model" class="form-label">Modèle LLM</label>
+                    <select
+                        class="form-control"
+                        id="llm_model"
+                        name="llm_model"
+                    >
+                        <!-- Mistral API options -->
+                        <option value="mistral-small-latest" selected>mistral-small (rapide, économique)</option>
+                        <option value="mistral-medium-latest">mistral-medium (équilibré)</option>
+                        <option value="mistral-large-latest">mistral-large (puissant)</option>
+                    </select>
+                </div>
+                <script>
+                function updateModelOptions() {
+                    const provider = document.getElementById('llm_provider').value;
+                    const modelSelect = document.getElementById('llm_model');
+                    
+                    if (provider === 'mistral') {
+                        modelSelect.innerHTML = `
+                            <option value="mistral-small-latest" selected>mistral-small (rapide, économique)</option>
+                            <option value="mistral-medium-latest">mistral-medium (équilibré)</option>
+                            <option value="mistral-large-latest">mistral-large (puissant)</option>
+                        `;
+                    } else {
+                        modelSelect.innerHTML = `
+                            <option value="qwen2.5:7b" selected>qwen2.5:7b (recommandé)</option>
+                            <option value="qwen2.5:14b">qwen2.5:14b</option>
+                            <option value="llama3.2:3b">llama3.2:3b (rapide)</option>
+                            <option value="mistral:7b">mistral:7b</option>
+                        `;
+                    }
+                }
+                </script>
+            </div>
+
+            <!-- Enhanced TOC extraction options -->
+            <div class="card mt-4" style="border-left: 3px solid #4CAF50;">
+                <h4 style="color: #4CAF50;">📑 Extraction TOC améliorée (Recommandé)</h4>
+                <p style="font-size: 0.9rem; color: #666;">
+                    Analyse l'indentation du texte pour détecter automatiquement la hiérarchie de la table des matières.
+                    <br><strong style="color: #4CAF50;">✅ Fiable, rapide et sans coût supplémentaire</strong>
+                </p>
+                <!-- Checked by default and submitted as "use_ocr_annotations", while the visible label
+                     describes indentation analysis. NOTE(review): confirm the backend maps this field
+                     to the indentation-based TOC analysis. -->
+                <div style="display: flex; align-items: center; gap: 0.5rem; margin-top: 1rem;">
+                    <input 
+                        type="checkbox" 
+                        name="use_ocr_annotations" 
+                        id="use_ocr_annotations"
+                        style="width: auto;"
+                        checked
+                    >
+                    <label for="use_ocr_annotations" style="margin: 0; font-size: 0.95rem; font-weight: 600;">
+                        Activer l'analyse d'indentation pour la TOC
+                    </label>
+                </div>
+                <!-- Explanatory note shown under the checkbox -->
+                <div style="margin-top: 0.75rem; padding: 0.75rem; background: #f0f9f0; border-radius: 4px; font-size: 0.85rem;">
+                    <strong>Fonctionnement :</strong> Détecte les niveaux hiérarchiques en comptant les espaces d'indentation dans la table des matières.
+                    <br>
+                    <em>Idéal pour les documents académiques avec TOC structurée.</em>
+                </div>
+            </div>
+
+            <div class="mt-3">
+                <button type="submit" class="btn btn-primary">
+                    Analyser le document
+                </button>
+            </div>
+        </form>
+    </div>
+
+    <hr class="divider">
+
+    <!-- Static summary of the processing steps. Step 4 names both LLM providers offered by the form
+         (Mistral API is the default selection) rather than Ollama only. -->
+    <div class="card">
+        <h3>📋 Pipeline de traitement</h3>
+        <div class="mt-2">
+            <p><strong>1. OCR Mistral</strong> — Extraction du texte et des images via l'API Mistral</p>
+            <p><strong>2. Markdown</strong> — Construction du document Markdown avec images</p>
+            <p><strong>3. Hiérarchie</strong> — Analyse des titres pour créer une structure arborescente</p>
+            <p><strong>4. LLM (optionnel)</strong> — Amélioration de la structure via le provider LLM choisi (Mistral API ou Ollama)</p>
+        </div>
+    </div>
+
+    <!-- Static list of output artifacts: each badge holds a file or folder name, followed by a short description.
+         NOTE(review): the names are hard-coded in this template; confirm they match what the pipeline writes. -->
+    <div class="card mt-3">
+        <h3>📁 Fichiers générés</h3>
+        <div class="mt-2">
+            <ul style="list-style: none;">
+                <li class="mb-1"><span class="badge">document.md</span> Texte Markdown OCR</li>
+                <li class="mb-1"><span class="badge">document_chunks.json</span> Chunks hiérarchiques</li>
+                <li class="mb-1"><span class="badge">document_structured.json</span> Structure LLM</li>
+                <li class="mb-1"><span class="badge">document_ocr.json</span> Réponse OCR brute</li>
+                <li><span class="badge">images/</span> Images extraites</li>
+            </ul>
+        </div>
+    </div>
+
+    <div class="text-center mt-4">
+        <a href="/documents" class="btn">Voir les documents traités</a>
+    </div>
+</section>
+{% endblock %}
+