Add Library RAG project and cleanup root directory

- Add complete Library RAG application (Flask + MCP server)
  - PDF processing pipeline with OCR and LLM extraction
  - Weaviate vector database integration (BGE-M3 embeddings)
  - Flask web interface with search and document management
  - MCP server for Claude Desktop integration
  - Comprehensive test suite (134 tests)

- Clean up root directory
  - Remove obsolete documentation files
  - Remove backup and temporary files
  - Update autonomous agent configuration

- Update prompts
  - Enhance initializer bis prompt with better instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-30 11:57:12 +01:00
parent 48470236da
commit d2f7165120
84 changed files with 26517 additions and 2 deletions

View File

@@ -0,0 +1,91 @@
#!/usr/bin/env python3
"""
Example of DIRECT use of the PDF pipeline (without going through MCP).

Simpler, and gives more control over the parameters!
"""
from pathlib import Path
from utils.pdf_pipeline import process_pdf, process_pdf_bytes
import weaviate
# NOTE(review): Filter appears unused in this file — confirm before removing.
from weaviate.classes.query import Filter
def example_process_local_file():
    """Run the pipeline on a local file (PDF or Markdown) and report the outcome."""
    result = process_pdf(
        pdf_path=Path("md/peirce_collected_papers_fixed.md"),
        output_dir=Path("output"),
        # Tunable parameters
        skip_ocr=True,                # input is already Markdown — no OCR pass
        use_llm=False,                # no LLM extraction needed for this corpus
        use_semantic_chunking=False,  # basic chunking (fast)
        ingest_to_weaviate=True,      # push the resulting chunks into Weaviate
    )
    # Guard clause: report failure and bail out early.
    if not result.get("success"):
        print(f"✗ Erreur: {result.get('error')}")
        return
    print(f"{result['document_name']}: {result['chunks_count']} chunks")
    print(f" Coût total: {result['cost_total']:.4f}")
def example_process_from_url():
    """Download a document from a URL and run it through the pipeline.

    Returns the result dict produced by ``process_pdf_bytes``.
    """
    import httpx

    url = "https://example.com/document.pdf"
    # Fetch the raw bytes, following any HTTP redirects.
    pdf_bytes = httpx.get(url, follow_redirects=True).content
    # Process with the recommended (highest-quality) settings.
    return process_pdf_bytes(
        file_bytes=pdf_bytes,
        filename="document.pdf",
        output_dir=Path("output"),
        use_llm=True,
        llm_provider="mistral",  # or "ollama"
        use_semantic_chunking=True,
        ingest_to_weaviate=True,
    )
def example_search():
    """Query Weaviate directly for chunks semantically close to a phrase."""
    client = weaviate.connect_to_local()
    try:
        chunks = client.collections.get('Chunk')
        # Semantic (vector) search over the chunk collection.
        hits = chunks.query.near_text(
            query="nominalism and realism",
            limit=10,
        )
        print(f"Trouvé {len(hits.objects)} résultats:")
        # Show a short preview of the top three hits.
        for hit in hits.objects[:3]:
            fields = hit.properties
            print(f"\n- {fields.get('sectionPath', 'N/A')}")
            print(f" {fields.get('text', '')[:150]}...")
    finally:
        # Always release the client connection, even if the query fails.
        client.close()
if __name__ == "__main__":
    # Choose an example to run (uncomment one of the alternatives).
    # example_process_local_file()
    # example_process_from_url()
    example_search()