Add Library RAG project and cleanup root directory
- Add complete Library RAG application (Flask + MCP server)
- PDF processing pipeline with OCR and LLM extraction
- Weaviate vector database integration (BGE-M3 embeddings)
- Flask web interface with search and document management
- MCP server for Claude Desktop integration
- Comprehensive test suite (134 tests)
- Clean up root directory: remove obsolete documentation files, backup and temporary files
- Update autonomous agent configuration and prompts
- Enhance initializer bis prompt with better instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
91
generations/library_rag/examples/example_direct_pipeline.py
Normal file
91
generations/library_rag/examples/example_direct_pipeline.py
Normal file
@@ -0,0 +1,91 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Exemple d'utilisation DIRECTE du pipeline PDF (sans MCP).
|
||||
|
||||
Plus simple et plus de contrôle sur les paramètres!
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from utils.pdf_pipeline import process_pdf, process_pdf_bytes
|
||||
import weaviate
|
||||
from weaviate.classes.query import Filter
|
||||
|
||||
|
||||
def example_process_local_file():
|
||||
"""Traiter un fichier local (PDF ou Markdown)."""
|
||||
|
||||
result = process_pdf(
|
||||
pdf_path=Path("md/peirce_collected_papers_fixed.md"),
|
||||
output_dir=Path("output"),
|
||||
|
||||
# Paramètres personnalisables
|
||||
skip_ocr=True, # Déjà en Markdown
|
||||
use_llm=False, # Pas besoin de LLM pour Peirce
|
||||
use_semantic_chunking=False, # Chunking basique (rapide)
|
||||
ingest_to_weaviate=True, # Ingérer dans Weaviate
|
||||
)
|
||||
|
||||
if result.get("success"):
|
||||
print(f"✓ {result['document_name']}: {result['chunks_count']} chunks")
|
||||
print(f" Coût total: {result['cost_total']:.4f}€")
|
||||
else:
|
||||
print(f"✗ Erreur: {result.get('error')}")
|
||||
|
||||
|
||||
def example_process_from_url():
|
||||
"""Télécharger et traiter depuis une URL."""
|
||||
|
||||
import httpx
|
||||
|
||||
url = "https://example.com/document.pdf"
|
||||
|
||||
# Télécharger
|
||||
response = httpx.get(url, follow_redirects=True)
|
||||
pdf_bytes = response.content
|
||||
|
||||
# Traiter
|
||||
result = process_pdf_bytes(
|
||||
file_bytes=pdf_bytes,
|
||||
filename="document.pdf",
|
||||
output_dir=Path("output"),
|
||||
|
||||
# Paramètres optimaux
|
||||
use_llm=True,
|
||||
llm_provider="mistral", # Ou "ollama"
|
||||
use_semantic_chunking=True,
|
||||
ingest_to_weaviate=True,
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def example_search():
|
||||
"""Rechercher directement dans Weaviate."""
|
||||
|
||||
client = weaviate.connect_to_local()
|
||||
|
||||
try:
|
||||
collection = client.collections.get('Chunk')
|
||||
|
||||
# Recherche sémantique
|
||||
response = collection.query.near_text(
|
||||
query="nominalism and realism",
|
||||
limit=10,
|
||||
)
|
||||
|
||||
print(f"Trouvé {len(response.objects)} résultats:")
|
||||
for obj in response.objects[:3]:
|
||||
props = obj.properties
|
||||
print(f"\n- {props.get('sectionPath', 'N/A')}")
|
||||
print(f" {props.get('text', '')[:150]}...")
|
||||
|
||||
finally:
|
||||
client.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Choisir un exemple
|
||||
|
||||
# example_process_local_file()
|
||||
# example_process_from_url()
|
||||
example_search()
|
||||
Reference in New Issue
Block a user