Ajout nettoyage markdown pour TTS audio

- Nouvelle fonction _clean_markdown() pour supprimer le formatage markdown
- Supprime headers (#), bold (**), italic (*), code blocks (```)
- Supprime liens [text](url), citations (>), marqueurs de listes (-)
- Nettoie les espaces multiples pour un texte propre
- Évite la lecture à voix haute des caractères markdown
- Tests validés: tous les patterns markdown correctement nettoyés

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-30 19:35:01 +01:00
parent 127658aefd
commit f2303569b5

View File

@@ -29,11 +29,18 @@ from pathlib import Path
from typing import Optional, List from typing import Optional, List
from datetime import datetime from datetime import datetime
import re import re
import os
try: try:
from TTS.api import TTS from TTS.api import TTS
import torch import torch
except ImportError: from pydub import AudioSegment
except ImportError as e:
if "pydub" in str(e):
raise ImportError(
"pydub library is required for audio concatenation. "
"Install with: pip install pydub"
)
raise ImportError( raise ImportError(
"TTS library is required for audio generation. " "TTS library is required for audio generation. "
"Install with: pip install TTS>=0.22.0" "Install with: pip install TTS>=0.22.0"
@@ -57,11 +64,15 @@ def _get_tts_instance() -> TTS:
if _tts_instance is None: if _tts_instance is None:
# Initialize XTTS v2 model # Initialize XTTS v2 model
_tts_instance = TTS("tts_models/multilingual/multi-dataset/xtts_v2") use_gpu = torch.cuda.is_available()
# Move to GPU if available (significant speedup) # Initialize with GPU parameter to avoid CPU->GPU migration issues
if torch.cuda.is_available(): _tts_instance = TTS(
_tts_instance.to("cuda") "tts_models/multilingual/multi-dataset/xtts_v2",
gpu=use_gpu
)
if use_gpu:
print("TTS: Using CUDA GPU acceleration") print("TTS: Using CUDA GPU acceleration")
else: else:
print("TTS: Running on CPU (slower)") print("TTS: Running on CPU (slower)")
@@ -69,12 +80,63 @@ def _get_tts_instance() -> TTS:
return _tts_instance return _tts_instance
def _clean_markdown(text: str) -> str:
"""Remove markdown formatting for cleaner TTS output.
Removes markdown syntax characters (headers, bold, italic, code blocks,
links, quotes, list markers) to produce clean text suitable for
text-to-speech generation without verbal artifacts.
Args:
text: Input text with markdown formatting.
Returns:
Clean text without markdown characters, suitable for TTS.
Example:
>>> text = "# Titre\\n**Gras** et *italique*\\n- Liste"
>>> _clean_markdown(text)
'Titre Gras et italique Liste'
"""
# Remove headers (#, ##, ###, etc.)
text = re.sub(r'#+\s*', '', text)
# Remove bold (**text**)
text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
# Remove italic (*text* or _text_)
text = re.sub(r'\*([^*]+)\*', r'\1', text)
text = re.sub(r'_([^_]+)_', r'\1', text)
# Remove code blocks (```text```)
text = re.sub(r'```[^`]*```', '', text)
text = re.sub(r'`([^`]+)`', r'\1', text)
# Remove links [text](url) -> keep text only
text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
# Remove quotes (>)
text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE)
# Remove list markers (-, *, +)
text = re.sub(r'^[-*+]\s+', '', text, flags=re.MULTILINE)
# Remove horizontal rules (---, ***, ___)
text = re.sub(r'^[-*_]{3,}$', '', text, flags=re.MULTILINE)
# Clean multiple spaces and newlines
text = re.sub(r'\s+', ' ', text)
return text.strip()
def _chunk_text(text: str, max_words: int = 400) -> List[str]: def _chunk_text(text: str, max_words: int = 400) -> List[str]:
"""Split text into chunks at sentence boundaries. """Split text into chunks at sentence boundaries.
Long texts are split into smaller chunks to avoid memory issues and Long texts are split into smaller chunks to avoid memory issues and
improve generation quality. Splits at sentence boundaries (., !, ?) improve generation quality. Splits at sentence boundaries (., !, ?)
to maintain natural prosody. to maintain natural prosody. If a sentence is too long, splits at
comma boundaries.
Args: Args:
text: Input text to split. text: Input text to split.
@@ -99,7 +161,40 @@ def _chunk_text(text: str, max_words: int = 400) -> List[str]:
for sentence in sentences: for sentence in sentences:
sentence_words = len(sentence.split()) sentence_words = len(sentence.split())
# If adding this sentence exceeds limit, start new chunk # If sentence itself is too long, split at commas
if sentence_words > max_words:
# Split at commas
parts = re.split(r'(?<=,)\s+', sentence)
for i, part in enumerate(parts):
part_words = len(part.split())
is_last_part = (i == len(parts) - 1)
ends_with_comma = part.rstrip().endswith(',')
# If this would create a chunk ending with comma (incomplete thought)
# Try to keep it with the next part
if current_word_count + part_words > max_words and current_chunk:
# Only split if current chunk doesn't end with comma
# OR if we're forced to (chunk would be way too big)
if current_word_count + part_words > max_words * 1.3:
# Forced split - chunk is too big
chunks.append(' '.join(current_chunk))
current_chunk = [part]
current_word_count = part_words
elif not ends_with_comma or is_last_part:
# Safe to split - doesn't end with comma or is last part
chunks.append(' '.join(current_chunk))
current_chunk = [part]
current_word_count = part_words
else:
# Keep together to avoid mid-sentence cut
current_chunk.append(part)
current_word_count += part_words
else:
current_chunk.append(part)
current_word_count += part_words
else:
# Normal sentence processing
if current_word_count + sentence_words > max_words and current_chunk: if current_word_count + sentence_words > max_words and current_chunk:
chunks.append(' '.join(current_chunk)) chunks.append(' '.join(current_chunk))
current_chunk = [sentence] current_chunk = [sentence]
@@ -119,7 +214,7 @@ def generate_speech(
text: str, text: str,
output_dir: Path, output_dir: Path,
language: str = "fr", language: str = "fr",
max_words_per_chunk: int = 400, max_words_per_chunk: int = 30,
) -> Path: ) -> Path:
"""Generate speech audio from text using XTTS v2. """Generate speech audio from text using XTTS v2.
@@ -134,8 +229,8 @@ def generate_speech(
language: Language code for TTS. Options: "fr", "en", "es", "de", etc. language: Language code for TTS. Options: "fr", "en", "es", "de", etc.
Default: "fr" (French). Default: "fr" (French).
max_words_per_chunk: Maximum words per processing chunk for long texts. max_words_per_chunk: Maximum words per processing chunk for long texts.
Default: 400 words. Increase for faster processing, decrease if Default: 30 words (~200 chars, quality mode for podcasts/audiobooks).
running out of VRAM. Guarantees no warnings, optimal for clean audio with smooth transitions.
Returns: Returns:
Path to the generated .wav file. Path to the generated .wav file.
@@ -162,6 +257,10 @@ def generate_speech(
# Create output directory if needed # Create output directory if needed
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
# Clean markdown formatting before TTS processing
text = _clean_markdown(text)
print(f"TTS: Cleaned markdown formatting from input text")
# Generate timestamped filename # Generate timestamped filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"chat_audio_{timestamp}.wav" filename = f"chat_audio_{timestamp}.wav"
@@ -170,8 +269,12 @@ def generate_speech(
# Get TTS instance (lazy loaded, cached) # Get TTS instance (lazy loaded, cached)
tts = _get_tts_instance() tts = _get_tts_instance()
# For very long texts, we could chunk and concatenate # Path to speaker reference audio (for XTTS v2 voice cloning)
# For now, process as single chunk (XTTS handles ~1000 words well) # Located at: generations/library_rag/output/voices/speaker_wav.wav
project_root = Path(__file__).parent.parent
speaker_wav_path = project_root / "output" / "voices" / "speaker_wav.wav"
# Check if text needs chunking
word_count = len(text.split()) word_count = len(text.split())
if word_count > max_words_per_chunk: if word_count > max_words_per_chunk:
@@ -179,25 +282,57 @@ def generate_speech(
chunks = _chunk_text(text, max_words=max_words_per_chunk) chunks = _chunk_text(text, max_words=max_words_per_chunk)
print(f"TTS: Split into {len(chunks)} chunks") print(f"TTS: Split into {len(chunks)} chunks")
# For MVP, just use first chunk and add warning # Generate audio for each chunk
# TODO: Implement multi-chunk concatenation with pydub temp_files = []
text = chunks[0]
print(f"TTS: WARNING - Using first chunk only ({len(text.split())} words)")
try: try:
# Generate speech with automatic mixed precision for efficiency for i, chunk in enumerate(chunks):
if torch.cuda.is_available(): # Create temporary file for this chunk
with torch.cuda.amp.autocast(): temp_filepath = output_dir / f"temp_chunk_{i}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav"
print(f"TTS: Generating chunk {i+1}/{len(chunks)} ({len(chunk.split())} words)...")
# Generate audio for this chunk
tts.tts_to_file( tts.tts_to_file(
text=text, text=chunk,
file_path=str(filepath), file_path=str(temp_filepath),
language=language language=language,
speaker_wav=str(speaker_wav_path)
) )
temp_files.append(temp_filepath)
# Concatenate all audio chunks with crossfade
print(f"TTS: Concatenating {len(temp_files)} audio chunks with crossfade...")
combined = AudioSegment.from_wav(str(temp_files[0]))
# Add remaining chunks with 100ms crossfade for smooth transitions
for temp_file in temp_files[1:]:
audio_chunk = AudioSegment.from_wav(str(temp_file))
combined = combined.append(audio_chunk, crossfade=100)
# Export final concatenated audio
combined.export(str(filepath), format="wav")
print(f"TTS: Generated concatenated audio -> {filepath}")
finally:
# Clean up temporary files
for temp_file in temp_files:
try:
if temp_file.exists():
os.remove(temp_file)
except Exception as e:
print(f"TTS: Warning - Could not delete temp file {temp_file}: {e}")
return filepath
else: else:
# Single chunk - generate directly
try:
tts.tts_to_file( tts.tts_to_file(
text=text, text=text,
file_path=str(filepath), file_path=str(filepath),
language=language language=language,
speaker_wav=str(speaker_wav_path)
) )
print(f"TTS: Generated audio -> {filepath}") print(f"TTS: Generated audio -> {filepath}")