File size: 5,239 Bytes
67dff27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# voice_narration_tool.py
# ------------- libraries to install ------------- #
# pip install g2p-en nltk smolagents torch soundfile transformers
import torch
import soundfile as sf
import nltk
from transformers import FastSpeech2ConformerTokenizer, FastSpeech2ConformerWithHifiGan
from smolagents import tool
import os
import warnings
# Suppress all warnings process-wide (transformers/torch emit many on model load).
# NOTE(review): this silences warnings for the whole process, not just this module.
warnings.filterwarnings("ignore")
# NOTE(review): 'all' downloads every NLTK corpus (several GB); the g2p-en
# phonemizer only needs tagger/tokenizer data — consider narrowing this.
nltk.download('all')

# ----------------------------------------
# Global FastSpeech2 model for voice narration, loaded once at import time so
# both tools below share a single tokenizer/model instance.
# The tokenizer comes from the base FastSpeech2Conformer repo; the model repo
# bundles the acoustic model with a HiFi-GAN vocoder, so forward passes emit
# waveforms directly (see `.waveform` usage below).
VOICE_MODEL = "espnet/fastspeech2_conformer_with_hifigan"
_tokenizer_voice = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
_model_voice = FastSpeech2ConformerWithHifiGan.from_pretrained(VOICE_MODEL)
_model_voice.eval()  # inference mode: disables dropout etc.
# ----------------------------------------

@tool
def generate_voice_narration(text: str, output_filename: str = "narration.wav") -> str:
    """
    Generate voice narration from text using the FastSpeech2 TTS model.

    Args:
        text (str): The text to convert to speech
        output_filename (str): Output audio filename (default: "narration.wav")

    Returns:
        str: Path message for the generated audio file, or an "Error: ..."
             string. Errors are returned (not raised) so the agent's
             tool-calling loop can surface them to the model.
    """
    try:
        # Reject empty / whitespace-only input before touching the model.
        clean_text = text.strip()
        if not clean_text:
            return "Error: Empty text provided"

        # Tokenize the input text into model-ready tensors.
        inputs = _tokenizer_voice(clean_text, return_tensors="pt")

        with torch.no_grad():
            # Run inference on whatever device the model actually lives on.
            # `.to(device)` is a no-op for tensors already there — this
            # replaces the old CUDA-only special case, which never moved the
            # model in the first place and so was dead code.
            device = next(_model_voice.parameters()).device
            inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v
                      for k, v in inputs.items()}

            # The HiFi-GAN-bundled model returns the waveform directly.
            output = _model_voice(**inputs)
            audio = output.waveform.squeeze().cpu().numpy()

        # Create the parent directory only when the path actually has one
        # (os.makedirs(".") was a pointless call for bare filenames).
        out_dir = os.path.dirname(output_filename)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        # 22050 Hz is the output rate of FastSpeech2Conformer + HiFi-GAN.
        sample_rate = 22050
        sf.write(output_filename, audio, samplerate=sample_rate)

        return f"Voice narration saved to: {output_filename}"

    except Exception as e:  # broad by design: a tool must not crash the agent
        return f"Error generating voice narration: {str(e)}"

@tool
def generate_story_narration(story_text: str, chapter_name: str = "chapter") -> str:
    """
    Generate voice narration for story text, splitting longer texts into
    sentences, synthesizing each one, and concatenating the audio.

    Args:
        story_text (str): The story text to narrate
        chapter_name (str): Name prefix for the output file

    Returns:
        str: Status message with the output file path and duration, or an
             "Error: ..." string. Errors are returned (not raised) so the
             agent's tool-calling loop can surface them to the model.
    """
    try:
        # Reject empty / whitespace-only input before touching the model.
        clean_text = story_text.strip()
        if not clean_text:
            return "Error: Empty story text provided"

        output_filename = f"{chapter_name}_narration.wav"

        # Split into sentences so each chunk stays within the model's
        # comfortable input length. (The previous version's docstring claimed
        # to do this but the body fed the entire text in at once.)
        sentences = nltk.sent_tokenize(clean_text) or [clean_text]

        device = next(_model_voice.parameters()).device
        waveforms = []
        with torch.no_grad():
            for sentence in sentences:
                inputs = _tokenizer_voice(sentence, return_tensors="pt")
                # `.to(device)` is a no-op when tensors are already there.
                inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v
                          for k, v in inputs.items()}
                output = _model_voice(**inputs)
                waveforms.append(output.waveform.squeeze().cpu())

        # Stitch the per-sentence audio into one continuous track.
        audio = torch.cat(waveforms).numpy()

        # 22050 Hz is the output rate of FastSpeech2Conformer + HiFi-GAN.
        sample_rate = 22050
        sf.write(output_filename, audio, samplerate=sample_rate)

        # Report the duration so the caller gets immediate feedback.
        duration = len(audio) / sample_rate

        return f"Story narration completed! Saved to: {output_filename} (Duration: {duration:.2f} seconds)"

    except Exception as e:  # broad by design: a tool must not crash the agent
        return f"Error generating story narration: {str(e)}"

if __name__ == "__main__":
    # --- Smoke tests for both narration tools ---
    samples = (
        "The warrior stepped into the shadowy forest, his sword gleaming under the moonlight.....there appeared a monster and he said WHAT ARE YOU DOING I WILL KILL YOU ",
        "Captain Sarah Martinez floated weightlessly in the observation deck of the starship Enterprise.",
        "The old lighthouse keeper climbed the spiral stairs one last time, as the storm raged outside.",
    )

    print("Testing voice narration tool...")

    # Narrate each sample individually.
    for idx, sample in enumerate(samples, 1):
        print(f"\nTest {idx}: Generating narration for: '{sample[:50]}...'")
        outcome = generate_voice_narration(text=sample, output_filename=f"test_narration_{idx}.wav")
        print(f"Result: {outcome}")

    # Then narrate all samples joined into a single "story".
    combined = " ".join(samples)
    print(f"\nTesting story narration with combined text...")
    combined_result = generate_story_narration(story_text=combined, chapter_name="test_story")
    print(f"Story Result: {combined_result}")