File size: 3,273 Bytes
a1e47c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import numpy as np

from mcp.server import FastMCP
from pydantic import Field

from aworld.utils import import_package
from aworld.logs.util import logger

# Import required packages
# Each import_package call receives the importable module name and the pip
# distribution name. NOTE(review): presumably it installs the package when it
# is missing - confirm against aworld.utils.import_package.
import_package('gtts', install_name='gTTS')
import_package('pyttsx3', install_name='pyttsx3')
import_package('librosa', install_name='librosa')
import_package('soundfile', install_name='soundfile')
# These imports intentionally come after the import_package calls above.
import pyttsx3
from gtts import gTTS
import librosa
import soundfile as sf

# FastMCP server instance named "text_to_audio"; tools are registered on it
# with the @mcp.tool() decorator.
mcp = FastMCP("text_to_audio")

@mcp.tool()
def convert_text_to_audio(
    text: str = Field(description="Text to convert to audio"),
    output_file: str = Field(description="Path to the generated audio file")
) -> str:
    """Convert input text to audio with child-friendly settings.

    The text is first padded with pause markers by ``_preprocess_text``, then
    synthesized with gTTS in slow mode and saved to ``output_file``. Finally
    the file is handed to ``_post_process_audio`` for best-effort clean-up.

    Args:
        text: Input text to convert
        output_file: Path to the generated audio file
    Returns:
        str: Path to the generated audio file (the ``output_file`` argument)
    Raises:
        Exception: Any error raised while preprocessing, synthesizing or
            saving is logged and re-raised unchanged.
    """
    # No pyttsx3 engine is created here: synthesis is done entirely by gTTS,
    # and initializing an unused engine only added a failure mode on hosts
    # without a local speech driver (and an engine that was never stopped).
    try:
        # Preprocess text for child-friendly output
        spoken_text = _preprocess_text(text)

        # slow=True asks gTTS for its slower reading speed.
        # NOTE(review): confirm that the installed gTTS version accepts the
        # "en-US" tag; its documented language codes are lowercase (e.g. "en").
        tts = gTTS(text=spoken_text, lang="en-US", slow=True)
        tts.save(output_file)

        # Post-process audio if needed (adjust volume, remove noise, etc.)
        _post_process_audio(output_file)
        return output_file

    except Exception as e:
        logger.error("Error in text-to-audio conversion: %s", str(e))
        raise

def _preprocess_text(text: str) -> str:
    """Insert pause markers into text before it is sent to the speech engine.

    Two literal substitutions are applied, in this order:

    - every ``'. '`` (period followed by a space) becomes ``'... '``
    - every ``'!'`` becomes ``'! ... '``

    Args:
        text: Raw input text.
    Returns:
        str: The text with the pause markers inserted.
    """
    # Order matters: the second rule introduces new '. ' sequences that must
    # not be expanded again by the first rule.
    pause_rules = (
        ('. ', '... '),
        ('!', '! ... '),
    )
    padded = text
    for marker, replacement in pause_rules:
        padded = padded.replace(marker, replacement)
    return padded

def _post_process_audio(audio_file: str) -> None:
    """Downsample and peak-normalize an audio file in place (best effort).

    The file is decoded as 16 kHz mono, scaled so that its loudest sample has
    magnitude 1.0, and written back over the original path as MP3. Failures
    are logged as warnings and never propagated for the exception types
    listed in the ``except`` clause, so the caller keeps the unprocessed file.

    Args:
        audio_file: Path of the audio file to rewrite.
    Returns:
        None
    """
    import os  # local import: the module's top-level imports do not include os

    try:
        # Load with a lower sample rate and mono channel
        samples, sample_rate = librosa.load(audio_file, sr=16000, mono=True)

        # Guard the normalization: an empty clip makes np.max raise and a
        # silent clip (peak 0) would turn every sample into NaN.
        peak = float(np.max(np.abs(samples))) if samples.size else 0.0
        if not np.isfinite(peak) or peak <= 0.0:
            logger.warning(
                "Audio post-processing skipped: no signal in %s", audio_file
            )
            return
        normalized = samples / peak

        # libsndfile has no 'mp4' format, so the previous format='mp4' /
        # subtype='MP4' request always failed. The file produced upstream by
        # gTTS is MP3, so re-encode as MP3 when the installed libsndfile
        # supports it.
        if 'MP3' not in sf.available_formats():
            raise RuntimeError("installed libsndfile cannot write MP3 files")

        # Write to a sibling temp file and swap it in, so a failed encode
        # cannot destroy the only copy of the audio.
        temp_file = audio_file + ".tmp"
        try:
            sf.write(temp_file, normalized, sample_rate, format='MP3')
            os.replace(temp_file, audio_file)
        finally:
            if os.path.exists(temp_file):
                os.remove(temp_file)
    except (IOError, ValueError, RuntimeError) as e:
        logger.warning("Audio post-processing failed: %s", e)
# Main function
# When executed as a script, configure the MCP server for port 8888 and run it
# with the SSE transport.
if __name__ == "__main__":
    mcp.settings.port = 8888
    mcp.run(transport='sse')

    # Manual smoke test, kept commented out: calls the tool function directly
    # instead of going through the server.
    # text = "Hello, this is a test of the text-to-audio conversion."
    # output_file = "output1.mp4"
    # print(f"Converting text to audio: {text}")
    # audio_file = convert_text_to_audio(text, output_file)
    # print(f"Audio file saved to: {audio_file}")