Update app.py
Browse files
app.py
CHANGED
@@ -3,30 +3,19 @@ import requests
|
|
3 |
import random
|
4 |
import json
|
5 |
import os
|
6 |
-
import urllib.parse
|
7 |
-
import tempfile
|
8 |
from together import Together
|
9 |
|
10 |
# --- Environment Variable for Together API Key ---
|
11 |
TOGETHER_API_KEY = os.environ.get('TOGETHER_API_KEY')
|
12 |
|
13 |
# --- Together API Client Configuration ---
|
14 |
-
|
15 |
-
client = None
|
16 |
-
if TOGETHER_API_KEY:
|
17 |
-
client = Together(api_key=TOGETHER_API_KEY)
|
18 |
-
else:
|
19 |
-
print("Warning: TOGETHER_API_KEY not set. Together AI tabs will not function.")
|
20 |
-
|
21 |
|
22 |
# --- Pixabay API Configuration ---
|
23 |
PIXABAY_API_KEY = os.environ.get('PIXABAY_API_KEY')
|
24 |
IMAGE_API_URL = 'https://pixabay.com/api/'
|
25 |
VIDEO_API_URL = 'https://pixabay.com/api/videos/'
|
26 |
PER_PAGE = 5
|
27 |
-
if not PIXABAY_API_KEY:
|
28 |
-
print("Warning: PIXABAY_API_KEY not set. Pixabay Search tab will not function.")
|
29 |
-
|
30 |
|
31 |
def image_to_url(image_path):
|
32 |
"""
|
@@ -224,7 +213,7 @@ def together_image_to_image(image_path: str = None, prompt: str = ""):
|
|
224 |
# --- Gradio Blocks Interface Definition ---
|
225 |
with gr.Blocks(title="Media Generation and Search Explorer") as demo:
|
226 |
gr.Markdown("## Media Generation and Search Explorer")
|
227 |
-
gr.Markdown("Explore royalty-free media from Pixabay and generate/transform images
|
228 |
|
229 |
with gr.Tab("Pixabay Search"):
|
230 |
gr.Markdown("Search for royalty-free images and videos on Pixabay.")
|
@@ -311,10 +300,10 @@ with gr.Blocks(title="Media Generation and Search Explorer") as demo:
|
|
311 |
outputs=together_image_to_image_output,
|
312 |
)
|
313 |
|
314 |
-
with gr.Tab("Together AI - Text to
|
315 |
gr.Markdown("Generate audio from text using Together AI's text-to-speech models.")
|
316 |
gr.Warning("This requires setting the TOGETHER_API_KEY environment variable.")
|
317 |
-
|
318 |
with gr.Row():
|
319 |
tts_input_text = gr.Textbox(label="Enter text to convert to speech", lines=3)
|
320 |
tts_voice_selection = gr.Dropdown(
|
@@ -323,17 +312,17 @@ with gr.Blocks(title="Media Generation and Search Explorer") as demo:
|
|
323 |
value="helpful woman"
|
324 |
)
|
325 |
tts_generate_button = gr.Button("Generate Audio")
|
326 |
-
|
327 |
tts_audio_output = gr.Audio(label="Generated Audio", interactive=False)
|
328 |
-
|
329 |
-
def
|
330 |
"""
|
331 |
Converts text to speech using Together AI's audio API.
|
332 |
-
|
333 |
Args:
|
334 |
text (str): The text to convert to speech
|
335 |
voice (str): The voice to use for speech synthesis. All available voices are: calm lady, meditation lady, storyteller lady, wise lady, teacher lady, wise man, customer support man, tutorial man, helpful woman, customer support lady, asmr lady, pleasant man, professional woman, reading lady, reading man. Default is Helpful Woman.
|
336 |
-
|
337 |
Returns:
|
338 |
str: Path to the generated audio file or error message
|
339 |
"""
|
@@ -341,7 +330,7 @@ with gr.Blocks(title="Media Generation and Search Explorer") as demo:
|
|
341 |
return None, "Together AI client not initialized. Please set the TOGETHER_API_KEY environment variable."
|
342 |
if not text:
|
343 |
return None, "Please enter text to convert to speech."
|
344 |
-
|
345 |
try:
|
346 |
speech_file_path = "speech.mp3"
|
347 |
response = client.audio.speech.create(
|
@@ -352,188 +341,14 @@ with gr.Blocks(title="Media Generation and Search Explorer") as demo:
|
|
352 |
response.stream_to_file(speech_file_path)
|
353 |
return speech_file_path
|
354 |
except Exception as e:
|
355 |
-
return f"Error generating speech: {e}"
|
356 |
-
|
357 |
-
|
358 |
tts_generate_button.click(
|
359 |
-
fn=
|
360 |
inputs=[tts_input_text, tts_voice_selection],
|
361 |
outputs=tts_audio_output
|
362 |
)
|
363 |
|
364 |
-
with gr.Tab("OpenAI - Text to Speech"):
|
365 |
-
gr.Markdown("Generate audio from text using an OpenAI-compatible text-to-speech API.")
|
366 |
-
gr.Warning("This requires setting the OPENAI_TTS_TEMPLATE environment variable.")
|
367 |
-
|
368 |
-
# --- Environment Variable for OpenAI TTS Template ---
|
369 |
-
# Changed to match the new POST endpoint structure
|
370 |
-
OPENAI_TTS_TEMPLATE = "https://www.openai.fm/api/generate"
|
371 |
-
|
372 |
-
if OPENAI_TTS_TEMPLATE == "https://www.openai.fm/api/generate" and not os.getenv("OPENAI_TTS_TEMPLATE"):
|
373 |
-
gr.Warning(f"Using default OPENAI_TTS_TEMPLATE: {OPENAI_TTS_TEMPLATE}. You can override this by setting the OPENAI_TTS_TEMPLATE environment variable.")
|
374 |
-
elif not OPENAI_TTS_TEMPLATE:
|
375 |
-
gr.Warning("Warning: OPENAI_TTS_TEMPLATE not set. OpenAI TTS tab will not function.")
|
376 |
-
|
377 |
-
|
378 |
-
# A list of available voices for the TTS model.
|
379 |
-
OPENAI_VOICES = [
|
380 |
-
"alloy", "echo", "fable", "onyx", "nova", "shimmer",
|
381 |
-
"coral", "verse", "ballad", "ash", "sage", "amuch", "dan"
|
382 |
-
]
|
383 |
-
|
384 |
-
def openai_generate_audio(prompt: str, voice: str, emotion_style: str) -> bytes:
|
385 |
-
"""
|
386 |
-
Generates audio by calling the specified OpenAI-compatible TTS API endpoint using POST.
|
387 |
-
|
388 |
-
This function constructs the API request body, sends the POST request, and handles
|
389 |
-
the response, returning the audio content as bytes.
|
390 |
-
|
391 |
-
Args:
|
392 |
-
prompt: The text to be converted to speech.
|
393 |
-
voice: The voice model to use for generation (e.g., 'alloy', 'coral').
|
394 |
-
emotion_style: A description of the desired emotional tone (e.g., 'happy', 'sad').
|
395 |
-
|
396 |
-
Returns:
|
397 |
-
The raw audio data in bytes.
|
398 |
-
|
399 |
-
Raises:
|
400 |
-
gr.Error: If the API call fails, times out, or returns an unexpected content type.
|
401 |
-
"""
|
402 |
-
if not OPENAI_TTS_TEMPLATE:
|
403 |
-
raise gr.Error("OPENAI_TTS_TEMPLATE is not configured.")
|
404 |
-
|
405 |
-
try:
|
406 |
-
# Construct the POST request body as form data
|
407 |
-
data = {
|
408 |
-
'input': prompt,
|
409 |
-
'voice': voice,
|
410 |
-
'prompt': f"Voice: {emotion_style}", # Format for the 'prompt' field in the POST body
|
411 |
-
'vibe': 'null' # As observed in the payload
|
412 |
-
}
|
413 |
-
|
414 |
-
response = requests.post(OPENAI_TTS_TEMPLATE, data=data, timeout=60)
|
415 |
-
response.raise_for_status()
|
416 |
-
|
417 |
-
content_type = response.headers.get('content-type', '').lower()
|
418 |
-
if 'audio' not in content_type:
|
419 |
-
print(f"Warning: Unexpected content type '{content_type}'. Response: {response.text[:500]}")
|
420 |
-
# Check if the response might contain an error message as JSON
|
421 |
-
try:
|
422 |
-
error_data = response.json()
|
423 |
-
error_message = error_data.get('error', 'Unknown error from API')
|
424 |
-
raise gr.Error(f"The API did not return an audio file: {error_message}")
|
425 |
-
except json.JSONDecodeError:
|
426 |
-
raise gr.Error("The API did not return an audio file. It may be temporarily down or returned a non-audio response.")
|
427 |
-
|
428 |
-
|
429 |
-
return response.content
|
430 |
-
|
431 |
-
except requests.exceptions.RequestException as e:
|
432 |
-
print(f"Error during audio generation: {e}")
|
433 |
-
raise gr.Error("Failed to generate audio. The external API may be busy or down. Please try again later.")
|
434 |
-
except Exception as e:
|
435 |
-
print(f"Unexpected error during audio generation: {e}")
|
436 |
-
raise gr.Error("An unexpected error occurred during audio generation.")
|
437 |
-
|
438 |
-
|
439 |
-
def openai_text_to_speech_app(prompt: str, voice: str, emotion_style: str):
|
440 |
-
"""
|
441 |
-
Main Gradio function for OpenAI TTS to handle the text-to-speech conversion process.
|
442 |
-
|
443 |
-
It validates user inputs, calls the audio generation function, saves the
|
444 |
-
resulting audio to a temporary file, and returns the file path and a
|
445 |
-
status message to the Gradio interface.
|
446 |
-
|
447 |
-
Args:
|
448 |
-
prompt: The text input from the user.
|
449 |
-
voice: The selected voice.
|
450 |
-
emotion_style: The desired emotional style.
|
451 |
-
|
452 |
-
Returns:
|
453 |
-
A tuple containing the filepath to the generated audio and a status string.
|
454 |
-
"""
|
455 |
-
if not prompt:
|
456 |
-
raise gr.Error("Prompt cannot be empty.")
|
457 |
-
if not voice:
|
458 |
-
raise gr.Error("Please select a voice.")
|
459 |
-
# Allow empty emotion style, default handling is in openai_generate_audio if needed
|
460 |
-
if not emotion_style:
|
461 |
-
emotion_style = "neutral"
|
462 |
-
|
463 |
-
|
464 |
-
try:
|
465 |
-
audio_bytes = openai_generate_audio(prompt, voice, emotion_style)
|
466 |
-
|
467 |
-
# Determine audio format from content type if possible, default to mp3
|
468 |
-
content_type = requests.head(OPENAI_TTS_TEMPLATE).headers.get('content-type', '').lower()
|
469 |
-
suffix = ".mp3" # Default
|
470 |
-
if 'wav' in content_type:
|
471 |
-
suffix = ".wav"
|
472 |
-
elif 'ogg' in content_type:
|
473 |
-
suffix = ".ogg"
|
474 |
-
elif 'aac' in content_type:
|
475 |
-
suffix = ".aac"
|
476 |
-
|
477 |
-
|
478 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_audio_file:
|
479 |
-
temp_audio_file.write(audio_bytes)
|
480 |
-
temp_file_path = temp_audio_file.name
|
481 |
-
|
482 |
-
status_message = f"Audio generated successfully with voice '{voice}' and style '{emotion_style}'."
|
483 |
-
return temp_file_path, status_message
|
484 |
-
|
485 |
-
except gr.Error as e:
|
486 |
-
return None, str(e)
|
487 |
-
except Exception as e:
|
488 |
-
print(f"Unexpected error in main function: {e}")
|
489 |
-
return None, f"An unexpected error occurred: {e}"
|
490 |
-
|
491 |
-
|
492 |
-
with gr.Row():
|
493 |
-
with gr.Column(scale=2):
|
494 |
-
openai_prompt_input = gr.Textbox(
|
495 |
-
label="Prompt",
|
496 |
-
placeholder="Enter the text you want to convert to speech..."
|
497 |
-
)
|
498 |
-
openai_emotion_input = gr.Textbox(
|
499 |
-
label="Emotion Style",
|
500 |
-
placeholder="e.g., happy, sad, excited, sarcastic and mocking..."
|
501 |
-
)
|
502 |
-
openai_voice_dropdown = gr.Dropdown(
|
503 |
-
label="Voice",
|
504 |
-
choices=OPENAI_VOICES,
|
505 |
-
value="alloy"
|
506 |
-
)
|
507 |
-
|
508 |
-
openai_submit_button = gr.Button("Generate Audio", variant="primary")
|
509 |
-
|
510 |
-
with gr.Row():
|
511 |
-
openai_audio_output = gr.Audio(label="Generated Audio", type="filepath")
|
512 |
-
openai_status_output = gr.Textbox(label="Status", interactive=False)
|
513 |
-
|
514 |
-
openai_submit_button.click(
|
515 |
-
fn=openai_text_to_speech_app,
|
516 |
-
inputs=[
|
517 |
-
openai_prompt_input,
|
518 |
-
openai_voice_dropdown,
|
519 |
-
openai_emotion_input,
|
520 |
-
],
|
521 |
-
outputs=[openai_audio_output, openai_status_output],
|
522 |
-
)
|
523 |
-
|
524 |
-
gr.Examples(
|
525 |
-
examples=[
|
526 |
-
["Hello there! This is a test of the text-to-speech system.", "alloy", "neutral"],
|
527 |
-
["This technology is absolutely amazing! I can't believe how real it sounds.", "nova", "excited and joyful"],
|
528 |
-
["Surely *you* wouldn't want *that*. [laughs]", "shimmer", "sarcastic and mocking"],
|
529 |
-
["[sobbing] I am feeling... [sighs] a bit down today. [cry]", "ballad", "sad and depressed, with stammering"],
|
530 |
-
],
|
531 |
-
inputs=[openai_prompt_input, openai_voice_dropdown, openai_emotion_input],
|
532 |
-
outputs=[openai_audio_output, openai_status_output],
|
533 |
-
fn=openai_text_to_speech_app,
|
534 |
-
cache_examples=False,
|
535 |
-
)
|
536 |
-
|
537 |
|
538 |
# --- Launch the Gradio app ---
|
539 |
if __name__ == "__main__":
|
|
|
3 |
import random
|
4 |
import json
|
5 |
import os
|
|
|
|
|
6 |
from together import Together
|
7 |
|
8 |
# --- Environment Variable for Together API Key ---
|
9 |
TOGETHER_API_KEY = os.environ.get('TOGETHER_API_KEY')
|
10 |
|
11 |
# --- Together API Client Configuration ---
|
12 |
+
client = Together(api_key=TOGETHER_API_KEY)
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
# --- Pixabay API Configuration ---
|
15 |
PIXABAY_API_KEY = os.environ.get('PIXABAY_API_KEY')
|
16 |
IMAGE_API_URL = 'https://pixabay.com/api/'
|
17 |
VIDEO_API_URL = 'https://pixabay.com/api/videos/'
|
18 |
PER_PAGE = 5
|
|
|
|
|
|
|
19 |
|
20 |
def image_to_url(image_path):
|
21 |
"""
|
|
|
213 |
# --- Gradio Blocks Interface Definition ---
|
214 |
with gr.Blocks(title="Media Generation and Search Explorer") as demo:
|
215 |
gr.Markdown("## Media Generation and Search Explorer")
|
216 |
+
gr.Markdown("Explore royalty-free media from Pixabay and generate/transform images using Together AI.")
|
217 |
|
218 |
with gr.Tab("Pixabay Search"):
|
219 |
gr.Markdown("Search for royalty-free images and videos on Pixabay.")
|
|
|
300 |
outputs=together_image_to_image_output,
|
301 |
)
|
302 |
|
303 |
+
with gr.Tab("Together AI - Text to Audio"):
|
304 |
gr.Markdown("Generate audio from text using Together AI's text-to-speech models.")
|
305 |
gr.Warning("This requires setting the TOGETHER_API_KEY environment variable.")
|
306 |
+
|
307 |
with gr.Row():
|
308 |
tts_input_text = gr.Textbox(label="Enter text to convert to speech", lines=3)
|
309 |
tts_voice_selection = gr.Dropdown(
|
|
|
312 |
value="helpful woman"
|
313 |
)
|
314 |
tts_generate_button = gr.Button("Generate Audio")
|
315 |
+
|
316 |
tts_audio_output = gr.Audio(label="Generated Audio", interactive=False)
|
317 |
+
|
318 |
+
def text_to_speech(text: str = "", voice: str = ""):
|
319 |
"""
|
320 |
Converts text to speech using Together AI's audio API.
|
321 |
+
|
322 |
Args:
|
323 |
text (str): The text to convert to speech
|
324 |
voice (str): The voice to use for speech synthesis. All available voices are: calm lady, meditation lady, storyteller lady, wise lady, teacher lady, wise man, customer support man, tutorial man, helpful woman, customer support lady, asmr lady, pleasant man, professional woman, reading lady, reading man. Default is Helpful Woman.
|
325 |
+
|
326 |
Returns:
|
327 |
str: Path to the generated audio file or error message
|
328 |
"""
|
|
|
330 |
return None, "Together AI client not initialized. Please set the TOGETHER_API_KEY environment variable."
|
331 |
if not text:
|
332 |
return None, "Please enter text to convert to speech."
|
333 |
+
|
334 |
try:
|
335 |
speech_file_path = "speech.mp3"
|
336 |
response = client.audio.speech.create(
|
|
|
341 |
response.stream_to_file(speech_file_path)
|
342 |
return speech_file_path
|
343 |
except Exception as e:
|
344 |
+
return None, f"Error generating speech: {e}"
|
345 |
+
|
|
|
346 |
tts_generate_button.click(
|
347 |
+
fn=text_to_speech,
|
348 |
inputs=[tts_input_text, tts_voice_selection],
|
349 |
outputs=tts_audio_output
|
350 |
)
|
351 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
352 |
|
353 |
# --- Launch the Gradio app ---
|
354 |
if __name__ == "__main__":
|