Spaces:

m3nnoun
/

new-vision

Sleeping

App Files Files Community

new-vision / app.py

m3nnoun

Update app.py

d5920d2 verified 4 months ago

raw

history blame contribute delete

7.02 kB

	import gradio as gr
	import torch
	from transformers import AutoTokenizer, AutoModel
	import numpy as np
	import soundfile as sf
	import io
	import tempfile
	import os

	# Identifier handed to `from_pretrained` for both the tokenizer and the model.
	# NOTE(review): presumably a Hugging Face Hub repo id; the name suggests a LoRA
	# adapter — confirm it resolves to weights that AutoModel can load directly.
	MODEL_NAME = "m3nnoun/lora_model_semantic"

	def load_model():
	    """Load the tokenizer and model identified by ``MODEL_NAME``.

	    The model is switched to evaluation mode before it is returned.

	    Returns:
	        tuple: ``(tokenizer, model)`` on success, or ``(None, None)`` when
	        anything raised during loading (the error is printed, not re-raised).
	    """
	    try:
	        # Swap these Auto* classes for the ones matching the real architecture.
	        loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
	        loaded_model = AutoModel.from_pretrained(MODEL_NAME)
	        loaded_model.eval()
	    except Exception as e:
	        print(f"Error loading model: {e}")
	        return None, None
	    return loaded_tokenizer, loaded_model

	# Load the tokenizer and model once, at import time. Both names are None when
	# loading failed; text_to_speech checks for that before using them.
	tokenizer, model = load_model()

	def _extract_audio(outputs):
	    """Return the audio carried by a model output as an at-least-1-D NumPy array."""
	    # Probe the known attribute names in priority order, then fall back to
	    # the hidden states (placeholder: adjust to the model's real output).
	    if hasattr(outputs, 'audio'):
	        audio = outputs.audio
	    elif hasattr(outputs, 'waveform'):
	        audio = outputs.waveform
	    else:
	        audio = outputs.last_hidden_state
	    if torch.is_tensor(audio):
	        audio = audio.squeeze().cpu().numpy()
	    # squeeze() can leave a 0-d array, which len() would reject later on.
	    return np.atleast_1d(np.asarray(audio))


	def _change_speed(audio, speed):
	    """Resample 1-D ``audio`` by linear interpolation, stepping ``speed`` samples at a time."""
	    if speed <= 0:
	        raise ValueError(f"voice_speed must be positive, got {speed}")
	    # Nothing to do at unit speed; np.interp also refuses empty sample points.
	    if speed == 1.0 or len(audio) == 0:
	        return audio
	    indices = np.arange(0, len(audio), speed)
	    return np.interp(indices, np.arange(len(audio)), audio)


	def _normalize_peak(audio):
	    """Scale ``audio`` so its largest absolute sample is 1; silent audio is returned as is."""
	    if audio.size == 0:
	        return audio
	    peak = np.max(np.abs(audio))
	    # Dividing by a zero (or non-finite) peak would fill the output with NaNs.
	    if peak > 0 and np.isfinite(peak):
	        return audio / peak
	    return audio


	def text_to_speech(text, voice_speed=1.0, voice_pitch=1.0):
	    """
	    Convert text to speech using the module-level ``tokenizer`` and ``model``.

	    Args:
	        text (str): Input text to convert to speech. ``None`` or blank text
	            yields ``None``.
	        voice_speed (float): Playback speed factor; values other than 1.0
	            resample the audio by linear interpolation. Must be positive.
	        voice_pitch (float): Accepted for interface compatibility but
	            currently not applied to the audio.

	    Returns:
	        tuple | None: ``(sample_rate, audio_array)`` for Gradio audio output,
	        where ``audio_array`` is a peak-normalized float32 NumPy array, or
	        ``None`` when the input is empty, the model is not loaded, or any
	        step of generation fails (the error is printed).
	    """
	    # Guard against None as well as blank strings before touching the model.
	    if not text or not text.strip():
	        return None

	    if tokenizer is None or model is None:
	        return None

	    try:
	        # Tokenize input text
	        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

	        # Placeholder forward pass: different TTS models need different calls.
	        with torch.no_grad():
	            outputs = model(**inputs)

	        audio = _extract_audio(outputs)
	        audio = _change_speed(audio, voice_speed)

	        # Cast after resampling because np.interp returns float64.
	        audio = np.asarray(audio, dtype=np.float32)
	        audio = _normalize_peak(audio)

	        # Return sample rate and audio array
	        sample_rate = 22050  # Adjust based on your model's sample rate
	        return sample_rate, audio

	    except Exception as e:
	        print(f"Error in text_to_speech: {e}")
	        return None

	def create_interface():
	    """Build the Gradio Blocks UI for the text-to-speech demo and return it.

	    Layout: a wide column with the text box, speed/pitch sliders and the
	    generate button, next to a narrow column with the audio player and a
	    read-only status box, followed by clickable example texts. Both the
	    button click and pressing Enter in the text box trigger generation.
	    """
	    # Speed and pitch sliders share the same range, default and step.
	    slider_range = dict(minimum=0.5, maximum=2.0, value=1.0, step=0.1)

	    sample_texts = [
	        ["Hello! Welcome to our text-to-speech service."],
	        ["The quick brown fox jumps over the lazy dog."],
	        ["Artificial intelligence is revolutionizing how we interact with technology."],
	        ["Thank you for using our TTS model. We hope you enjoy the generated speech!"],
	    ]

	    with gr.Blocks(title="TTS Model - Text to Speech", theme=gr.themes.Soft()) as demo:
	        gr.Markdown(
	            """
	            # 🎙️ Text-to-Speech Generator
	            Enter your text below and generate high-quality speech using our fine-tuned TTS model.
	            """
	        )

	        with gr.Row():
	            # Left: everything the user edits.
	            with gr.Column(scale=2):
	                text_input = gr.Textbox(
	                    label="Enter Text",
	                    placeholder="Type the text you want to convert to speech...",
	                    lines=4,
	                    max_lines=10,
	                )

	                with gr.Row():
	                    speed_slider = gr.Slider(label="Speech Speed", **slider_range)
	                    pitch_slider = gr.Slider(label="Speech Pitch", **slider_range)

	                generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

	            # Right: results only, nothing editable.
	            with gr.Column(scale=1):
	                audio_output = gr.Audio(
	                    label="Generated Speech",
	                    type="numpy",
	                    interactive=False,
	                )

	                status_text = gr.Textbox(
	                    label="Status",
	                    value="Ready to generate speech",
	                    interactive=False,
	                    lines=2,
	                )

	        gr.Markdown("### 📝 Example Texts")
	        gr.Examples(
	            examples=sample_texts,
	            inputs=[text_input],
	            label="Click on an example to try it",
	        )

	        def generate_and_update_status(text, speed, pitch):
	            """Run synthesis and return ``(audio_value, status_message)`` for the UI."""
	            if not text.strip():
	                return None, "⚠️ Please enter some text to generate speech."

	            try:
	                synthesized = text_to_speech(text, speed, pitch)
	                if synthesized is not None:
	                    sample_rate, audio = synthesized
	                    return (sample_rate, audio), f"✅ Speech generated successfully! Duration: {len(audio)/sample_rate:.2f} seconds"
	                return None, "❌ Error generating speech. Please try again."
	            except Exception as e:
	                return None, f"❌ Error: {str(e)}"

	        # Wire the same handler to the button click and to Enter in the text box.
	        for register in (generate_btn.click, text_input.submit):
	            register(
	                generate_and_update_status,
	                inputs=[text_input, speed_slider, pitch_slider],
	                outputs=[audio_output, status_text],
	            )

	    return demo

	# Create and launch the interface
	if __name__ == "__main__":
	    # Settings for serving the app when this file is run as a script.
	    launch_options = {
	        "server_name": "0.0.0.0",  # Important for Hugging Face Spaces
	        "server_port": 7860,  # Standard port for HF Spaces
	        "share": False,  # Set to True if testing locally
	        "show_error": True,
	    }

	    demo = create_interface()
	    demo.launch(**launch_options)