Spaces:

karthikmn
/

smart-notes

Runtime error

App Files Files Community

smart-notes / app.py

karthikmn

Update app.py

1a76809 verified 3 months ago

raw

history blame contribute delete

2.72 kB

	import gradio as gr
	from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
	import torch
	import torchaudio

	# Load the processor and the CTC model from the same pre-trained
	# Wav2Vec 2.0 checkpoint on Hugging Face; naming the checkpoint once keeps
	# the two from drifting apart.
	_CHECKPOINT = "facebook/wav2vec2-large-960h"
	processor = Wav2Vec2Processor.from_pretrained(_CHECKPOINT)
	model = Wav2Vec2ForCTC.from_pretrained(_CHECKPOINT)

	# Function to convert speech to text
	def speech_to_text(audio_file):
	    """Transcribe an audio file to text with the module-level Wav2Vec 2.0 model.

	    Args:
	        audio_file: Path to an audio file readable by ``torchaudio.load``.
	            ``None`` or an empty path means no audio was provided.

	    Returns:
	        The greedy (argmax) CTC transcription as a string, or ``""`` when
	        no audio file was provided.
	    """
	    # Nothing uploaded: return an empty transcription instead of crashing
	    # inside torchaudio.load.
	    if not audio_file:
	        return ""

	    # torchaudio returns a (channels, frames) tensor plus the file's sample rate.
	    waveform, sample_rate = torchaudio.load(audio_file)

	    # The processor expects one 1-D signal per utterance, so down-mix
	    # multi-channel audio to mono by averaging the channels.
	    if waveform.dim() > 1:
	        waveform = waveform.mean(dim=0)

	    # Resample to the rate the feature extractor is configured for; feeding
	    # audio at a different rate produces garbage transcriptions.
	    target_rate = processor.feature_extractor.sampling_rate
	    if sample_rate != target_rate:
	        waveform = torchaudio.functional.resample(waveform, sample_rate, target_rate)

	    # Pass the sampling rate explicitly so the processor can validate it.
	    input_values = processor(
	        waveform.numpy(), sampling_rate=target_rate, return_tensors="pt"
	    ).input_values

	    # Inference only: no gradients needed.
	    with torch.no_grad():
	        logits = model(input_values).logits

	    # Greedy CTC decoding: most likely token per frame, then collapse to text.
	    predicted_ids = torch.argmax(logits, dim=-1)
	    return processor.decode(predicted_ids[0])

	# Set up the Gradio interface
	iface = gr.Interface(
	    fn=speech_to_text,  # Callback run for each submitted audio file
	    inputs=gr.Audio(type="filepath"),  # Passes the callback a path to the audio file
	    outputs=gr.Textbox(),  # Displays the transcription
	    title="Speech-to-Text Analyzer for Lecture Notes",
	    description="Upload an audio file (e.g., lecture recording) to get the transcription of the speech.",
	)

	# Launch the interface
	iface.launch()