"""Gradio benchmark app: compare ASR models across seven Indian languages.

Loads each selected model from the Hugging Face Hub, transcribes an uploaded
audio file, and reports WER, CER, and real-time factor (RTF) per model.
"""

import time

import gradio as gr
import librosa
import torch
from jiwer import cer, wer
from transformers import (
    AutoModel,
    AutoModelForCTC,
    AutoProcessor,
    WhisperForConditionalGeneration,
    WhisperProcessor,
)
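

# UI language label -> ISO 639-1 code, writing script, and the models known
# to support that language (used to filter the model checkboxes).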
LANGUAGE_CONFIGS = {
    "Hindi (हिंदी)": {
        "code": "hi",
        "script": "Devanagari",
        "models": ["AudioX-North", "IndicConformer", "MMS"],
    },
    "Gujarati (ગુજરાતી)": {
        "code": "gu",
        "script": "Gujarati",
        "models": ["AudioX-North", "IndicConformer", "MMS"],
    },
    "Marathi (मराठी)": {
        "code": "mr",
        "script": "Devanagari",
        "models": ["AudioX-North", "IndicConformer", "MMS"],
    },
    "Tamil (தமிழ்)": {
        "code": "ta",
        "script": "Tamil",
        "models": ["AudioX-South", "IndicConformer", "MMS"],
    },
    "Telugu (తెలుగు)": {
        "code": "te",
        "script": "Telugu",
        "models": ["AudioX-South", "IndicConformer", "MMS"],
    },
    "Kannada (ಕನ್ನಡ)": {
        "code": "kn",
        "script": "Kannada",
        "models": ["AudioX-South", "IndicConformer", "MMS"],
    },
    "Malayalam (മലയാളം)": {
        "code": "ml",
        "script": "Malayalam",
        "models": ["AudioX-South", "IndicConformer", "MMS"],
    },
}
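

# Model registry: Hugging Face repo, loading strategy ("model_type" selects
# the inference code path below), and the language codes each checkpoint covers.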
MODEL_CONFIGS = {
    "AudioX-North": {
        "repo": "jiviai/audioX-north-v1",
        "model_type": "whisper",
        "description": "Supports Hindi, Gujarati, Marathi",
        "languages": ["hi", "gu", "mr"],
    },
    "AudioX-South": {
        "repo": "jiviai/audioX-south-v1",
        "model_type": "whisper",
        "description": "Supports Tamil, Telugu, Kannada, Malayalam",
        "languages": ["ta", "te", "kn", "ml"],
    },
    "IndicConformer": {
        "repo": "ai4bharat/indic-conformer-600m-multilingual",
        "model_type": "ctc_rnnt",
        "description": "Supports 22 Indian languages",
        "trust_remote_code": True,
        "languages": ["hi", "gu", "mr", "ta", "te", "kn", "ml", "bn", "pa", "or", "as", "ur"],
    },
    "MMS": {
        "repo": "facebook/mms-1b-all",
        "model_type": "ctc",
        "description": "Supports 1,400+ languages",
        "languages": ["hi", "gu", "mr", "ta", "te", "kn", "ml"],
    },
}
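

# Returns (model, processor, model_type) on success, or
# (None, None, "Error loading model: ...") on failure; callers detect failure
# by checking whether model_type starts with "Error".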
def load_model_and_processor(model_name):
    config = MODEL_CONFIGS[model_name]
    repo = config["repo"]
    model_type = config["model_type"]
    trust_remote_code = config.get("trust_remote_code", False)

    try:
        if model_name == "IndicConformer":
            print(f"Loading {model_name}...")
            try:
                model = AutoModel.from_pretrained(
                    repo,
                    trust_remote_code=trust_remote_code,
                    torch_dtype=torch.float32,
                    low_cpu_mem_usage=True,
                )
            except Exception as e1:
                print(f"Primary loading failed, trying fallback: {e1}")
                model = AutoModel.from_pretrained(repo, trust_remote_code=trust_remote_code)
            # The remote-code model handles its own preprocessing, so there is
            # no separate processor.
            processor = None
            return model, processor, model_type

        elif model_name in ["AudioX-North", "AudioX-South"]:
            processor = WhisperProcessor.from_pretrained(repo)
            model = WhisperForConditionalGeneration.from_pretrained(repo)
            # Clear forced decoder ids so language/task can be passed to generate().
            model.config.forced_decoder_ids = None
            return model, processor, model_type

        elif model_name == "MMS":
            model = AutoModelForCTC.from_pretrained(repo)
            processor = AutoProcessor.from_pretrained(repo)
            return model, processor, model_type

        # Defensive fallback so callers never have to unpack a bare None.
        return None, None, f"Error loading model: unknown model '{model_name}'"

    except Exception as e:
        return None, None, f"Error loading model: {str(e)}"
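

# WER/CER are computed with jiwer after whitespace/case normalization; RTF is
# total processing time divided by audio duration (RTF < 1.0 means faster than
# real time).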
def compute_metrics(reference, hypothesis, audio_duration, total_time):
    if not reference or not hypothesis:
        return None, None, None, None
    try:
        reference = reference.strip().lower()
        hypothesis = hypothesis.strip().lower()
        wer_score = wer(reference, hypothesis)
        cer_score = cer(reference, hypothesis)
        rtf = total_time / audio_duration if audio_duration > 0 else None
        return wer_score, cer_score, rtf, total_time
    except Exception:
        return None, None, None, None
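

# Main pipeline: validate inputs, decode the audio once at 16 kHz, then run
# every selected model and collect one results row per model.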
def transcribe_audio(audio_file, selected_language, selected_models, reference_text=""):
    if not audio_file:
        return "Please upload an audio file.", [], ""

    if not selected_models:
        return "Please select at least one model.", [], ""

    if not selected_language:
        return "Please select a language.", [], ""

    lang_info = LANGUAGE_CONFIGS[selected_language]
    lang_code = lang_info["code"]

    table_data = []
    try:
        audio, sr = librosa.load(audio_file, sr=16000)
        audio_duration = len(audio) / sr
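
        # One pass per model: skip unsupported language/model pairs, load the
        # model, and time inference only (model loading is excluded from RTF).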
        for model_name in selected_models:
            if model_name not in lang_info["models"]:
                table_data.append([
                    model_name,
                    f"Language {selected_language} not supported by this model",
                    "-", "-", "-", "-",
                ])
                continue

            model, processor, model_type = load_model_and_processor(model_name)
            if isinstance(model_type, str) and model_type.startswith("Error"):
                table_data.append([
                    model_name,
                    model_type,
                    "-", "-", "-", "-",
                ])
                continue

            start_time = time.time()
            try:
                if model_name == "IndicConformer":
                    # Peak-normalize and add a batch dimension; the remote-code
                    # model is called as model(waveform, language_code, decoder).
                    wav = torch.from_numpy(audio).unsqueeze(0)
                    if torch.max(torch.abs(wav)) > 0:
                        wav = wav / torch.max(torch.abs(wav))

                    with torch.no_grad():
                        transcription = model(wav, lang_code, "rnnt")
                    if isinstance(transcription, list):
                        transcription = transcription[0] if transcription else ""
                    transcription = str(transcription).strip()

                elif model_name in ["AudioX-North", "AudioX-South"]:
                    # Audio is already 16 kHz (librosa resampled on load), so no
                    # further resampling is needed here.
                    input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features

                    with torch.no_grad():
                        predicted_ids = model.generate(
                            input_features,
                            task="transcribe",
                            language=lang_code,
                        )
                    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

                else:
                    # MMS selects its output language via per-language adapters;
                    # without switching, it decodes with the default (English)
                    # vocabulary. It expects ISO 639-3 codes, so map from the
                    # ISO 639-1 codes used elsewhere in this app.
                    iso3_codes = {"hi": "hin", "gu": "guj", "mr": "mar", "ta": "tam",
                                  "te": "tel", "kn": "kan", "ml": "mal"}
                    iso3 = iso3_codes.get(lang_code)
                    if iso3:
                        processor.tokenizer.set_target_lang(iso3)
                        model.load_adapter(iso3)

                    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")

                    with torch.no_grad():
                        input_values = inputs["input_values"]
                        logits = model(input_values).logits
                    predicted_ids = torch.argmax(logits, dim=-1)
                    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

            except Exception as e:
                transcription = f"Processing error: {str(e)}"

            total_time = time.time() - start_time
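
            # Per-row metrics: WER/CER need a reference transcript; RTF only
            # needs the measured inference time.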
            wer_score, cer_score = "-", "-"
            # RTF does not depend on a reference transcript, so report it
            # whenever the audio duration is known.
            rtf = f"{total_time / audio_duration:.3f}" if audio_duration > 0 else "-"
            if reference_text and transcription and not transcription.startswith("Processing error"):
                wer_val, cer_val, _, _ = compute_metrics(
                    reference_text, transcription, audio_duration, total_time
                )
                wer_score = f"{wer_val:.3f}" if wer_val is not None else "-"
                cer_score = f"{cer_val:.3f}" if cer_val is not None else "-"

            table_data.append([
                model_name,
                transcription,
                wer_score,
                cer_score,
                rtf,
                f"{total_time:.2f}s",
            ])
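
        # Build the on-screen summary plus a plain-text export of the same results.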
        summary = f"**Language:** {selected_language} ({lang_code})\n"
        summary += f"**Audio Duration:** {audio_duration:.2f}s\n"
        summary += f"**Models Tested:** {len(selected_models)}\n"
        if reference_text:
            summary += f"**Reference Text:** {reference_text[:100]}{'...' if len(reference_text) > 100 else ''}\n"

        copyable_text = "MULTILINGUAL SPEECH-TO-TEXT BENCHMARK RESULTS\n" + "=" * 55 + "\n\n"
        copyable_text += f"Language: {selected_language} ({lang_code})\n"
        copyable_text += f"Script: {lang_info['script']}\n"
        copyable_text += f"Audio Duration: {audio_duration:.2f}s\n"
        copyable_text += f"Models Tested: {len(selected_models)}\n"
        if reference_text:
            copyable_text += f"Reference Text: {reference_text}\n"
        copyable_text += "\n" + "-" * 55 + "\n\n"

        for i, row in enumerate(table_data):
            copyable_text += f"MODEL {i + 1}: {row[0]}\n"
            copyable_text += f"Transcription: {row[1]}\n"
            copyable_text += f"WER: {row[2]}\n"
            copyable_text += f"CER: {row[3]}\n"
            copyable_text += f"RTF: {row[4]}\n"
            copyable_text += f"Time Taken: {row[5]}\n"
            copyable_text += "\n" + "-" * 35 + "\n\n"

        return summary, table_data, copyable_text

    except Exception as e:
        error_msg = f"Error during transcription: {str(e)}"
        return error_msg, [], error_msg
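

# Builds the Gradio Blocks UI: inputs on the left, results on the right, and a
# language-change callback that re-filters the model checkboxes.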
def create_interface():
    language_choices = list(LANGUAGE_CONFIGS.keys())

    with gr.Blocks(title="Multilingual Speech-to-Text Benchmark", css="""
        .language-info { background: #f0f8ff; padding: 10px; border-radius: 5px; margin: 10px 0; }
        .copy-area { font-family: monospace; font-size: 12px; }
    """) as iface:
        gr.Markdown("""
        # 🌐 Multilingual Speech-to-Text Benchmark

        Compare ASR models across **7 Indian Languages** with comprehensive metrics.

        **Supported Languages:** Hindi, Gujarati, Marathi, Tamil, Telugu, Kannada, Malayalam
        """)
        with gr.Row():
            with gr.Column(scale=1):
                language_selection = gr.Dropdown(
                    choices=language_choices,
                    label="🗣️ Select Language",
                    value=language_choices[0],
                    interactive=True,
                )

                audio_input = gr.Audio(
                    label="📹 Upload Audio File (16kHz recommended)",
                    type="filepath",
                )

                model_selection = gr.CheckboxGroup(
                    choices=["AudioX-North", "IndicConformer", "MMS"],
                    label="🤖 Select Models",
                    value=["AudioX-North", "IndicConformer"],
                    interactive=True,
                )

                reference_input = gr.Textbox(
                    label="📄 Reference Text (optional, paste supported)",
                    placeholder="Paste reference transcription here...",
                    lines=4,
                    interactive=True,
                )

                submit_btn = gr.Button("🚀 Run Multilingual Benchmark", variant="primary", size="lg")
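
            # Right column: run summary, per-model comparison table, and a
            # copy-friendly text export.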
            with gr.Column(scale=2):
                summary_output = gr.Markdown(
                    label="📊 Summary",
                    value="Select language, upload audio file and choose models to begin...",
                )

                results_table = gr.Dataframe(
                    headers=["Model", "Transcription", "WER", "CER", "RTF", "Time"],
                    datatype=["str", "str", "str", "str", "str", "str"],
                    label="🏆 Results Comparison",
                    interactive=False,
                    wrap=True,
                    column_widths=[120, 350, 60, 60, 60, 80],
                )

                with gr.Group():
                    gr.Markdown("### 📋 Export Results")
                    copyable_output = gr.Textbox(
                        label="Copy-Paste Friendly Results",
                        lines=12,
                        max_lines=25,
                        show_copy_button=True,
                        interactive=False,
                        elem_classes="copy-area",
                        placeholder="Benchmark results will appear here...",
                    )
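
        # When the language changes, offer only the models that support it and
        # preselect the first two.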
        def update_model_choices(selected_language):
            if not selected_language:
                return gr.CheckboxGroup(choices=[], value=[])

            # The registry stores model names directly, so no extra mapping is
            # needed; slicing handles languages with fewer than two models.
            available_choices = LANGUAGE_CONFIGS[selected_language]["models"]
            default_selection = available_choices[:2]

            return gr.CheckboxGroup(choices=available_choices, value=default_selection)
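
        # Event wiring: language changes re-filter the models; the button (or
        # pressing Enter in the reference box) runs the benchmark.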
        language_selection.change(
            fn=update_model_choices,
            inputs=[language_selection],
            outputs=[model_selection],
        )

        submit_btn.click(
            fn=transcribe_audio,
            inputs=[audio_input, language_selection, model_selection, reference_input],
            outputs=[summary_output, results_table, copyable_output],
        )

        reference_input.submit(
            fn=transcribe_audio,
            inputs=[audio_input, language_selection, model_selection, reference_input],
            outputs=[summary_output, results_table, copyable_output],
        )

        gr.Markdown("""
        ---
        ### 📤 Language & Model Support Matrix

        | Language | Script | AudioX-North | AudioX-South | IndicConformer | MMS |
        |----------|--------|--------------|--------------|----------------|-----|
        | Hindi | Devanagari | ✅ | ❌ | ✅ | ✅ |
        | Gujarati | Gujarati | ✅ | ❌ | ✅ | ✅ |
        | Marathi | Devanagari | ✅ | ❌ | ✅ | ✅ |
        | Tamil | Tamil | ❌ | ✅ | ✅ | ✅ |
        | Telugu | Telugu | ❌ | ✅ | ✅ | ✅ |
        | Kannada | Kannada | ❌ | ✅ | ✅ | ✅ |
        | Malayalam | Malayalam | ❌ | ✅ | ✅ | ✅ |

        ### 💡 Tips
        - **Models auto-filter** based on the selected language
        - **Reference Text**: provide a ground-truth transcript to enable WER/CER
        - **Copy Results**: export formatted results using the copy button
        - **Best Performance**: use the AudioX models for their specialized languages
        """)

    return iface
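

# Entry point: binds to all interfaces on port 7860 (the Gradio/Spaces
# convention); set share=True for a temporary public link.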
if __name__ == "__main__":
    iface = create_interface()
    iface.launch(
        share=False,
        debug=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )