# jugaadutranslator / streamlit_app.py
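"""Jugaadu Translator: a Streamlit app for collecting regional idioms.

Contributors sign in with a username, then type an idiom or upload a short
voice note (Hindi, Telugu, or Sanskrit). Voice notes are transcribed with
Whisper, the text is translated to English with a Helsinki-NLP MarianMT
model and summarized into a short description, and the result is saved
locally under data/ together with a timestamp and approximate location.
"""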
import streamlit as st
from datetime import datetime
import os
import json
import requests
import uuid
import torch  # not used directly in this script; openai-whisper requires the torch runtime
import whisper
from transformers import pipeline
st.set_page_config(page_title="Jugaadu Translator", layout="centered")
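# Languages accepted for contributions, each mapped to a MarianMT translation model and a Whisper language code.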
SUPPORTED_LANGUAGES = {
    "Hindi": {
        "code": "hi",
        "translation_model": "Helsinki-NLP/opus-mt-hi-en",
        "whisper_language": "hi"
    },
    "Telugu": {
        "code": "te",
        "translation_model": "Helsinki-NLP/opus-mt-te-en",
        "whisper_language": "te"
    },
    "Sanskrit": {
        "code": "sa",
        "translation_model": "Helsinki-NLP/opus-mt-sa-en",
        "whisper_language": "sa"
    }
}
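# Uploaded audio files and contribution records are kept locally under data/.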
AUDIO_SAVE_DIR = "data/audio"
RECORDS_PATH = "data/records.json"
os.makedirs(AUDIO_SAVE_DIR, exist_ok=True)
os.makedirs("data", exist_ok=True)
# Use lightweight Whisper model for best HF Spaces compatibility!
@st.cache_resource(show_spinner="Loading Whisper model...")
def get_whisper_model():
return whisper.load_model("tiny")
@st.cache_resource(show_spinner="Loading translation models...")
def get_translator(language):
model_name = SUPPORTED_LANGUAGES[language]["translation_model"]
return pipeline("translation", model=model_name)
@st.cache_resource(show_spinner="Loading summarizer...")
def get_summarizer():
return pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
def get_location():
    try:
        resp = requests.get("https://ipinfo.io/json", timeout=5)
        data = resp.json()
        loc_str = f"{data.get('city', '')}, {data.get('region', '')}, {data.get('country', '')}"
        return loc_str.strip(", ")
    except Exception:
        return "Unknown Location"
def save_record(record):
    if os.path.exists(RECORDS_PATH):
        with open(RECORDS_PATH, "r", encoding="utf-8") as f:
            records = json.load(f)
    else:
        records = []
    records.append(record)
    with open(RECORDS_PATH, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)
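# Render the five most recent contributions, newest first, with audio playback when a file exists.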
def show_records():
    if not os.path.exists(RECORDS_PATH):
        st.info("No contributions yet.")
        return
    with open(RECORDS_PATH, "r", encoding="utf-8") as f:
        records = json.load(f)
    st.subheader("Previous Contributions")
    for rec in reversed(records[-5:]):
        # Markdown needs two trailing spaces before a newline to force a line break.
        st.markdown(f"**User:** {rec['username']}  \n"
                    f"**Time:** {rec['timestamp']}  \n"
                    f"**Location:** {rec['location']}  \n"
                    f"**Title:** {rec['title']}")
        st.markdown(f"**Idiom:** {rec['input_text']}")
        st.markdown(f"**Translation:** {rec['translation']}")
        st.markdown(f"**Description:** {rec['description']}")
        if rec['audio_path'] and os.path.exists(rec['audio_path']):
            with open(rec['audio_path'], 'rb') as f_:
                st.audio(f_.read())
        st.markdown("---")
if "username" not in st.session_state:
st.title("Jugaadu Translator 🧠")
st.markdown("Enter a username to begin contributing to the idioms corpus.")
username = st.text_input("Username (choose a unique handle)", max_chars=30)
if st.button("Continue") and username:
st.session_state["username"] = username.strip()
st.success(f"Welcome, {username.strip()}! Proceed to record or type idioms.")
st.experimental_rerun()
st.stop()
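# Main app: pick a language, capture the idiom, translate it, and save the contribution.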
st.title("Jugaadu Translator 🧠")
st.markdown(f"Hi, **{st.session_state['username']}**!")
col1, col2 = st.columns(2)
with col1:
    language = st.selectbox("Pick Idiom Language", list(SUPPORTED_LANGUAGES.keys()))
with col2:
    input_mode = st.radio("Input Type", ["Type", "Upload Voice"])
input_text = ""
audio_path = None
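# Capture the idiom either as typed text or as an uploaded voice note transcribed by Whisper.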
if input_mode == "Type":
    input_text = st.text_area("Type the idiom/dialect phrase:", height=100)
else:
    st.markdown("Upload a short voice note of your idiom (.wav, .mp3):")
    audio_file = st.file_uploader("Choose audio file", type=['wav', 'mp3'])
    if audio_file:
        uid = str(uuid.uuid4())
        # Preserve file extension
        audio_path = os.path.join(AUDIO_SAVE_DIR, f"{st.session_state['username']}_{uid}.{audio_file.name.split('.')[-1]}")
        with open(audio_path, "wb") as f:
            f.write(audio_file.read())
        st.success("Audio uploaded and saved.")
        asr_model = get_whisper_model()
        result = asr_model.transcribe(audio_path, language=SUPPORTED_LANGUAGES[language]['whisper_language'])
        input_text = result["text"]
        st.markdown("**Transcription:** " + input_text)
if st.button("Translate", disabled=not input_text.strip()):
    with st.spinner("Translating and generating summary..."):
        translator = get_translator(language)
        translation = translator(input_text)[0]['translation_text']
        summarizer = get_summarizer()
        try:
            desc = summarizer(translation, max_length=60, min_length=15, do_sample=False)[0]['summary_text']
        except Exception:
            desc = translation
        title = desc.split(".")[0][:40]
        location = get_location()
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        record = {
            "username": st.session_state['username'],
            "input_text": input_text,
            "translation": translation,
            "audio_path": audio_path if audio_path else "",
            "title": title,
            "description": desc,
            "timestamp": timestamp,
            "location": location
        }
        save_record(record)
    st.success("Submission saved!")
    st.markdown(f"#### Title: {title}")
    st.markdown(f"**Translation:** {translation}")
    st.markdown(f"**Description:** {desc}")
    st.markdown(f"**Location:** {location}")
    if audio_path and os.path.exists(audio_path):
        with open(audio_path, 'rb') as f:
            st.audio(f.read())
    st.balloons()
st.markdown("---")
show_records()
st.markdown("---")
st.markdown("**All data stays local! You can find files inside `data/` for research use. No cloud required.**")