# espeak.py
import os
import json
import time
import threading
from typing import Tuple

import gradio as gr
import speech_recognition as sr
import pyttsx3

# Use OpenAI when an API key is set; otherwise the app falls back to the local model
USE_OPENAI = bool(os.getenv("OPENAI_API_KEY", "").strip())
if USE_OPENAI:
    import openai

# Local model fallback (T5-based)
try:
    import torch
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
    from happytransformer import HappyTextToText, TTSettings
    LOCAL_MODEL_AVAILABLE = True
except Exception:
    LOCAL_MODEL_AVAILABLE = False

# Optional: python-Levenshtein for faster scoring
try:
    import Levenshtein
    _have_lev = True
except Exception:
    _have_lev = False

APP_TITLE = "ESPeak — AI Grammar & Speech Assistant"

# ----------------------
# Utilities
# ----------------------
def levenshtein_distance(a: str, b: str) -> int:
    if _have_lev:
        return Levenshtein.distance(a, b)
    # Fallback: simple DP, O(len(a) * len(b)); fine for short sentences
    la, lb = len(a), len(b)
    if la == 0:
        return lb
    if lb == 0:
        return la
    dp = [[0] * (lb + 1) for _ in range(la + 1)]
    for i in range(la + 1):
        dp[i][0] = i
    for j in range(lb + 1):
        dp[0][j] = j
    for i in range(1, la + 1):
        for j in range(1, lb + 1):
            cost = 0 if a[i - 1] == b[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1, dp[i][j - 1] + 1, dp[i - 1][j - 1] + cost)
    return dp[la][lb]


def score_from_edit(orig: str, corrected: str) -> int:
    # Simple score: the smaller the edit distance, the higher the score
    if not orig.strip():
        return 0
    dist = levenshtein_distance(orig, corrected)
    # Normalize by input length and clamp to [0, 100]
    norm = max(len(orig), 1)
    ratio = max(0.0, 1.0 - dist / norm)
    return int(round(ratio * 100))
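# Worked example: "I has a apple" -> "I have an apple" has edit distance 3
# over 13 source characters, so the score is round((1 - 3/13) * 100) = 77.
#   score_from_edit("I has a apple", "I have an apple")  # -> 77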
# ----------------------
# Model loading
# ----------------------
tokenizer = model = happy_tt = None

# Load the local model whenever it is available, so the user can opt out of
# the OpenAI backend at runtime even when an API key is set.
if LOCAL_MODEL_AVAILABLE:
    def load_local_models():
        global tokenizer, model, happy_tt
        model_name = "prithivida/grammar_error_correcter_v1"
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
            happy_tt = HappyTextToText("T5", model_name)
        except Exception as e:
            print("Local model load failed:", e)
            raise

    load_local_models()

# ----------------------
# Speech transcription helper (speech_recognition)
# ----------------------
def transcribe_audio_file(audio_filepath: str) -> str:
    r = sr.Recognizer()
    try:
        with sr.AudioFile(audio_filepath) as source:
            audio_data = r.record(source)
        return r.recognize_google(audio_data)
    except sr.UnknownValueError:
        return ""
    except Exception as e:
        return f"[transcription_error]: {str(e)}"

# ----------------------
# LLM connectors
# ----------------------
OPENAI_PROMPT_SYSTEM = (
    "You are ESPeak Assistant — expert grammar corrector. "
    "Return JSON only with keys: corrected_text (string), score (0-100 integer), "
    "explanation (short string)."
)
OPENAI_USER_TEMPLATE = (
    "Correct this sentence for grammar, punctuation, and clarity while preserving tone:\n\n"
    "### INPUT\n{input_text}\n\n"
    "Return only JSON with corrected_text, score, and explanation."
)


def call_openai_correct(text: str) -> Tuple[str, int, str]:
    messages = [
        {"role": "system", "content": OPENAI_PROMPT_SYSTEM},
        {"role": "user", "content": OPENAI_USER_TEMPLATE.format(input_text=text)},
    ]
    # Legacy (pre-1.0) OpenAI SDK interface; the model is pinned because probing
    # openai.Model.list() on every request is slow and unreliable.
    resp = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0.0,
        max_tokens=300,
    )
    content = resp["choices"][0]["message"]["content"].strip()
    # Try to parse JSON from the response
    try:
        parsed = json.loads(content)
        corrected = parsed.get("corrected_text", "")
        score = int(parsed.get("score", score_from_edit(text, corrected)))
        explanation = parsed.get("explanation", "")
        return corrected, score, explanation
    except Exception:
        # Fallback: try to extract a JSON substring from the raw content
        try:
            start = content.index("{")
            end = content.rindex("}") + 1
            data = json.loads(content[start:end])
            corrected = data.get("corrected_text", "")
            score = int(data.get("score", score_from_edit(text, corrected)))
            explanation = data.get("explanation", "")
            return corrected, score, explanation
        except Exception:
            # Last resort: treat the raw content as the correction
            corrected = content
            score = score_from_edit(text, corrected)
            explanation = "Auto-correction from OpenAI; parsing fallback used."
            return corrected, score, explanation


def call_local_correct(text: str) -> Tuple[str, int, str]:
    # Generate a correction with the prithivida T5 model via transformers
    prefix = "gec: " + text
    try:
        inputs = tokenizer.encode(prefix, return_tensors="pt", max_length=256, truncation=True)
        with torch.no_grad():
            outputs = model.generate(inputs, max_length=256, num_beams=4)
        corrected = tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception:
        corrected = text
    # Best effort: also run HappyTransformer and prefer its output if non-empty
    try:
        args = TTSettings(num_beams=4, min_length=1)
        happy_out = happy_tt.generate_text(prefix, args=args).text
        alt_correction = happy_out or corrected
    except Exception:
        alt_correction = corrected
    score = score_from_edit(text, alt_correction)
    # Basic explanation: report whether anything changed
    if text.strip() == alt_correction.strip():
        explanation = "No change needed."
    else:
        explanation = "Adjusted grammar/punctuation; minor wording edits to improve clarity."
    return alt_correction, score, explanation
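# Both connectors share the signature text -> (corrected, score, explanation),
# so the caller can swap backends freely. Illustrative call (actual output
# depends on the model):
#   call_local_correct("he go to school")
#   # -> ("He goes to school.", <score>, "Adjusted grammar/punctuation; ...")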
# ----------------------
# Main processing function
# ----------------------
def process_input(audio, typed_text, use_tts=False, prefer_openai=False):
    """
    audio: filepath from Gradio (or None)
    typed_text: str
    use_tts: read the corrected text aloud with local pyttsx3
    prefer_openai: prefer the OpenAI backend (if a key is available)
    """
    # 1) Transcribe audio if present
    if audio:
        transcribed = transcribe_audio_file(audio)
        if transcribed.startswith("[transcription_error]"):
            source_text = typed_text or ""
            trans_msg = transcribed
        else:
            source_text = transcribed
            trans_msg = f"Transcribed: {transcribed}"
    else:
        source_text = typed_text or ""
        trans_msg = "Typed input"

    if not source_text.strip():
        return "No input detected.", 0, "No correction (empty input).", trans_msg, json.dumps({})

    # 2) Choose backend
    use_openai_backend = False
    if USE_OPENAI and prefer_openai:
        use_openai_backend = True
    elif USE_OPENAI and not LOCAL_MODEL_AVAILABLE:
        use_openai_backend = True
    elif not USE_OPENAI and LOCAL_MODEL_AVAILABLE:
        use_openai_backend = False
    elif USE_OPENAI and LOCAL_MODEL_AVAILABLE:
        # Both backends available; honor the user's opt-out of OpenAI
        use_openai_backend = prefer_openai

    try:
        if use_openai_backend:
            corrected, score, explanation = call_openai_correct(source_text)
        else:
            corrected, score, explanation = call_local_correct(source_text)
    except Exception as e:
        # Fall back to echoing the input if the model call fails
        corrected = source_text
        score = 0
        explanation = f"Model error: {e}"

    # 3) Optionally speak the corrected text (pyttsx3, in a background thread)
    tts_msg = ""
    if use_tts:
        try:
            def speak(text):
                engine = pyttsx3.init()
                engine.say(text)
                engine.runAndWait()

            threading.Thread(target=speak, args=(corrected,), daemon=True).start()
            tts_msg = "Speaking corrected text..."
        except Exception as e:
            tts_msg = f"TTS failed: {e}"

    # 4) Build JSON metadata
    meta = {
        "original": source_text,
        "corrected": corrected,
        "score": score,
        "explanation": explanation,
        "backend": "openai" if use_openai_backend else "local",
        "transcription_note": trans_msg,
        "timestamp": int(time.time()),
    }
    return (
        corrected,
        score,
        explanation,
        trans_msg + (" • " + tts_msg if tts_msg else ""),
        json.dumps(meta, ensure_ascii=False, indent=2),
    )
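# process_input can be exercised without the UI, e.g. as a quick smoke test
# (hypothetical call; returns corrected text, score, explanation, an info
# string, and a JSON metadata blob):
#   out = process_input(None, "she dont like apples", use_tts=False, prefer_openai=False)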
# ----------------------
# Gradio UI
# ----------------------
def build_ui():
    with gr.Blocks(title=APP_TITLE, css="""
        .header {background: linear-gradient(90deg,#ff8fa3,#ff6aa3); padding: 18px; border-radius: 12px; color:white}
        .muted {color: #6b7280}
    """) as demo:
        # Header
        with gr.Row(elem_id="top-row"):
            with gr.Column(scale=3):
                gr.Markdown(f"## {APP_TITLE}")
                gr.Markdown(
                    "Speak or type a sentence — ESPeak will correct grammar, score it, "
                    "and explain changes. The OpenAI backend is used if `OPENAI_API_KEY` "
                    "is set in the environment."
                )
            with gr.Column(scale=1):
                gr.Markdown(
                    "**Quick tips**\n"
                    "- Speak clearly (short sentences work best)\n"
                    "- Toggle TTS to hear the corrected sentence\n"
                    "- Use `Prefer OpenAI` to route to ChatGPT if available"
                )
        gr.Markdown("---")
        with gr.Row():
            with gr.Column(scale=1):
                audio = gr.Audio(sources=["microphone"], type="filepath", label="Record (microphone)")
                typed = gr.Textbox(lines=3, placeholder="Or type your sentence here...", label="Text input")
                with gr.Row():
                    tts_checkbox = gr.Checkbox(label="Play corrected (TTS)", value=False)
                    prefer_openai = gr.Checkbox(label="Prefer OpenAI backend (if available)", value=True)
                run_btn = gr.Button("Check Grammar", variant="primary")
            with gr.Column(scale=2):
                corrected_out = gr.Textbox(label="Corrected Text", interactive=False)
                score_out = gr.Number(label="Grammar Score (0-100)", interactive=False)
                explanation_out = gr.Textbox(label="Explanation (what I changed)", interactive=False)
                trans_note = gr.Textbox(label="Transcription / Info", interactive=False)
                meta_out = gr.Code(label="JSON metadata (copyable)", language="json")

        def on_submit(audio_file, typed_text, use_tts, use_openai):
            return process_input(audio_file, typed_text, use_tts, use_openai)

        run_btn.click(
            on_submit,
            inputs=[audio, typed, tts_checkbox, prefer_openai],
            outputs=[corrected_out, score_out, explanation_out, trans_note, meta_out],
        )

        gr.Markdown("---")
        gr.Markdown(
            "**ESPeak** · Built for quick grammar checking of spoken and typed English. "
            "Designed for demos and interview projects."
        )
    return demo


if __name__ == "__main__":
    demo = build_ui()
    demo.launch(share=False, inbrowser=True)