import os
import gradio as gr
import pandas as pd
from optimizer import optimization

# Data files are looked up next to this module rather than in the current
# working directory.
BASE_DIR = os.path.dirname(__file__)
SUMMARY_PATH = os.path.join(BASE_DIR, "region_sweep_summary.csv")

# One-letter codes accepted as input: the 20 letters below plus '*' (stop).
# The string order is also the row order used when rendering per-AA tables.
_AA_ALPHABET = "ACDEFGHIKLMNPQRSTVWY*"
AA_ORDER = list(_AA_ALPHABET)
AA_ALLOWED = set(_AA_ALPHABET)

def aa_percent_to_onecol_df(aa_percent: dict, digits: int = 0) -> pd.DataFrame:
    """
    Render a per-amino-acid codon mix as a two-column table.

    Columns are "AA" and "Codon percentage". Each cell lists the codons of
    that amino acid, highest value first (ties broken by codon name), with
    every value multiplied by 100 for display.
    Example cell: 'CCC (97%) - CCG (3%)'

    Args:
        aa_percent: mapping of amino acid -> {codon: value}.
        digits: number of decimals shown for each percentage.

    Returns:
        A DataFrame with one row per amino acid. Rows follow AA_ORDER when
        every key of ``aa_percent`` is in AA_ORDER (amino acids with no
        codon data get a "—" cell); otherwise rows are the sorted keys of
        ``aa_percent``.
    """

    def _format_mix(codon_values) -> str:
        # A missing or empty mix is shown as a dash placeholder.
        if not codon_values:
            return "—"
        ranked = sorted(codon_values.items(), key=lambda item: (-item[1], item[0]))
        return " - ".join(
            f"{codon} ({value*100:.{digits}f}%)" for codon, value in ranked
        )

    if set(aa_percent) <= set(AA_ORDER):
        residues = AA_ORDER
    else:
        residues = sorted(aa_percent)

    table_rows = [
        [residue, _format_mix(aa_percent.get(residue, {}))]
        for residue in residues
    ]
    return pd.DataFrame(table_rows, columns=["AA", "Codon percentage"])

def _clean_aa_seq(raw: str) -> str:
    """
    Normalise a user-supplied amino-acid string.

    The input is upper-cased and every character that is not a member of
    AA_ALLOWED is discarded, which removes whitespace, digits and
    punctuation in one pass. '*' (stop) survives because it is part of the
    allowed set. A falsy input (None or "") yields "".
    """
    if not raw:
        return ""
    # Membership filter against the allowed one-letter codes.
    return "".join(filter(AA_ALLOWED.__contains__, raw.upper()))

def run(aa_seq, use_percent_intervals=True):
    """
    Gradio callback: clean the input sequence, optimize it, and build the
    output tables.

    Args:
        aa_seq: raw amino-acid text as typed by the user.
        use_percent_intervals: forwarded unchanged to ``optimization``.
            Defaults to True so the function can be called with the
            sequence alone, which is how a single-input interface invokes
            it.

    Returns:
        A tuple ``(designed_nt, aa_table, gc_percent)``: the first value
        returned by ``optimization``, the per-AA codon table built by
        ``aa_percent_to_onecol_df``, and the GC result as a DataFrame.

    Raises:
        gr.Error: if nothing is left of the input after cleaning.
    """
    # 1) sanitize the AA input
    cleaned = _clean_aa_seq(aa_seq)

    # 2) guard: empty after cleaning
    if not cleaned:
        # gr.Error is raised instead of a plain exception so Gradio handles it
        raise gr.Error("Input sequence contains no valid amino-acid characters after cleaning.")

    # 3) run the optimizer; its fourth return value is not used here
    designed_nt, aa_percent, gc_percent, _ = optimization(
        summary_path=SUMMARY_PATH,
        aa_seq=cleaned,  # pass the cleaned AA sequence
        use_percent_intervals=use_percent_intervals,
    )

    # 4) build tables
    aa_table = aa_percent_to_onecol_df(aa_percent, digits=0)

    # Convert the GC result to a DataFrame when the optimizer returned
    # something else.
    if not isinstance(gc_percent, pd.DataFrame):
        gc_percent = pd.DataFrame(gc_percent)

    return designed_nt, aa_table, gc_percent

# ---- Gradio Interface ----
# One free-text input (the amino-acid sequence) mapped onto three outputs,
# in the same order as the tuple returned by `run`.
iface = gr.Interface(
    title="Codon Optimizer",
    fn=run,
    inputs=[
        gr.Textbox(
            label="Amino Acid Sequence",
            lines=5,
            placeholder="e.g. MKKLLPTAA...",
        ),
    ],
    outputs=[
        gr.Textbox(label="Optimized Nucleotide Sequence"),
        gr.Dataframe(label="Codon Usage Percent (per AA)", wrap=True),
        gr.Dataframe(label="GC Content (%)", wrap=True),
    ],
    flagging_mode="never",
)

if __name__ == "__main__":
    # Launch the app only when this file is executed as a script.
    # Request queuing is optional and left disabled; to enable it, use:
    # iface.queue().launch()
    iface.launch()