File size: 2,693 Bytes
c1f1ea8
326c06d
cafd975
af773f6
326c06d
 
 
 
e4049b2
ea4ca8c
e4049b2
72ae7f9
e4049b2
72ae7f9
 
e4049b2
 
72ae7f9
 
e4049b2
 
72ae7f9
e4049b2
 
72ae7f9
 
98d3cfd
72ae7f9
ea4ca8c
 
 
 
 
 
 
 
 
7da589d
c1f1ea8
ea4ca8c
 
 
 
 
 
 
 
 
e4049b2
c1f1ea8
ea4ca8c
1357470
326c06d
c1f1ea8
ea4ca8c
72ae7f9
 
839b6bc
 
 
e4049b2
cafd975
839b6bc
ea4ca8c
c1f1ea8
 
1357470
c1f1ea8
 
 
98d3cfd
e4049b2
c1f1ea8
cafd975
ab6ae03
c1f1ea8
3fcfcd1
 
ea4ca8c
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import os
import gradio as gr
import pandas as pd
from optimizer import optimization

# Data files are resolved relative to the directory holding this module.
BASE_DIR = os.path.dirname(__file__)
SUMMARY_PATH = os.path.join(BASE_DIR, "region_sweep_summary.csv")

# One-letter amino-acid codes plus '*' (stop), in the row order used for display.
AA_ORDER = [*"ACDEFGHIKLMNPQRSTVWY*"]
# Same symbols as a set, for O(1) membership checks when sanitizing input.
AA_ALLOWED = {*AA_ORDER}

def aa_percent_to_onecol_df(aa_percent: dict, digits: int = 0) -> pd.DataFrame:
    """
    Render per-amino-acid codon fractions as a two-column DataFrame.

    Columns are "AA" and "Codon percentage". Each cell lists the codons of
    that amino acid from most to least used (ties broken alphabetically),
    e.g. 'CCC (97%) - CCG (3%)'. An amino acid with no codon data gets "—".

    Rows follow AA_ORDER when every key of `aa_percent` is a member of it
    (so absent amino acids still get a "—" row); otherwise the keys of
    `aa_percent` are used in sorted order.

    `digits` is the number of decimals shown for each percentage.
    """
    if set(aa_percent) <= set(AA_ORDER):
        ordered_aas = AA_ORDER
    else:
        ordered_aas = sorted(aa_percent)

    def _format_mix(codon_fractions):
        # Empty or missing mix -> placeholder dash.
        if not codon_fractions:
            return "—"
        # Highest fraction first; codon name is the tie-breaker.
        ranked = sorted(
            codon_fractions.items(),
            key=lambda item: (-item[1], item[0]),
        )
        return " - ".join(
            f"{codon} ({fraction*100:.{digits}f}%)" for codon, fraction in ranked
        )

    table_rows = [[aa, _format_mix(aa_percent.get(aa, {}))] for aa in ordered_aas]
    return pd.DataFrame(table_rows, columns=["AA", "Codon percentage"])

def _clean_aa_seq(raw: str) -> str:
    """
    Normalize a user-supplied amino-acid string.

    The input is upper-cased and every character that is not in AA_ALLOWED
    is discarded (whitespace, digits, punctuation, unknown letters).
    '*' (stop) survives because it is part of AA_ALLOWED.
    A falsy input (None, "") yields "".
    """
    if not raw:
        return ""
    # Keep only the characters that are recognised amino-acid symbols.
    return "".join(filter(AA_ALLOWED.__contains__, raw.upper()))

def run(aa_seq, use_percent_intervals=True):
    """
    Gradio callback: sanitize the amino-acid input, run the optimizer, and
    shape its results for display.

    Parameters:
        aa_seq: raw amino-acid text as typed by the user.
        use_percent_intervals: forwarded unchanged to `optimization`.
            Defaults to True so the function can be called with the
            sequence alone (the interface wires a single input).

    Returns:
        A 3-tuple `(designed_nt, aa_table, gc_percent)`:
        the first value returned by `optimization`, the codon-mix table
        built by `aa_percent_to_onecol_df`, and the GC result as a
        pandas DataFrame.

    Raises:
        gr.Error: if nothing is left of the input after cleaning.
    """
    # 1) sanitize the AA input
    cleaned = _clean_aa_seq(aa_seq)

    # 2) guard: empty after cleaning -> user-visible Gradio error
    if not cleaned:
        raise gr.Error("Input sequence contains no valid amino-acid characters after cleaning.")

    # 3) run the optimizer; its fourth return value is not used here
    designed_nt, aa_percent, gc_percent, _ = optimization(
        summary_path=SUMMARY_PATH,
        aa_seq=cleaned,  # pass the cleaned AA sequence
        use_percent_intervals=use_percent_intervals,  # honor the caller's choice
    )

    # 4) build tables
    aa_table = aa_percent_to_onecol_df(aa_percent, digits=0)

    # The Dataframe output expects tabular data; wrap anything else.
    if not isinstance(gc_percent, pd.DataFrame):
        gc_percent = pd.DataFrame(gc_percent)

    return designed_nt, aa_table, gc_percent

# ---- Gradio Interface ----
# Web UI definition: one text input feeding `run`, three outputs matching the
# 3-tuple that `run` returns (nucleotide string, codon table, GC table).
iface = gr.Interface(
    fn=run,
    inputs=[
        # The textbox value is passed to `run` as its first argument (aa_seq).
        gr.Textbox(label="Amino Acid Sequence", lines=5, placeholder="e.g. MKKLLPTAA...")
    ],
    outputs=[
        # Order must stay aligned with the tuple returned by `run`.
        gr.Textbox(label="Optimized Nucleotide Sequence"),
        gr.Dataframe(label="Codon Usage Percent (per AA)", wrap=True),
        gr.Dataframe(label="GC Content (%)", wrap=True),
    ],
    title="Codon Optimizer",
    # NOTE(review): presumably disables the flag button; this keyword depends
    # on the installed Gradio version — confirm.
    flagging_mode="never",
)

# Start the app only when this file is executed directly, not on import.
if __name__ == "__main__":
    # queue() is nice for HF Spaces concurrency, but optional
    # iface.queue().launch()
    iface.launch()