# app.py (Final, Robust Version)

import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pickle
from huggingface_hub import hf_hub_download
# =============================================================================
# 1. LOAD MODEL, TOKENIZER, AND LABEL ENCODER
# =============================================================================

# Define the path to your model repository
model_path = "Tarive/esm2_t12_35M_UR50D-finetuned-pfam-1k"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_path)

print("Loading model...")
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Move model to GPU if available for faster inference
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print(f"Model loaded on device: {device}")

# Download and load the label encoder
print("Downloading and loading label encoder...")
encoder_path = hf_hub_download(repo_id=model_path, filename="label_encoder.pkl")
with open(encoder_path, "rb") as f:
    label_encoder = pickle.load(f)
print("Label encoder loaded.")
# =============================================================================
# 2. DEFINE THE LOW-LEVEL PREDICTION FUNCTION
# =============================================================================

# This function manually replicates the training data processing steps.
def predict_family(sequence):
    # 1. Tokenize the input sequence with the exact same settings as training
    inputs = tokenizer(
        sequence,
        return_tensors="pt",  # Return PyTorch tensors
        truncation=True,
        padding=True,
        max_length=256        # Ensure this matches your training max_length
    ).to(device)              # Move tokenized inputs to the same device as the model

    # 2. Get model predictions (logits)
    with torch.no_grad():  # Disable gradient calculation for efficiency
        logits = model(**inputs).logits

    # 3. Get the top 5 predictions
    top_k_indices = torch.topk(logits, 5, dim=-1).indices.squeeze().tolist()

    # 4. Convert logits to probabilities (softmax)
    probabilities = torch.nn.functional.softmax(logits, dim=-1).squeeze().tolist()

    # 5. Decode the numerical labels back to family names
    results = {}
    for index in top_k_indices:
        family_name = label_encoder.inverse_transform([index])[0]
        confidence_score = probabilities[index]
        results[family_name] = confidence_score

    return results
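# Optional sanity check (illustrative only -- uncomment to test the function
# locally before the interface is built). The sequence is the second entry from
# the Gradio examples list below; the result is a dict of family -> confidence.
# sample_sequence = "SNYRPFVFKENDEVLALMAVWEFDDFIYVEHLAVDSKLRGKGVGSELIKNYLNRCDKRVFLEVEPPNCEISKKRVSFYEKLGFSF"
# print(predict_family(sample_sequence))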
# =============================================================================
# 3. CREATE THE GRADIO INTERFACE (No changes here)
# =============================================================================
print("Creating Gradio interface...")
iface = gr.Interface(
    fn=predict_family,
    inputs=gr.Textbox(
        lines=10,
        label="Protein Amino Acid Sequence",
        placeholder="Paste your protein sequence here..."
    ),
    outputs=gr.Label(
        num_top_classes=5,
        label="Predicted Families"
    ),
    title="Protein Family Classifier",
    description="This demo uses a fine-tuned ESM-2 model to predict the protein family from its amino acid sequence. Enter a sequence to see the top 5 predictions and their confidence scores.",
    examples=[
        ["MLLVLKISRNAITTFSKEQLDSF"],
        ["SNYRPFVFKENDEVLALMAVWEFDDFIYVEHLAVDSKLRGKGVGSELIKNYLNRCDKRVFLEVEPPNCEISKKRVSFYEKLGFSF"],
        ["KRAIDLLLTLGSAILVLPLVLAIAAWIRMDSPGSPFFTQRRIGQHGREMHILKFRTMVQNAECVLHDCLAANPALNAEWERDQKLKCDPRVTRAGAFLRKTSLDELPQLWNVLRGEMSLVGPRPIVQDEVEKYGEVFDLYTRVKPGITGLWQVSGRNDVSYPQRVEMDRYYICNWSVWFDIWILAKTVPVVLH"]
    ],
    allow_flagging="never"
)

# Launch the interface!
print("Launching app...")
iface.launch()
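# Once the Space is live, the demo can also be queried programmatically with the
# gradio_client package. This is a minimal sketch, not part of app.py: the Space
# id below is a placeholder (the actual Space name is not defined in this file),
# and "/predict" assumes Gradio's default endpoint name for an Interface.
#
#   from gradio_client import Client
#   client = Client("your-username/your-space-name")  # placeholder Space id
#   result = client.predict("MLLVLKISRNAITTFSKEQLDSF", api_name="/predict")
#   print(result)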