# SciBERT medical chatbot demo: embeds symptom text with SciBERT,
# classifies with GradientBoosting, and serves predictions via Gradio.
import pandas as pd
import json
import gradio as gr
import numpy as np
import torch
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModel
# --- Data and model setup -------------------------------------------------
# Load train/test splits from local CSVs. Expected columns (per usage below):
# 'Disease' (label string) and 'text' (symptom description).
train_df = pd.read_csv("Train_dataset.csv")
test_df = pd.read_csv("Test_dataset.csv")

# Load per-disease metadata (Description / Severity / Precaution),
# keyed by disease name for O(1) lookup at prediction time.
with open("disease_mapping.json", "r") as f:
    disease_info = {item["Disease"]: item for item in json.load(f)}

# Encode disease names into integer class labels.
le = LabelEncoder()
train_df['label'] = le.fit_transform(train_df['Disease'])

# Drop test samples whose disease never appears in training — `le.transform`
# would raise on unseen classes. `.copy()` avoids pandas' SettingWithCopyWarning:
# without it, the 'label' column is assigned on a view of test_df and the
# write may silently not stick.
test_df = test_df[test_df['Disease'].isin(le.classes_)].copy()
test_df['label'] = le.transform(test_df['Disease'])

# Load SciBERT tokenizer and encoder (from_pretrained returns the model
# already in eval mode).
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")
# Embed a single text with SciBERT and keep only the [CLS] vector.
def get_embedding(text):
    """Return the SciBERT [CLS]-token embedding of *text* as a 1-D numpy array.

    The input is truncated/padded to 128 tokens; inference runs under
    ``torch.no_grad()`` since no gradients are needed.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Position 0 is the [CLS] token; squeeze drops the batch dimension.
    cls_vector = hidden_states[:, 0, :]
    return cls_vector.squeeze().numpy()
# Generate embeddings for training and testing
print("Generating embeddings for training data...")
# One SciBERT [CLS] vector per row, stacked into an (n_samples, hidden_size)
# matrix. NOTE(review): texts are embedded one at a time — batching through
# the tokenizer would be much faster for large datasets; confirm sizes.
X_train = np.vstack([get_embedding(text) for text in train_df['text']])
y_train = train_df['label'].values
print("Generating embeddings for test data...")
X_test = np.vstack([get_embedding(text) for text in test_df['text']])
y_test = test_df['label'].values
# Train Gradient Boosting classifier
# Default hyperparameters; X_test/y_test are prepared above but never
# evaluated in this script (classification_report is imported yet unused).
print("Training classifier...")
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)
# Map a symptom description to the three most likely diseases.
def predict_disease(symptoms):
    """Return the top-3 disease predictions for a free-text symptom string.

    Each prediction is a dict with keys ``Disease``, ``Confidence``
    (percentage, rounded to 2 decimals), ``Description``, ``Severity``
    and ``Precaution`` (the last three fall back to "N/A" when the
    disease is missing from the metadata mapping).
    """
    embedding = get_embedding(symptoms).reshape(1, -1)
    probabilities = clf.predict_proba(embedding)[0]
    # Class indices of the three highest probabilities, best first.
    ranked = np.argsort(probabilities)[::-1][:3]
    predictions = []
    for class_idx in ranked:
        name = le.inverse_transform([class_idx])[0]
        meta = disease_info.get(name, {})
        predictions.append({
            "Disease": name,
            "Confidence": round(probabilities[class_idx] * 100, 2),
            "Description": meta.get("Description", "N/A"),
            "Severity": meta.get("Severity", "N/A"),
            "Precaution": meta.get("Precaution", "N/A"),
        })
    return predictions
# Render predictions as Markdown for the Gradio output pane.
def chatbot_interface(symptom_text):
    """Format the top-3 predictions for *symptom_text* as a Markdown string."""
    sections = []
    for rank, pred in enumerate(predict_disease(symptom_text), start=1):
        sections.append(
            f"### Prediction {rank}\n"
            f"- **Disease:** {pred['Disease']} ({pred['Confidence']}%)\n"
            f"- **Description:** {pred['Description']}\n"
            f"- **Severity:** {pred['Severity']}\n"
            f"- **Precaution:** {pred['Precaution']}\n\n"
        )
    return "".join(sections).strip()
# Build and start the Gradio web UI (blocks until the server stops).
demo = gr.Interface(
    fn=chatbot_interface,
    inputs=gr.Textbox(label="Enter your symptoms"),
    outputs=gr.Markdown(),
    title="SciBERT Medical Chatbot",
    description="AI Medical Assistant that predicts diseases based on symptoms using SciBERT embeddings.",
)
demo.launch()