#!/usr/bin/env python3
"""
OpenMed NER Model Discovery App
A beautiful Gradio interface for exploring and discovering OpenMed NER models
"""
import html
import json
import re
from collections import Counter
from pathlib import Path

import gradio as gr
import pandas as pd
class OpenMedModelDiscovery:
    """Searchable catalogue of OpenMed NER models rendered as HTML cards.

    The catalogue is loaded from a CSV file into ``self.df``. The methods of
    this class read the columns ``model_name``, ``short_name``,
    ``architecture``, ``domain``, ``description``, ``entities``
    (comma-separated labels), ``size_mb``, ``hf_link`` and ``code_snippet``,
    and add the derived columns ``size_category`` and ``entity_list``.
    """

    # Badge colour per entity label; "Default" is the fallback colour.
    DEFAULT_ENTITY_COLORS = {
        "Chemical": "#2E8B57",  # SeaGreen
        "DNA": "#4169E1",  # RoyalBlue
        "RNA": "#1E90FF",  # DodgerBlue
        "Protein": "#9932CC",  # DarkOrchid
        "Gene": "#8A2BE2",  # BlueViolet
        "Gene/Protein": "#6A5ACD",  # SlateBlue
        "Disease": "#DC143C",  # Crimson
        "Cell Line": "#FF6347",  # Tomato
        "Cell Type": "#FF4500",  # OrangeRed
        "Cell": "#FF8C00",  # DarkOrange
        "Anatomy": "#32CD32",  # LimeGreen
        "Species": "#228B22",  # ForestGreen
        "Cancer": "#8B0000",  # DarkRed
        "Clinical": "#4682B4",  # SteelBlue
        "Protein Complex": "#9370DB",  # MediumPurple
        "Protein Family": "#8B008B",  # DarkMagenta
        "Protein Variant": "#9400D3",  # DarkViolet
        "Amino Acid": "#BA55D3",  # MediumOrchid
        "Cellular Component": "#20B2AA",  # LightSeaGreen
        "Default": "#696969",  # DimGray
    }

    # Text columns scanned by the free-text search.
    SEARCH_COLUMNS = ("model_name", "short_name", "domain", "description", "entities")

    def __init__(self, data_file=None):
        """Load the model database and derive the helper columns.

        Args:
            data_file: Optional path to the CSV database. Defaults to
                ``data/openmed_models_database.csv`` next to this file.
        """
        if data_file is None:
            data_file = Path(__file__).parent / "data" / "openmed_models_database.csv"
        self.data_file = Path(data_file)
        self.df = pd.read_csv(self.data_file)
        # Clean and prepare data
        self._prepare_data()
        # Per-instance copy so callers can customise colours without
        # mutating the class-level defaults.
        self.entity_colors = dict(self.DEFAULT_ENTITY_COLORS)

    @staticmethod
    def _normalize_label(label):
        """Return a case- and underscore-insensitive key for an entity label."""
        return str(label).strip().replace("_", " ").casefold()

    @staticmethod
    def _split_entities(raw):
        """Split a comma-separated entity string into non-empty, stripped labels."""
        return [part.strip() for part in raw.split(",") if part.strip()]

    @staticmethod
    def _plain_text(value):
        """Return ``value`` as text, mapping missing values (None/NaN) to ''."""
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ""
        return str(value)

    def _prepare_data(self):
        """Clean and prepare the data for better display."""
        # Fill missing values
        self.df["entities"] = self.df["entities"].fillna("").astype(str)
        self.df["size_mb"] = pd.to_numeric(self.df["size_mb"], errors="coerce")
        # Create size categories
        self.df["size_category"] = self.df["size_mb"].apply(self._categorize_size)
        # Split entities into lists for easier filtering
        self.df["entity_list"] = self.df["entities"].apply(self._split_entities)

    def _categorize_size(self, size_mb):
        """Map a numeric model size to its display bucket ("Unknown" for NaN)."""
        if pd.isna(size_mb):
            return "Unknown"
        if size_mb < 100:
            return "Compact (<100M)"
        if size_mb < 200:
            return "Medium (100-200M)"
        if size_mb < 400:
            return "Large (200-400M)"
        return "XLarge (>400M)"

    def _color_for(self, entity):
        """Look up the badge colour for ``entity``.

        An exact key match wins; otherwise labels are compared ignoring case
        and treating underscores as spaces, so that "cell_line" picks up the
        colour registered for "Cell Line". Unknown labels get "Default".
        """
        colors = self.entity_colors
        if entity in colors:
            return colors[entity]
        wanted = self._normalize_label(entity)
        for name, color in colors.items():
            if self._normalize_label(name) == wanted:
                return color
        return colors["Default"]

    def create_entity_badge(self, entity):
        """Create a colored badge (HTML ``<span>``) for an entity type."""
        color = self._color_for(entity)
        label = html.escape(str(entity))
        return f'<span style="background-color: {color}; color: white; padding: 3px 8px; border-radius: 12px; font-size: 12px; margin: 3px 4px; display: inline-block; line-height: 1.4;">{label}</span>'

    def create_model_card(self, row):
        """Create the HTML card for one model.

        Args:
            row: Mapping-like record (DataFrame row or dict) providing
                ``short_name``, ``architecture``, ``domain``, ``size_mb``,
                ``entity_list``, ``description``, ``hf_link`` and
                ``code_snippet``.

        Returns:
            The card as an HTML string. All values taken from ``row`` are
            HTML-escaped before being embedded.
        """
        badges = " ".join(
            self.create_entity_badge(label) for label in row["entity_list"] if label
        )
        if not badges:
            badges = '<span style="color: #6c757d; margin: 20px;">No entities available</span>'

        size_mb = row["size_mb"]
        size_text = f"{size_mb:.0f}M" if pd.notna(size_mb) else "Unknown"

        short_name = html.escape(self._plain_text(row["short_name"]))
        architecture = html.escape(self._plain_text(row["architecture"]))
        domain = html.escape(self._plain_text(row["domain"]))
        description = html.escape(self._plain_text(row["description"]))
        hf_link = html.escape(self._plain_text(row["hf_link"]), quote=True)

        snippet = self._plain_text(row["code_snippet"])
        snippet_html = html.escape(snippet)
        # json.dumps yields a valid JS string literal (quotes and newlines
        # escaped); html.escape then makes it safe inside the double-quoted
        # onclick attribute.
        snippet_js = html.escape(json.dumps(snippet), quote=True)

        return f"""
        <div style="border: 1px solid #ddd; border-radius: 8px; padding: 16px; margin: 8px 0; background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);">
            <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px;">
                <h3 style="margin: 0; color: #2c3e50; font-size: 18px;">{short_name}</h3>
                <span style="background-color: #6c757d; color: white; padding: 4px 8px; border-radius: 4px; font-size: 12px;">{architecture}</span>
            </div>
            <div style="margin-bottom: 8px;">
                <strong>Domain:</strong> <span style="color: #495057;">{domain}</span> |
                <strong>Size:</strong> <span style="color: #495057;">{size_text}</span>
            </div>
            <div style="margin-bottom: 12px;">
                <strong>Entities:</strong><br>
                <div style="margin-top: 6px; line-height: 1.6;">
                    {badges}
                </div>
            </div>
            <div style="margin-bottom: 12px;">
                <strong>Description:</strong><br>
                <span style="color: #6c757d; font-style: italic;">{description}</span>
            </div>
            <div style="display: flex; gap: 8px; margin-bottom: 8px;">
                <a href="{hf_link}" target="_blank" style="background-color: #007bff; color: white; padding: 6px 12px; border-radius: 4px; text-decoration: none; font-size: 12px;">🤗 View on HF</a>
                <button onclick="copyToClipboard({snippet_js})" style="background-color: #28a745; color: white; padding: 6px 12px; border-radius: 4px; border: none; cursor: pointer; font-size: 12px;">📋 Copy Code</button>
            </div>
            <details style="margin-top: 8px;">
                <summary style="cursor: pointer; color: #007bff;">📝 Usage Code</summary>
                <pre style="background-color: #f8f9fa; padding: 8px; border-radius: 4px; margin-top: 4px; font-size: 11px; overflow-x: auto;"><code>from transformers import {snippet_html}</code></pre>
            </details>
        </div>
        """

    def search_models(
        self, text_query, entity_filters, domain_filters, size_filters, limit=20
    ):
        """Search and filter models, returning the results as HTML.

        Args:
            text_query: Free text matched literally (case-insensitive, not as
                a regular expression) against ``SEARCH_COLUMNS``. Empty or
                ``None`` disables the text filter.
            entity_filters: Entity labels; a model matches when it has at
                least one of them (compared ignoring case and underscores).
            domain_filters: Allowed ``domain`` values; empty means no filter.
            size_filters: Allowed ``size_category`` values; empty means no
                filter.
            limit: Maximum number of cards to render. Coerced to ``int``
                because UI sliders may deliver floats.

        Returns:
            An HTML string: a heading plus one card per model, or a
            "No models found" notice when nothing matches.
        """
        df = self.df
        mask = pd.Series(True, index=df.index, dtype=bool)

        # Text search
        query = (text_query or "").strip()
        if query:
            text_mask = pd.Series(False, index=df.index, dtype=bool)
            for column in self.SEARCH_COLUMNS:
                haystack = df[column].fillna("").astype(str)
                # regex=False: user input such as "(" or "c++" must not be
                # interpreted as a pattern.
                text_mask |= haystack.str.contains(query, case=False, regex=False)
            mask &= text_mask

        # Entity filters
        if entity_filters:
            wanted = {self._normalize_label(label) for label in entity_filters}
            mask &= (
                df["entity_list"]
                .apply(
                    lambda labels: any(
                        self._normalize_label(label) in wanted for label in labels
                    )
                )
                .astype(bool)
            )

        # Domain filters
        if domain_filters:
            mask &= df["domain"].isin(domain_filters)

        # Size filters
        if size_filters:
            mask &= df["size_category"].isin(size_filters)

        matches = df[mask]
        total = len(matches)

        # Limit results
        shown = matches.head(max(int(limit), 0))
        if shown.empty:
            return "<div style='text-align: center; padding: 40px; color: #6c757d;'><h3>No models found 😔</h3><p>Try adjusting your search criteria</p></div>"

        # Report the real match count, and say so when the list is truncated.
        summary = f"Found {total} models"
        if total > len(shown):
            summary += f" (showing first {len(shown)})"

        # Create model cards
        cards = "".join(self.create_model_card(row) for _, row in shown.iterrows())
        return f"<div style='margin-bottom: 16px;'><h2>{summary}</h2></div>" + cards

    def get_entity_stats(self):
        """Return a dict mapping each entity label to the number of models listing it."""
        counts = Counter(
            label
            for labels in self.df["entity_list"]
            for label in labels
            if label  # skip empty strings
        )
        return dict(counts)

    def get_filter_options(self):
        """Return ``(entities, domains, sizes)`` as sorted lists of available values."""
        # Missing domains are dropped: NaN/None cannot be sorted with strings.
        domains = sorted(self.df["domain"].dropna().unique())
        sizes = sorted(self.df["size_category"].unique())
        entities = sorted(
            {
                label
                for labels in self.df["entity_list"]
                for label in labels
                if label  # skip empty strings
            }
        )
        return entities, domains, sizes
# Fixed, comprehensive list of entity labels offered in the UI dropdown.
ALL_ENTITIES = [
    "amino_acid",
    "anatomical_system",
    "anatomy",
    "cancer",
    "cell",
    "cell_line",
    "cell_line_name",
    "cell_type",
    "cellular_component",
    "chemical",
    "clinical",
    "developing_anatomical_structure",
    "disease",
    "dna",
    "gene/protein",
    "gene_or_protein",
    "immaterial_anatomical_entity",
    "multi_tissue_structure",
    "organ",
    "organism",
    "organism_subdivision",
    "organism_substance",
    "pathological_formation",
    "protein",
    "protein_complex",
    "protein_family",
    "protein_variant",
    "rna",
    "species",
    "tissue",
]

# Build the model catalogue once at import time.
app = OpenMedModelDiscovery()

# Domain and size choices come from the data; the entity choices use the
# fixed list above instead of the dynamically extracted one.
domains, sizes = app.get_filter_options()[1:]
entities = ALL_ENTITIES
# Custom CSS. `gr.Blocks(css=...)` takes plain CSS rules, so the rules are
# given without a surrounding <style> element.
custom_css = """
.gradio-container {
    max-width: 1200px !important;
}
.model-grid {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
    gap: 16px;
    margin-top: 16px;
}
"""

# Page-level script backing the "Copy Code" buttons of the model cards.
# It is injected through `gr.Blocks(head=...)`; a <script> tag placed in the
# `css` string is never executed, which left copyToClipboard undefined.
custom_head = """
<script>
// Copy to clipboard functionality
function copyToClipboard(text) {
    navigator.clipboard.writeText(text).then(function() {
        alert('Code copied to clipboard!');
    });
}
</script>
"""

# Content of the "About" tab. Kept at column 0 so that indentation can never
# turn parts of it into Markdown code blocks.
ABOUT_MARKDOWN = """
# 🔬 About OpenMed NER Model Discovery
## What is OpenMed?
OpenMed is a collection of **380+ state-of-the-art Named Entity Recognition (NER) models** for biomedical and clinical text analysis. All models are:
- ✅ **Completely Free** - Apache 2.0 license
- ✅ **High Performance** - F1 scores up to 99.8%
- ✅ **Ready to Use** - Compatible with Hugging Face Transformers
- ✅ **Diverse** - Covers 8+ medical domains and 20+ entity types
## 🎯 Use Cases
- **Drug Discovery** - Identify chemicals and compounds
- **Clinical Research** - Extract diseases and symptoms
- **Genomics** - Detect genes, proteins, and DNA/RNA
- **Medical Records** - Parse anatomical terms and clinical notes
- **Pharmacovigilance** - Monitor drug safety and adverse events
## 🏗️ Model Architectures
- **BERT** - Bidirectional transformers for robust performance
- **DeBERTa** - Enhanced attention mechanisms
- **RoBERTa** - Optimized training for biomedical text
- **ModernBERT** - Latest advances in transformer architecture
## 📊 Coverage
- **8 Medical Domains** - Pharmacology, Genomics, Oncology, Pathology, etc.
- **20+ Entity Types** - Chemical, DNA, RNA, Protein, Disease, Anatomy, etc.
- **Multiple Sizes** - From 33M to 568M parameters
- **380+ Models** - Comprehensive coverage for any biomedical NLP task
## 🚀 Getting Started
1. **Search** - Use the search tab to find models by domain, entity type, or keywords
2. **Compare** - View model cards with performance metrics and descriptions
3. **Copy Code** - Get ready-to-use code snippets
4. **Deploy** - Download and use with Hugging Face Transformers
## 📧 Contact & Support
- **Models** - [OpenMed on Hugging Face](https://huggingface.co/OpenMed)
- **Paper** - Coming soon on arXiv
- **Community** - Join discussions on Hugging Face
---
Built with ❤️ for the biomedical research community
"""

# Create the Gradio interface
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue="blue", secondary_hue="green", neutral_hue="slate"
    ),
    css=custom_css,
    head=custom_head,
    title="🔬 OpenMed NER Model Discovery App",
) as demo:
    # Header
    gr.HTML(
        """
        <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 10px; margin-bottom: 20px;">
            <h1 style="color: white; margin: 0; font-size: 36px;">🔬 OpenMed NER Model Discovery</h1>
            <p style="color: white; margin: 10px 0 0 0; font-size: 18px;">Discover the perfect NER model for your biomedical text analysis from 380+ free OpenMed models</p>
        </div>
        """
    )
    with gr.Tabs():
        # Search Tab
        with gr.Tab("🔍 Search Models", elem_id="search-tab"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 🎯 Search & Filter")
                    text_search = gr.Textbox(
                        label="Search Models",
                        placeholder="e.g., chemical detection, cancer genomics, DNA...",
                        lines=1,
                    )
                    entity_filter = gr.Dropdown(
                        choices=entities,
                        label="Entities",
                        info="Search and select entities (e.g., Chemical, DNA, Disease)...",
                        multiselect=True,
                        value=[],
                        interactive=True,
                    )
                    with gr.Row():
                        domain_filter = gr.CheckboxGroup(
                            choices=domains, label="Domains", value=[]
                        )
                        size_filter = gr.CheckboxGroup(
                            choices=sizes, label="Model Size", value=[]
                        )
                    result_limit = gr.Slider(
                        minimum=5, maximum=50, value=20, step=5, label="Max Results"
                    )
                    clear_btn = gr.Button("🗑️ Clear Filters", variant="secondary")
                with gr.Column(scale=2):
                    gr.Markdown("### 📊 Search Results")
                    results_display = gr.HTML()

            # Auto-search on any input change
            def auto_search(*args):
                """Forward the current filter values to ``app.search_models``."""
                return app.search_models(*args)

            # The same five components act as both triggers and inputs, in the
            # positional order expected by ``search_models``.
            search_inputs = [
                text_search,
                entity_filter,
                domain_filter,
                size_filter,
                result_limit,
            ]

            # Connect auto-search to all inputs
            for component in search_inputs:
                component.change(
                    fn=auto_search,
                    inputs=search_inputs,
                    outputs=results_display,
                )

            # Clear filters
            def clear_filters():
                """Return the reset value for each filter component, in order."""
                return "", [], [], [], 20

            clear_btn.click(
                fn=clear_filters,
                outputs=search_inputs,
            )
        # About Tab
        with gr.Tab("ℹ️ About", elem_id="about-tab"):
            gr.Markdown(ABOUT_MARKDOWN)
    # Load initial results
    demo.load(fn=lambda: app.search_models("", [], [], [], 20), outputs=results_display)
if __name__ == "__main__":
    # Launch settings: listen on every interface at port 7860, without a
    # public share link, and with error reporting enabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    demo.launch(**launch_options)