|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
|
|
import gradio as gr |
|
import huggingface_hub |
|
import sentence_transformers |
|
|
|
from transformers import AutoTokenizer, AutoModel |
|
import torch |
|
|
|
|
|
def func_ClearInputs():
    """Return the blank values used to reset the UI.

    Returns:
        A tuple of three empty strings. In this file the Clear button maps
        them, in order, onto the two input text panels and the result box.
    """
    blank = ""
    return blank, blank, blank
|
|
|
|
|
|
|
# Hugging Face Hub identifier of the checkpoint used for all embeddings.
str_ModelID_ModernBERT = "answerdotai/ModernBERT-large"

# Assemble the sentence-embedding pipeline by hand: a transformer module that
# yields token embeddings, followed by a pooling module that reduces them to a
# single vector per input.
word_embedding_model = sentence_transformers.models.Transformer(
    model_name_or_path=str_ModelID_ModernBERT,
)

# Only the embedding width is supplied, so the pooling strategy is whatever
# the installed sentence-transformers release uses by default
# (presumably mean pooling -- confirm against the library version in use).
pooling_model = sentence_transformers.models.Pooling(
    word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(),
)

# Built once at import time and shared by every request handler below.
sentenceModel_ModernBERT = sentence_transformers.SentenceTransformer(
    modules=[word_embedding_model, pooling_model],
)
|
|
|
|
|
|
|
def get_SentenceEmbeddings_ModernBERT(sentence):
    """Encode text with the module-level ModernBERT sentence model.

    Args:
        sentence: Input forwarded unchanged to
            ``sentenceModel_ModernBERT.encode``.

    Returns:
        Whatever ``encode`` produces for that input (presumably a single
        embedding vector when given one string -- confirm against the
        sentence-transformers documentation).
    """
    return sentenceModel_ModernBERT.encode(sentence)
|
|
|
|
|
def get_sentence_embedding(sentence: str) -> torch.Tensor:
    """Compute a mean-pooled ModernBERT embedding for ``sentence``.

    The sentence is tokenized, run through the underlying Hugging Face model
    without gradient tracking, and the token embeddings are averaged using the
    attention mask so padding positions do not contribute.

    Args:
        sentence: Text to embed.

    Returns:
        The pooled embedding with singleton dimensions squeezed away.
    """
    # Use the tokenizer and the raw Hugging Face model held by the Transformer
    # module. The SentenceTransformer wrapper expects a features dict rather
    # than keyword tensors and does not expose ``last_hidden_state``.
    tokenizer = word_embedding_model.tokenizer
    transformer = word_embedding_model.auto_model

    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    # The model may have been moved to an accelerator when the
    # SentenceTransformer was built; keep the inputs on the same device.
    inputs = inputs.to(transformer.device)

    with torch.no_grad():
        outputs = transformer(**inputs)

    token_embeddings = outputs.last_hidden_state

    # Broadcast the attention mask over the hidden dimension so masked
    # positions are zeroed out of the sum.
    attention_mask = inputs["attention_mask"]
    mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

    summed_embeddings = torch.sum(token_embeddings * mask_expanded, dim=1)
    # Clamp the token count to avoid dividing by zero on an all-masked row.
    token_counts = torch.clamp(mask_expanded.sum(dim=1), min=1e-9)
    sentence_embedding = summed_embeddings / token_counts

    return sentence_embedding.squeeze()
|
|
|
def func_sBERT_SimilarityResult(str_Text_1, str_Text_2):
    """Return a formatted cosine-similarity score for two texts.

    Both texts are embedded with ``get_SentenceEmbeddings_ModernBERT`` and
    compared with ``sentence_transformers.util.pytorch_cos_sim``.

    Args:
        str_Text_1: First text to compare.
        str_Text_2: Second text to compare.

    Returns:
        ``"Clinical Similarity Score: <value>"`` with the score formatted to
        four decimal places, or a validation message when either input is
        missing, empty, or whitespace-only.
    """
    # A missing value (None) is treated like an empty string so the caller
    # gets the validation message instead of an AttributeError.
    if not (str_Text_1 or "").strip() or not (str_Text_2 or "").strip():
        return "Both text inputs must be non-empty."

    # Embeddings are computed locally; no remote inference client is needed.
    arrEmbedding_Text_1 = get_SentenceEmbeddings_ModernBERT(str_Text_1)
    arrEmbedding_Text_2 = get_SentenceEmbeddings_ModernBERT(str_Text_2)

    tensor_Similarity = sentence_transformers.util.pytorch_cos_sim(
        arrEmbedding_Text_1, arrEmbedding_Text_2
    )
    f_Similarity = tensor_Similarity.item()

    return f"Clinical Similarity Score: {f_Similarity:.4f}"
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Local import: only these startup diagnostics need subprocess.
    import subprocess

    # Flush so this line appears before the child-process output below.
    print(f"os.getcwd() = {os.getcwd()}", flush=True)

    # Startup diagnostics: print a label, then a long listing of the working
    # directory, the filesystem root and /home/. Arguments are passed as a
    # list (no shell), so a path containing spaces or shell metacharacters is
    # listed correctly instead of being re-parsed by the shell.
    for str_Label, str_Path in (
        (os.getcwd(), os.getcwd()),
        ("/:", "/"),
        ("/home/:", "/home/"),
    ):
        print(f"ls -al {str_Label}", flush=True)
        try:
            # check=False keeps this best-effort: a failing ``ls`` must not
            # stop the app from starting.
            subprocess.run(["ls", "-al", str_Path], check=False)
        except OSError as err:
            # ``ls`` may be unavailable on this platform; report and move on.
            print(f"Could not list {str_Path}: {err}", flush=True)

    with gr.Blocks() as grBlocks_SentenceSimilarity__MCP_Server:
        gr.Markdown("# ModernBERT for Clinical Text Similarity using HF Inference Server, MaxSeqLength==8192")
        gr.Markdown("This application calculates Cosine Similarity Score between two Texts' ModernBERT Sentence-Embeddings")

        # Two side-by-side panels for the texts to compare.
        with gr.Row():
            grTextBox_Input_1 = gr.Textbox(label="Text Panel 1", lines=20)
            grTextBox_Input_2 = gr.Textbox(label="Text Panel 2", lines=20)

        # Narrow column of buttons next to a wider read-only result box.
        with gr.Row():
            with gr.Column(scale=1):
                grButton_Clear = gr.Button("Clear")
                grButton_Submit = gr.Button("Submit")
            with gr.Column(scale=3):
                grTextbox_Output = gr.Textbox(label="Similarity Result", interactive=False)

        # Submit scores the two panels; Clear blanks both panels and the result.
        grButton_Submit.click(
            fn=func_sBERT_SimilarityResult,
            inputs=[grTextBox_Input_1, grTextBox_Input_2],
            outputs=grTextbox_Output,
        )
        grButton_Clear.click(
            fn=func_ClearInputs,
            inputs=[],
            outputs=[grTextBox_Input_1, grTextBox_Input_2, grTextbox_Output],
        )

    # NOTE(review): share=True presumably requests a public Gradio share link
    # and mcp_server=True presumably exposes the app's functions over MCP --
    # confirm both are intended for this deployment.
    grBlocks_SentenceSimilarity__MCP_Server.launch(mcp_server=True, share=True)