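"""Gradio demo: semantic similarity search over the Alice training corpus.

Embeds the "positive" passages from training_triplets.jsonl with a
SentenceTransformer model and ranks them against a user prompt by
dot-product similarity.
"""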
import gradio as gr
import time
import os
import json
# torch / transformers are only needed by the commented-out raw-Transformers path below
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer, util

# --- Path Configuration ---
# Get the absolute path of the directory containing this script
script_dir = os.path.dirname(os.path.abspath(__file__))

# Check if running in a Hugging Face Space
is_hf_space = "SPACE_ID" in os.environ

if is_hf_space:
    # In a Space, load model from the Hub and data from the repo root
    model_path = os.environ.get("MODEL_REPO_ID", "philtoms/minilm-alice-base-rsft-v2")
    data_path = "training_triplets.jsonl"
    print(f"Running on HF Spaces. Using model from Hub: {model_path}")
else:
    # Locally, construct absolute paths based on the script's location
    model_path = os.path.join(script_dir, "..", "models", "minilm-alice-base-rsft-v2", "final")
    data_path = os.path.join(script_dir, "..", "data", "training_triplets.jsonl")
    print(f"Running locally. Using local model at: {model_path}")

# --- Model and Tokenizer Loading ---
try:
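    # NOTE: these hard-coded assignments override the model_path configured above;
    # comment out the active one to load the fine-tuned checkpoint instead.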
    # model_path = "sentence-transformers/all-MiniLM-L6-v2"
    model_path = "sentence-transformers/multi-qa-mpnet-base-cos-v1"
    # model_path = "Qwen/Qwen3-Embedding-0.6B"

    # tokenizer = AutoTokenizer.from_pretrained(model_path)
    # model = AutoModel.from_pretrained(model_path)
    model = SentenceTransformer(model_path)

except Exception as e:
    raise gr.Error(f"Failed to load model from '{model_path}'. Error: {e}")

# --- Dataset Loading ---
if not os.path.exists(data_path):
     raise gr.Error(f"Data file not found at '{data_path}'. Please ensure the file exists.")

dataset = []
with open(data_path, "r") as f:
    for line in f:
        dataset.append(json.loads(line))

import re  # only used by the commented-out chunking experiment below
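# --- Unused Chunking Experiment (kept for reference) ---
# Splits each document into overlapping sentence chunks, embeds the chunks, and
# maps the best-scoring chunk back to its source document. The active code
# further below embeds whole passages instead.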

# def split_into_sentences(text):
#     """Splits a paragraph into sentences based on capitalization and punctuation."""
#     # This regex looks for a capital letter, followed by anything that's not a period, 
#     # exclamation mark, or question mark, and then ends with one of those punctuation marks.
#     sentences = re.findall(r'([A-Z][^.!?]*[.!?])', text)
#     return sentences

# def create_overlapped_chunks(corpus_documents, chunk_size=2, overlap=1):
#     chunked_corpus = []
#     for doc_idx, doc_text in enumerate(corpus_documents):
#         sentences = split_into_sentences(doc_text)
#         if not sentences:
#             continue

#         # If there are fewer sentences than chunk_size, just use the whole document as one chunk
#         if len(sentences) < chunk_size:
#             chunked_corpus.append({
#                 "text": doc_text,
#                 "original_doc_idx": doc_idx,
#                 "start_sentence_idx": 0,
#                 "end_sentence_idx": len(sentences) - 1
#             })
#             continue

#         for i in range(0, len(sentences) - chunk_size + 1, chunk_size - overlap):
#             chunk_sentences = sentences[i : i + chunk_size]
#             chunk_text = " ".join(chunk_sentences)
#             chunked_corpus.append({
#                 "text": chunk_text,
#                 "original_doc_idx": doc_idx,
#                 "start_sentence_idx": i,
#                 "end_sentence_idx": i + chunk_size - 1
#             })
#     return chunked_corpus

# def process_documents_for_chunking(documents):
#     chunked_corpus_data = create_overlapped_chunks(documents)
#     flat_corpus_chunks = [item["text"] for item in chunked_corpus_data]
#     return chunked_corpus_data, flat_corpus_chunks

# Pre-compute corpus embeddings once at startup; each triplet's "positive" field is one corpus passage
original_corpus = [item["positive"] for item in dataset]
# chunked_corpus_data, flat_corpus_chunks = process_documents_for_chunking(original_corpus)
# corpus_embeddings = model.encode(flat_corpus_chunks)
corpus_embeddings = model.encode(original_corpus)

# def find_similar(prompt, top_k):
#     start_time = time.time()

#     prompt_embedding = model.encode(prompt)
#     scores = util.dot_score(prompt_embedding, corpus_embeddings)[0].cpu().tolist()
    
#     # Pair scores with the chunked corpus data
#     scored_chunks = []
#     for i, score in enumerate(scores):
#         scored_chunks.append({
#             "score": score,
#             "text": chunked_corpus_data[i]["text"],
#             "original_doc_idx": chunked_corpus_data[i]["original_doc_idx"]
#         })

#     # Sort by decreasing score
#     scored_chunks = sorted(scored_chunks, key=lambda x: x["score"], reverse=True)

#     results = []
#     for item in scored_chunks[:top_k]:
#         # Return the original document text, not just the chunk
#         original_doc_text = original_corpus[item["original_doc_idx"]]
#         results.append((item["score"], original_doc_text))

#     end_time = time.time()

#     return results, f"{(end_time - start_time) * 1000:.2f} ms"

# with torch.no_grad():
#     encoded_corpus = tokenizer(corpus, padding=True, truncation=True, return_tensors='pt')
#     corpus_embeddings = model(**encoded_corpus).last_hidden_state.mean(dim=1)

def find_similar(prompt, top_k):
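    """Return the top_k corpus passages most similar to `prompt`.

    Returns a list of (score, passage) tuples sorted by decreasing dot-product
    similarity, plus the elapsed search time formatted in milliseconds.
    """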
    start_time = time.time()

    prompt_embedding = model.encode(prompt)
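    # Note: the *-cos-v1 checkpoints produce normalized embeddings (per the
    # sentence-transformers model card), so dot_score here behaves like cosine similarity.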
    scores = util.dot_score(prompt_embedding, corpus_embeddings)[0].cpu().tolist()
    doc_score_pairs = list(zip(original_corpus, scores))

    # Sort by decreasing score
    doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)

    # with torch.no_grad():
    #     encoded_prompt = tokenizer(prompt, padding=True, truncation=True, return_tensors='pt')
    #     prompt_embedding = model(**encoded_prompt).last_hidden_state.mean(dim=1)
        
    #     cos_scores = torch.nn.functional.cosine_similarity(prompt_embedding, corpus_embeddings, dim=1)
    #     top_results = torch.topk(cos_scores, k=int(top_k))

    end_time = time.time()

    results = []
    for doc, score in doc_score_pairs[: int(top_k)]:  # slider values may arrive as floats
    # for doc, score in  doc_score_pairs:
        results.append((score, doc))

    return results, f"{(end_time - start_time) * 1000:.2f} ms"

iface = gr.Interface(
    fn=find_similar,
    inputs=[
        gr.Dropdown(
            ["Alice sees White rabbit for the first time", "Alice meets caterpillar", "sad turtle story"],
            label="Select a prompt or type your own",
            allow_custom_value=True
        ),
        gr.Slider(1, 20, value=5, step=1, label="Top K")
    ],
    outputs=[
        gr.Dataframe(headers=[ "Score", "Response"]),
        gr.Textbox(label="Time Taken")
    ],
    title="RSFT Alice Embeddings (Transformers)",
    description=f"Enter a prompt to find similar sentences from the corpus."
)

if __name__ == "__main__":
    iface.launch()