Spaces:

philtoms
/

minilm-alice-base-rsft-v1

Sleeping

App Files Files Community

minilm-alice-base-rsft-v1 / app.py

philtoms

Upload app.py

380225c verified 18 days ago

raw

history blame contribute delete

6.67 kB

	import gradio as gr
	import time
	import os
	import json
	import torch
	from transformers import AutoTokenizer, AutoModel
	from sentence_transformers import SentenceTransformer, util

	# --- Path Configuration ---
	# Get the absolute path of the directory containing this script
	script_dir = os.path.dirname(os.path.abspath(__file__))

	# Check if running in a Hugging Face Space
	is_hf_space = "SPACE_ID" in os.environ

	if is_hf_space:
	# In a Space, load model from the Hub and data from the repo root
	model_path = os.environ.get("MODEL_REPO_ID", "philtoms/minilm-alice-base-rsft-v2")
	data_path = "training_triplets.jsonl"
	print(f"Running on HF Spaces. Using model from Hub: {model_path}")
	else:
	# Locally, construct absolute paths based on the script's location
	model_path = os.path.join(script_dir, "..", "models", "minilm-alice-base-rsft-v2", "final")
	data_path = os.path.join(script_dir, "..", "data", "training_triplets.jsonl")
	print(f"Running locally. Using local model at: {model_path}")

	# --- Model and Tokenizer Loading ---
	try:
	# model_path = "sentence-transformers/all-MiniLM-L6-v2"
	model_path = "sentence-transformers/multi-qa-mpnet-base-cos-v1"
	# model_path = "Qwen/Qwen3-Embedding-0.6B"

	# tokenizer = AutoTokenizer.from_pretrained(model_path)
	# model = AutoModel.from_pretrained(model_path)
	model = SentenceTransformer(model_path)

	except Exception as e:
	raise gr.Error(f"Failed to load model from '{model_path}'. Error: {e}")

	# --- Dataset Loading ---
	if not os.path.exists(data_path):
	raise gr.Error(f"Data file not found at '{data_path}'. Please ensure the file exists.")

	dataset = []
	with open(data_path, "r") as f:
	for line in f:
	dataset.append(json.loads(line))

	# Pre-compute corpus embeddings
	import re

	# def split_into_sentences(text):
	# """Splits a paragraph into sentences based on capitalization and punctuation."""
	# # This regex looks for a capital letter, followed by anything that's not a period,
	# # exclamation mark, or question mark, and then ends with one of those punctuation marks.
	# sentences = re.findall(r'([A-Z][^.!?]*[.!?])', text)
	# return sentences

	# def create_overlapped_chunks(corpus_documents, chunk_size=2, overlap=1):
	# chunked_corpus = []
	# for doc_idx, doc_text in enumerate(corpus_documents):
	# sentences = split_into_sentences(doc_text)
	# if not sentences:
	# continue

	# # If there are fewer sentences than chunk_size, just use the whole document as one chunk
	# if len(sentences) < chunk_size:
	# chunked_corpus.append({
	# "text": doc_text,
	# "original_doc_idx": doc_idx,
	# "start_sentence_idx": 0,
	# "end_sentence_idx": len(sentences) - 1
	# })
	# continue

	# for i in range(0, len(sentences) - chunk_size + 1, chunk_size - overlap):
	# chunk_sentences = sentences[i : i + chunk_size]
	# chunk_text = " ".join(chunk_sentences)
	# chunked_corpus.append({
	# "text": chunk_text,
	# "original_doc_idx": doc_idx,
	# "start_sentence_idx": i,
	# "end_sentence_idx": i + chunk_size - 1
	# })
	# return chunked_corpus

	# def process_documents_for_chunking(documents):
	# chunked_corpus_data = create_overlapped_chunks(documents)
	# flat_corpus_chunks = [item["text"] for item in chunked_corpus_data]
	# return chunked_corpus_data, flat_corpus_chunks

	# Pre-compute corpus embeddings
	original_corpus = [item["positive"] for item in dataset]
	# chunked_corpus_data, flat_corpus_chunks = process_documents_for_chunking(original_corpus)
	# corpus_embeddings = model.encode(flat_corpus_chunks)
	corpus_embeddings = model.encode(original_corpus)

	# def find_similar(prompt, top_k):
	# start_time = time.time()

	# prompt_embedding = model.encode(prompt)
	# scores = util.dot_score(prompt_embedding, corpus_embeddings)[0].cpu().tolist()

	# # Pair scores with the chunked corpus data
	# scored_chunks = []
	# for i, score in enumerate(scores):
	# scored_chunks.append({
	# "score": score,
	# "text": chunked_corpus_data[i]["text"],
	# "original_doc_idx": chunked_corpus_data[i]["original_doc_idx"]
	# })

	# # Sort by decreasing score
	# scored_chunks = sorted(scored_chunks, key=lambda x: x["score"], reverse=True)

	# results = []
	# for item in scored_chunks[:top_k]:
	# # Return the original document text, not just the chunk
	# original_doc_text = original_corpus[item["original_doc_idx"]]
	# results.append((item["score"], original_doc_text))

	# end_time = time.time()

	# return results, f"{(end_time - start_time) * 1000:.2f} ms"

	# with torch.no_grad():
	# encoded_corpus = tokenizer(corpus, padding=True, truncation=True, return_tensors='pt')
	# corpus_embeddings = model(**encoded_corpus).last_hidden_state.mean(dim=1)

	def find_similar(prompt, top_k):
	start_time = time.time()

	prompt_embedding = model.encode(prompt)
	scores = util.dot_score(prompt_embedding, corpus_embeddings)[0].cpu().tolist()
	doc_score_pairs = list(zip(original_corpus, scores))

	#Sort by decreasing score
	doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)

	# with torch.no_grad():
	# encoded_prompt = tokenizer(prompt, padding=True, truncation=True, return_tensors='pt')
	# prompt_embedding = model(**encoded_prompt).last_hidden_state.mean(dim=1)

	# cos_scores = torch.nn.functional.cosine_similarity(prompt_embedding, corpus_embeddings, dim=1)
	# top_results = torch.topk(cos_scores, k=int(top_k))

	end_time = time.time()

	results = []
	for doc, score in doc_score_pairs[:top_k]:
	# for doc, score in doc_score_pairs:
	results.append((score, doc))

	return results, f"{(end_time - start_time) * 1000:.2f} ms"

	iface = gr.Interface(
	fn=find_similar,
	inputs=[
	gr.Dropdown(
	["Alice sees White rabbit for the first time", "Alice meets caterpillar", "sad turtle story"],
	label="Select a prompt or type your own",
	allow_custom_value=True
	),
	gr.Slider(1, 20, value=5, step=1, label="Top K")
	],
	outputs=[
	gr.Dataframe(headers=[ "Score", "Response"]),
	gr.Textbox(label="Time Taken")
	],
	title="RSFT Alice Embeddings (Transformers)",
	description=f"Enter a prompt to find similar sentences from the corpus."
	)

	if __name__ == "__main__":
	iface.launch()