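# NLPEngine: a thin wrapper around several Hugging Face pipelines (sentiment
# analysis, summarization, NER, question answering, text generation) plus a
# SentenceTransformer model for sentence embeddings used in semantic search.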
import torch
from transformers import pipeline
from sentence_transformers import SentenceTransformer


class NLPEngine:
    def __init__(self, device=-1):
        # transformers pipelines accept -1 for CPU, a CUDA device index (e.g. 0), or the string 'mps'.
        device_name = 'cuda' if isinstance(device, int) and device >= 0 else 'mps' if device == 'mps' else 'cpu'
        print(f"Initializing NLPEngine on device: {device_name}")
        self.sentiment = pipeline(
            'sentiment-analysis',
            model='distilbert-base-uncased-finetuned-sst-2-english',
            device=device
        )
        self.summarizer = pipeline(
            'summarization',
            model='facebook/bart-large-cnn',
            device=device
        )
        self.ner = pipeline(
            'ner',
            model='dslim/bert-base-NER',
            aggregation_strategy='simple',
            device=device
        )
        self.qa = pipeline(
            'question-answering',
            model='deepset/roberta-base-squad2',
            device=device
        )
        self.generator = pipeline(
            'text-generation',
            model='gpt2',
            device=device
        )
        ## For initial tests of semantic search
        # self.retriever = pipeline(
        #     'feature-extraction',
        #     model='sentence-transformers/all-MiniLM-L6-v2',
        #     device=device
        # )
        self.sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        print("NLPEngine initialized successfully.")

    def analyze_sentiment(self, text):
        return self.sentiment(text)

    def summarize_text(self, text, max_length=150, min_length=30):
        return self.summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)

    def extract_entities(self, text):
        return self.ner(text)

    def answer_question(self, question, context):
        return self.qa(question=question, context=context)

    def generate_text(self, prompt, max_length=50, num_return_sequences=1):
        return self.generator(prompt, max_length=max_length, num_return_sequences=num_return_sequences)

    def get_embeddings(self, text_or_texts):
        ## For initial tests of semantic search
        ## I tried to manually calculate and adjust the embedding sizes, until I switched to using the SentenceTransformer from HF directly
        # embeddings = self.retriever(text_or_texts)
        # if isinstance(text_or_texts, str):
        #     return torch.mean(torch.tensor(embeddings[0]), dim=0)
        # else:
        #     return torch.stack([torch.mean(torch.tensor(emb), dim=0) for emb in embeddings])
        return torch.tensor(self.sentence_model.encode(text_or_texts))
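
    # A minimal convenience sketch, not part of the original class: it ranks corpus
    # sentences against a query by cosine similarity over get_embeddings() output.
    # The method name and (index, score) return format are illustrative assumptions.
    def semantic_search(self, query, corpus):
        query_embedding = self.get_embeddings(query).unsqueeze(0)  # shape (1, dim)
        corpus_embeddings = self.get_embeddings(corpus)            # shape (n, dim)
        similarities = torch.nn.functional.cosine_similarity(
            query_embedding, corpus_embeddings, dim=1
        )
        # Return (corpus index, score) pairs, most similar first.
        return sorted(enumerate(similarities.tolist()), key=lambda pair: pair[1], reverse=True)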


if __name__ == "__main__":
    pass
    # Uncomment the code below to run this as a standalone script and test how each pipeline works from your terminal
    # =================================
    # # Check for available hardware acceleration
    # if torch.cuda.is_available():
    #     selected_device = 0  # CUDA
    # elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    #     selected_device = 'mps'  # MPS for Apple Silicon
    # else:
    #     selected_device = -1  # CPU
    # print("Starting NLPEngine tests...")
    # engine = NLPEngine(device=selected_device)
    # sample_text_sentiment = "Hugging Face is a great platform for NLP."
    # print(f"\nSentiment for '{sample_text_sentiment}': {engine.analyze_sentiment(sample_text_sentiment)}")
    # sample_text_summarize = """
    # The Hugging Face ecosystem provides a wide array of tools and models for natural language processing.
    # It includes transformers for state-of-the-art models, datasets for accessing and sharing data,
    # and a model hub for discovering and using pre-trained models. Developers can leverage these
    # resources to build powerful NLP applications with relative ease. The platform also supports
    # various tasks such as text classification, summarization, translation, and question answering.
    # The quick brown fox jumps over the lazy dog. This sentence is repeated multiple times to ensure
    # the text is long enough for summarization to be meaningful. The quick brown fox jumps over the lazy dog.
    # The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.
    # """
    # print(f"\nSummary: {engine.summarize_text(sample_text_summarize, min_length=20, max_length=50)}")
    # sample_text_ner = "Apple Inc. is looking at buying U.K. startup for $1 billion. Tim Cook is the CEO. The meeting is in New York."
    # print(f"\nNER for '{sample_text_ner}': {engine.extract_entities(sample_text_ner)}")
    # sample_context_qa = "The capital of France is Paris. It is known for the Eiffel Tower and the Louvre Museum."
    # sample_question_qa = "What is Paris known for?"
    # print(f"\nQA for context '{sample_context_qa}' and question '{sample_question_qa}': {engine.answer_question(question=sample_question_qa, context=sample_context_qa)}")
    # sample_prompt_generate = "In a world powered by AI,"
    # print(f"\nGenerated Text from prompt '{sample_prompt_generate}': {engine.generate_text(sample_prompt_generate, max_length=30)}")
    # # sample_text_retriever1 = "This is a test sentence for semantic search."
    # # sample_text_retriever2 = "Another sentence to compare for similarity."
    # # embedding1 = engine.get_embeddings(sample_text_retriever1)
    # # embedding2 = engine.get_embeddings(sample_text_retriever2)
    # # print(f"\nEmbedding shape for a single sentence: {embedding1.shape}")
    # corpus = ["The weather is sunny today.", "I enjoy walking in the park on a beautiful day.", "AI is transforming many industries."]
    # query = "What is the forecast for today?"
    # query_embedding = engine.get_embeddings(query)
    # corpus_embeddings = engine.get_embeddings(corpus)
    # print(f"Query embedding shape: {query_embedding.shape}")
    # print(f"Corpus embeddings shape: {corpus_embeddings.shape}")
    # if query_embedding.ndim == 1:
    #     query_embedding = query_embedding.unsqueeze(0)
    # similarities = torch.nn.functional.cosine_similarity(query_embedding, corpus_embeddings, dim=1)
    # print(f"\nSimilarities between '{query}' and corpus sentences: {similarities.tolist()}")
# print("\nNLPEngine tests completed.") |