import torch
from transformers import pipeline
from sentence_transformers import SentenceTransformer


class NLPEngine:
    """Wraps a set of Hugging Face pipelines plus a SentenceTransformer for embeddings."""

    def __init__(self, device=-1):
        # Device convention follows transformers: 0 = first CUDA GPU, 'mps' = Apple Silicon, -1 = CPU
        device_name = 'cuda' if device == 0 else 'mps' if device == 'mps' else 'cpu'
        print(f"Initializing NLPEngine on device: {device_name}")
        self.sentiment = pipeline(
            'sentiment-analysis',
            model='distilbert-base-uncased-finetuned-sst-2-english',
            device=device
        )
        self.summarizer = pipeline(
            'summarization',
            model='facebook/bart-large-cnn',
            device=device
        )
        self.ner = pipeline(
            'ner',
            model='dslim/bert-base-NER',
            aggregation_strategy='simple',  # merge sub-word tokens into whole entities
            device=device
        )
        self.qa = pipeline(
            'question-answering',
            model='deepset/roberta-base-squad2',
            device=device
        )
        self.generator = pipeline(
            'text-generation',
            model='gpt2',
            device=device
        )
        ## For initial tests of semantic search
        # self.retriever = pipeline(
        #     'feature-extraction',
        #     model='sentence-transformers/all-MiniLM-L6-v2',
        #     device=device
        # )
        self.sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        print("NLPEngine initialized successfully.")
    def analyze_sentiment(self, text):
        return self.sentiment(text)

    def summarize_text(self, text, max_length=150, min_length=30):
        return self.summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)

    def extract_entities(self, text):
        return self.ner(text)

    def answer_question(self, question, context):
        return self.qa(question=question, context=context)

    def generate_text(self, prompt, max_length=50, num_return_sequences=1):
        return self.generator(prompt, max_length=max_length, num_return_sequences=num_return_sequences)

    def get_embeddings(self, text_or_texts):
        ## For initial tests of semantic search, I mean-pooled the token-level
        ## feature-extraction outputs by hand, until I switched to the
        ## SentenceTransformer from HF, which handles pooling internally.
        # embeddings = self.retriever(text_or_texts)
        # if isinstance(text_or_texts, str):
        #     return torch.mean(torch.tensor(embeddings[0]), dim=0)
        # else:
        #     return torch.stack([torch.mean(torch.tensor(emb), dim=0) for emb in embeddings])
        # encode() returns a NumPy array: (dim,) for a single string, (n, dim) for a list
        return torch.tensor(self.sentence_model.encode(text_or_texts))
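

# A hypothetical convenience wrapper, not part of the original engine: a minimal
# sketch of how get_embeddings() could back a semantic-search ranking, running
# the same cosine-similarity steps the commented-out test block below does inline.
def rank_by_similarity(engine, query, corpus):
    query_embedding = engine.get_embeddings(query).unsqueeze(0)  # (1, dim)
    corpus_embeddings = engine.get_embeddings(corpus)            # (n, dim)
    scores = torch.nn.functional.cosine_similarity(query_embedding, corpus_embeddings, dim=1)
    # Pair each sentence with its score, highest similarity first
    return sorted(zip(scores.tolist(), corpus), reverse=True)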


if __name__ == "__main__":
    pass
    # Uncomment the code below to run this file as a standalone script and
    # test each pipeline from your terminal.
    # =================================
    # # Check for available hardware acceleration
    # if torch.cuda.is_available():
    #     selected_device = 0  # CUDA
    # elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    #     selected_device = 'mps'  # MPS for Apple Silicon
    # else:
    #     selected_device = -1  # CPU
    # print("Starting NLPEngine tests...")
    # engine = NLPEngine(device=selected_device)
    # sample_text_sentiment = "Hugging Face is a great platform for NLP."
    # print(f"\nSentiment for '{sample_text_sentiment}': {engine.analyze_sentiment(sample_text_sentiment)}")
    # sample_text_summarize = """
    # The Hugging Face ecosystem provides a wide array of tools and models for natural language processing.
    # It includes transformers for state-of-the-art models, datasets for accessing and sharing data,
    # and a model hub for discovering and using pre-trained models. Developers can leverage these
    # resources to build powerful NLP applications with relative ease. The platform also supports
    # various tasks such as text classification, summarization, translation, and question answering.
    # The quick brown fox jumps over the lazy dog. This sentence is repeated multiple times to ensure
    # the text is long enough for summarization to be meaningful. The quick brown fox jumps over the lazy dog.
    # The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.
    # """
    # print(f"\nSummary: {engine.summarize_text(sample_text_summarize, min_length=20, max_length=50)}")
    # sample_text_ner = "Apple Inc. is looking at buying U.K. startup for $1 billion. Tim Cook is the CEO. The meeting is in New York."
    # print(f"\nNER for '{sample_text_ner}': {engine.extract_entities(sample_text_ner)}")
    # sample_context_qa = "The capital of France is Paris. It is known for the Eiffel Tower and the Louvre Museum."
    # sample_question_qa = "What is Paris known for?"
    # print(f"\nQA for context '{sample_context_qa}' and question '{sample_question_qa}': {engine.answer_question(question=sample_question_qa, context=sample_context_qa)}")
    # sample_prompt_generate = "In a world powered by AI,"
    # print(f"\nGenerated Text from prompt '{sample_prompt_generate}': {engine.generate_text(sample_prompt_generate, max_length=30)}")
    # # sample_text_retriever1 = "This is a test sentence for semantic search."
    # # sample_text_retriever2 = "Another sentence to compare for similarity."
    # # embedding1 = engine.get_embeddings(sample_text_retriever1)
    # # embedding2 = engine.get_embeddings(sample_text_retriever2)
    # # print(f"\nEmbedding shape for a single sentence: {embedding1.shape}")
    # corpus = ["The weather is sunny today.", "I enjoy walking in the park on a beautiful day.", "AI is transforming many industries."]
    # query = "What is the forecast for today?"
    # query_embedding = engine.get_embeddings(query)
    # corpus_embeddings = engine.get_embeddings(corpus)
    # print(f"Query embedding shape: {query_embedding.shape}")
    # print(f"Corpus embeddings shape: {corpus_embeddings.shape}")
    # if query_embedding.ndim == 1:
    #     query_embedding = query_embedding.unsqueeze(0)
    # similarities = torch.nn.functional.cosine_similarity(query_embedding, corpus_embeddings, dim=1)
    # print(f"\nSimilarities between '{query}' and corpus sentences: {similarities.tolist()}")
    # print("\nNLPEngine tests completed.")