import torch
from transformers import pipeline
from sentence_transformers import SentenceTransformer


class NLPEngine:
    def __init__(self, device=-1):
        # transformers pipeline convention: a non-negative int selects a CUDA
        # device index, the string 'mps' selects Apple Silicon, -1 means CPU.
        if device == 'mps':
            device_name = 'mps'
        elif isinstance(device, int) and device >= 0:
            device_name = 'cuda'
        else:
            device_name = 'cpu'
        print(f"Initializing NLPEngine on device: {device_name}")

        self.sentiment = pipeline(
            'sentiment-analysis',
            model='distilbert-base-uncased-finetuned-sst-2-english',
            device=device
        )
        self.summarizer = pipeline(
            'summarization',
            model='facebook/bart-large-cnn',
            device=device
        )
        self.ner = pipeline(
            'ner',
            model='dslim/bert-base-NER',
            aggregation_strategy='simple',
            device=device
        )
        self.qa = pipeline(
            'question-answering',
            model='deepset/roberta-base-squad2',
            device=device
        )
        self.generator = pipeline(
            'text-generation',
            model='gpt2',
            device=device
        )

        ## For initial tests of semantic search
        # self.retriever = pipeline(
        #     'feature-extraction',
        #     model='sentence-transformers/all-MiniLM-L6-v2',
        #     device=device
        # )
        self.sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        print("NLPEngine initialized successfully.")

    def analyze_sentiment(self, text):
        """Classify `text` as POSITIVE or NEGATIVE with a confidence score."""
        return self.sentiment(text)

    def summarize_text(self, text, max_length=150, min_length=30):
        """Return an abstractive summary of `text`; lengths are in tokens."""
        return self.summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)

    def extract_entities(self, text):
        """Extract named entities (persons, organizations, locations, misc.)."""
        return self.ner(text)

    def answer_question(self, question, context):
        """Extract the span of `context` that best answers `question`."""
        return self.qa(question=question, context=context)

    def generate_text(self, prompt, max_length=50, num_return_sequences=1):
        """Continue `prompt`; note that `max_length` includes the prompt tokens."""
        return self.generator(prompt, max_length=max_length, num_return_sequences=num_return_sequences)

    def get_embeddings(self, text_or_texts):
        """Embed a single string or a list of strings into dense vectors."""
        ## For initial tests of semantic search.
        ## I initially mean-pooled the token embeddings manually and adjusted the
        ## sizes by hand, then switched to the SentenceTransformer model from the
        ## HF Hub, which handles pooling itself.
        # embeddings = self.retriever(text_or_texts)
        # if isinstance(text_or_texts, str):
        #     return torch.mean(torch.tensor(embeddings[0]), dim=0)
        # else:
        #     return torch.stack([torch.mean(torch.tensor(emb), dim=0) for emb in embeddings])
        # `encode` returns a NumPy array; convert it to a torch tensor.
        return torch.tensor(self.sentence_model.encode(text_or_texts))


if __name__ == "__main__":
    pass
    # Uncomment the code below to run this file as a standalone script and test
    # each pipeline from your terminal.
    # =================================
    # # Check for available hardware acceleration
    # if torch.cuda.is_available():
    #     selected_device = 0  # CUDA
    # elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    #     selected_device = 'mps'  # MPS for Apple Silicon
    # else:
    #     selected_device = -1  # CPU

    # print("Starting NLPEngine tests...")
    # engine = NLPEngine(device=selected_device)

    # sample_text_sentiment = "Hugging Face is a great platform for NLP."
    # print(f"\nSentiment for '{sample_text_sentiment}': {engine.analyze_sentiment(sample_text_sentiment)}")

    # sample_text_summarize = """
    # The Hugging Face ecosystem provides a wide array of tools and models for natural language processing.
    # It includes transformers for state-of-the-art models, datasets for accessing and sharing data,
    # and a model hub for discovering and using pre-trained models. Developers can leverage these
    # resources to build powerful NLP applications with relative ease. The platform also supports
    # various tasks such as text classification, summarization, translation, and question answering.
    # The quick brown fox jumps over the lazy dog. This sentence is repeated multiple times to ensure
    # the text is long enough for summarization to be meaningful. The quick brown fox jumps over the lazy dog.
    # The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.
    # """
    # print(f"\nSummary: {engine.summarize_text(sample_text_summarize, min_length=20, max_length=50)}")

    # sample_text_ner = "Apple Inc. is looking at buying U.K. startup for $1 billion. Tim Cook is the CEO. The meeting is in New York."
    # print(f"\nNER for '{sample_text_ner}': {engine.extract_entities(sample_text_ner)}")

    # sample_context_qa = "The capital of France is Paris. It is known for the Eiffel Tower and the Louvre Museum."
    # sample_question_qa = "What is Paris known for?"
    # print(f"\nQA for context '{sample_context_qa}' and question '{sample_question_qa}': {engine.answer_question(question=sample_question_qa, context=sample_context_qa)}")

    # sample_prompt_generate = "In a world powered by AI,"
    # print(f"\nGenerated Text from prompt '{sample_prompt_generate}': {engine.generate_text(sample_prompt_generate, max_length=30)}")

    # # sample_text_retriever1 = "This is a test sentence for semantic search."
    # # sample_text_retriever2 = "Another sentence to compare for similarity."
    # # embedding1 = engine.get_embeddings(sample_text_retriever1)
    # # embedding2 = engine.get_embeddings(sample_text_retriever2)
    # # print(f"\nEmbedding shape for a single sentence: {embedding1.shape}")

    # corpus = ["The weather is sunny today.", "I enjoy walking in the park on a beautiful day.", "AI is transforming many industries."]
    # query = "What is the forecast for today?"
    # query_embedding = engine.get_embeddings(query)
    # corpus_embeddings = engine.get_embeddings(corpus)
    # print(f"Query embedding shape: {query_embedding.shape}")
    # print(f"Corpus embeddings shape: {corpus_embeddings.shape}")
    # if query_embedding.ndim == 1:
    #     query_embedding = query_embedding.unsqueeze(0)
    # similarities = torch.nn.functional.cosine_similarity(query_embedding, corpus_embeddings, dim=1)
    # print(f"\nSimilarities between '{query}' and corpus sentences: {similarities.tolist()}")

    # print("\nNLPEngine tests completed.")
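
    # A possible extension of the semantic-search test above (an editor's sketch,
    # not part of the original tests): once `similarities` is computed, rank the
    # corpus sentences and print the best matches with torch.topk. Uncomment
    # together with the corpus/query test code above to run it.
    # top_k = min(2, len(corpus))
    # top_scores, top_indices = torch.topk(similarities, k=top_k)
    # print(f"\nTop {top_k} matches for '{query}':")
    # for score, idx in zip(top_scores.tolist(), top_indices.tolist()):
    #     print(f"  {score:.3f}  {corpus[idx]}")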