# File size: 6,292 Bytes
# 3ab6535
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import torch
from transformers import pipeline
from sentence_transformers import SentenceTransformer

class NLPEngine:
    """Collection of Hugging Face pipelines plus a sentence-embedding model.

    Each public method is a thin wrapper that forwards its arguments to one
    of the models loaded in ``__init__`` and returns that model's result.
    """

    def __init__(self, device=-1):
        """Load every model on the requested device.

        Args:
            device: Device spec forwarded unchanged to each ``pipeline`` call.
                ``-1`` means CPU, a non-negative int is treated as a CUDA
                ordinal, and a string such as ``'mps'`` is used as given.
                ``None`` leaves device selection to the libraries.
        """
        device_name = self._resolve_device(device)
        print(f"Initializing NLPEngine on device: {device_name or 'auto'}")

        self.sentiment = pipeline(
            'sentiment-analysis',
            model='distilbert-base-uncased-finetuned-sst-2-english',
            device=device,
        )
        self.summarizer = pipeline(
            'summarization',
            model='facebook/bart-large-cnn',
            device=device,
        )
        self.ner = pipeline(
            'ner',
            model='dslim/bert-base-NER',
            aggregation_strategy='simple',
            device=device,
        )
        self.qa = pipeline(
            'question-answering',
            model='deepset/roberta-base-squad2',
            device=device,
        )
        self.generator = pipeline(
            'text-generation',
            model='gpt2',
            device=device,
        )
        # NOTE: an earlier semantic-search prototype used a 'feature-extraction'
        # pipeline with manual mean pooling; it was replaced by the
        # SentenceTransformer model below.
        # The embedding model gets the same device as the pipelines; without
        # this it would ignore the caller's choice.
        self.sentence_model = SentenceTransformer(
            'sentence-transformers/all-MiniLM-L6-v2',
            device=device_name,
        )

        print("NLPEngine initialized successfully.")

    @staticmethod
    def _resolve_device(device):
        """Translate a pipeline-style device spec into a torch device string.

        Args:
            device: ``None``, an int (negative for CPU, otherwise a CUDA
                ordinal), or anything whose ``str()`` is a device name.

        Returns:
            ``None`` when ``device`` is ``None``; otherwise a string such as
            ``'cpu'``, ``'cuda:0'`` or ``'mps'``.
        """
        if device is None:
            return None
        if isinstance(device, int):
            return f'cuda:{device}' if device >= 0 else 'cpu'
        return str(device)

    def analyze_sentiment(self, text):
        """Run the sentiment pipeline on ``text`` and return its result."""
        return self.sentiment(text)

    def summarize_text(self, text, max_length=150, min_length=30):
        """Summarize ``text`` with sampling disabled.

        Args:
            text: Input passed to the summarization pipeline.
            max_length: Forwarded as the pipeline's ``max_length``.
            min_length: Forwarded as the pipeline's ``min_length``.

        Returns:
            Whatever the summarization pipeline returns.
        """
        return self.summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
        )

    def extract_entities(self, text):
        """Run the NER pipeline on ``text`` and return its result."""
        return self.ner(text)

    def answer_question(self, question, context):
        """Ask the QA pipeline ``question`` about ``context``; return its result."""
        return self.qa(question=question, context=context)

    def generate_text(self, prompt, max_length=50, num_return_sequences=1):
        """Generate text continuing ``prompt``.

        Args:
            prompt: Input passed to the text-generation pipeline.
            max_length: Forwarded as the pipeline's ``max_length``.
            num_return_sequences: Forwarded as ``num_return_sequences``.

        Returns:
            Whatever the text-generation pipeline returns.
        """
        return self.generator(
            prompt,
            max_length=max_length,
            num_return_sequences=num_return_sequences,
        )

    def get_embeddings(self, text_or_texts):
        """Encode a string or a list of strings into a ``torch`` tensor.

        The sentence model's ``encode`` output is wrapped with
        ``torch.tensor``. Presumably a single string yields a 1-D tensor and
        a list yields a 2-D one — confirm against the sentence-transformers
        documentation.
        """
        return torch.tensor(self.sentence_model.encode(text_or_texts))

if __name__ == "__main__":
    # Intentionally a no-op: the demo script below is kept commented out.
    pass

    # To run this module as a standalone script and see in the terminal how each
    # pipeline behaves, uncomment everything below this line.
    # =================================

    # # Check for available hardware acceleration
    # if torch.cuda.is_available():
    #     selected_device = 0  # CUDA
    # elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    #     selected_device = 'mps'  # MPS for Apple Silicon
    # else:
    #     selected_device = -1  # CPU

    # print("Starting NLPEngine tests...")
    # engine = NLPEngine(device=selected_device)

    # sample_text_sentiment = "Hugging Face is a great platform for NLP."
    # print(f"\nSentiment for '{sample_text_sentiment}': {engine.analyze_sentiment(sample_text_sentiment)}")

    # sample_text_summarize = """
    # The Hugging Face ecosystem provides a wide array of tools and models for natural language processing.
    # It includes transformers for state-of-the-art models, datasets for accessing and sharing data,
    # and a model hub for discovering and using pre-trained models. Developers can leverage these
    # resources to build powerful NLP applications with relative ease. The platform also supports
    # various tasks such as text classification, summarization, translation, and question answering.
    # The quick brown fox jumps over the lazy dog. This sentence is repeated multiple times to ensure
    # the text is long enough for summarization to be meaningful. The quick brown fox jumps over the lazy dog.
    # The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.
    # """
    # print(f"\nSummary: {engine.summarize_text(sample_text_summarize, min_length=20, max_length=50)}")

    # sample_text_ner = "Apple Inc. is looking at buying U.K. startup for $1 billion. Tim Cook is the CEO. The meeting is in New York."
    # print(f"\nNER for '{sample_text_ner}': {engine.extract_entities(sample_text_ner)}")

    # sample_context_qa = "The capital of France is Paris. It is known for the Eiffel Tower and the Louvre Museum."
    # sample_question_qa = "What is Paris known for?"
    # print(f"\nQA for context '{sample_context_qa}' and question '{sample_question_qa}': {engine.answer_question(question=sample_question_qa, context=sample_context_qa)}")

    # sample_prompt_generate = "In a world powered by AI,"
    # print(f"\nGenerated Text from prompt '{sample_prompt_generate}': {engine.generate_text(sample_prompt_generate, max_length=30)}")

    # # sample_text_retriever1 = "This is a test sentence for semantic search."
    # # sample_text_retriever2 = "Another sentence to compare for similarity."
    # # embedding1 = engine.get_embeddings(sample_text_retriever1)
    # # embedding2 = engine.get_embeddings(sample_text_retriever2)
    # # print(f"\nEmbedding shape for a single sentence: {embedding1.shape}")

    # corpus = ["The weather is sunny today.", "I enjoy walking in the park on a beautiful day.", "AI is transforming many industries."]
    # query = "What is the forecast for today?"

    # query_embedding = engine.get_embeddings(query)
    # corpus_embeddings = engine.get_embeddings(corpus)

    # print(f"Query embedding shape: {query_embedding.shape}")
    # print(f"Corpus embeddings shape: {corpus_embeddings.shape}")

    # if query_embedding.ndim == 1:
    #     query_embedding = query_embedding.unsqueeze(0)

    # similarities = torch.nn.functional.cosine_similarity(query_embedding, corpus_embeddings, dim=1)
    # print(f"\nSimilarities between '{query}' and corpus sentences: {similarities.tolist()}")

    # print("\nNLPEngine tests completed.")