import os
import torch
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from gliner import GLiNER
import spacy
import re

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Remove stop words, punctuation, and digits; lemmatize what remains
def preprocess_text(text):
    doc = nlp(text)
    filtered_words = [
        token.lemma_               # convert to base form
        for token in doc
        if not token.is_stop       # remove stop words
        and not token.is_punct     # remove punctuation
        and not token.is_digit     # remove numbers
        and len(token.text) > 2    # keep only tokens longer than 2 characters
    ]
    return " ".join(filtered_words)
    
# 📂 Define input/output directories
input_dir = "summaryoutput"  # Folder containing summarized documents
entity_output_dir = "extracted_entities"  # Folder to save extracted entities
os.makedirs(entity_output_dir, exist_ok=True)

# ⚡ Load Named Entity Recognition (NER) model
device = "cuda" if torch.cuda.is_available() else "cpu"  # Use GPU if available
model = GLiNER.from_pretrained("urchade/gliner_medium-v2.1").to(device)

# Define labels for entity extraction
labels = [
    "person", "organization", "location", "date", "document", "event",
    "role", "cryptonym", "operation", "nationality", "contact",
    "SUBJECT", "REFERENCE", "FROM", "TO", "DATE", "REF", "INFO",
]
# Get already processed files
existing_entity_files = set(os.listdir(entity_output_dir))

def extract_entities(text):
    entities = model.predict_entities(text, labels)
    extracted = {}
    regex_patterns = {
        "TO": r"(?i)\bTO[:\s]+([^\n]+)",
        "FROM": r"(?i)\bFROM[:\s]+([^\n]+)",
        "DATE": r"(?i)\bDATE[:\s]+([^\n]+)",
        "REF": r"(?i)\bREF[:\s]+([^\n]+)",
        "SUBJECT": r"(?i)\bSUBJECT[:\s]+([^\n]+)",
    }
    for label, pattern in regex_patterns.items():
        matches = re.findall(pattern, text)
        if matches:
            # Clean up matches
            cleaned_matches = [m.strip().rstrip(')').lstrip(',') for m in matches]
            extracted[label] = list(set(cleaned_matches))  # Remove duplicates
    for entity in entities:
        entity_type = entity["label"]
        entity_text = entity["text"].strip().rstrip(')').lstrip(',')
        if entity_type not in extracted:
            extracted[entity_type] = []
        if entity_text not in extracted[entity_type]:  # Avoid duplicates
            extracted[entity_type].append(entity_text)
    return extracted
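# Illustrative result (which entities are found depends on the GLiNER model;
# the names below are hypothetical):
#   extract_entities("FROM: Station X\nJohn Doe travelled to Vienna.")
#   -> {"FROM": ["Station X"], "person": ["John Doe"], "location": ["Vienna"]}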
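# Minimal sketch of chunked extraction for long documents. GLiNER models have a
# limited input window, so text beyond it may be truncated; the fixed character
# chunk size below is an assumption, not a tuned value.
def extract_entities_chunked(text, chunk_size=2000):
    extracted = {}
    for start in range(0, len(text), chunk_size):
        chunk = text[start:start + chunk_size]
        for label, values in extract_entities(chunk).items():
            merged = extracted.setdefault(label, [])
            for value in values:
                if value not in merged:  # de-duplicate across chunks
                    merged.append(value)
    return extracted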
# Function to generate a word cloud image from a document's text
def generate_word_cloud(text, output_filename):
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)
    filtered_text = preprocess_text(text)
    wordcloud = WordCloud(width=800, height=400, background_color="white").generate(filtered_text)

    # Render and save the word cloud image
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.savefig(output_filename, bbox_inches="tight")
    plt.close()
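# Example usage (hypothetical text and path):
#   generate_word_cloud("text to visualize ...", "extracted_entities/wordcloud_demo.png")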

# Process each document
def extract_entities_from_summaries():
    all_text = ""  # Store all text for a combined word cloud

    for filename in os.listdir(input_dir):
        if filename.endswith(".md"):  # Process Markdown files
            entity_file = f"entities_{filename}"
            word_cloud_file = f"wordcloud_{filename}.png"
            entity_file_path = os.path.join(entity_output_dir, entity_file)
            word_cloud_path = os.path.join(entity_output_dir, word_cloud_file)

            # ⏭ Skip if entity file & word cloud already exist
            if entity_file in existing_entity_files and word_cloud_file in existing_entity_files:
                print(f"⏩ Skipping {filename}, already processed.")
                continue

            file_path = os.path.join(input_dir, filename)

            with open(file_path, "r", encoding="utf-8") as file:
                text = file.read()

            all_text += text + "\n\n"  # Collect text for a combined word cloud

            # Extract entities
            if entity_file not in existing_entity_files:
                entities = extract_entities(text)

                # Save extracted entities to a file, one labelled line per entity type
                with open(entity_file_path, "w", encoding="utf-8") as f:
                    for entity_type, entity_words in entities.items():
                        f.write(f"{entity_type}: " + ", ".join(entity_words) + "\n\n")

                print(f"Extracted entities saved for {filename} -> {entity_file_path}")

            # Generate a word cloud for the document
            if word_cloud_file not in existing_entity_files:
                generate_word_cloud(text, word_cloud_path)
                print(f"🌥 Word cloud saved for {filename} -> {word_cloud_path}")

    # Generate a word cloud for the entire dataset
    combined_word_cloud_path = os.path.join(entity_output_dir, "wordcloud_combined.png")

    if all_text.strip() and "wordcloud_combined.png" not in existing_entity_files:
        generate_word_cloud(all_text, combined_word_cloud_path)
        print(f"🌥 Combined word cloud saved -> {combined_word_cloud_path}")

    print("Entity extraction and word cloud generation completed!")


if __name__ == "__main__":
    extract_entities_from_summaries()
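# To run (hypothetical script name; assumes summaryoutput/ holds the summarized
# Markdown files):
#   python extract_entities.py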