Spaces:
Running
Running
import re | |
import fitz | |
import numpy as np | |
from sklearn.metrics.pairwise import cosine_similarity | |
from sentence_transformers import SentenceTransformer | |
import spacy | |
# Shared pipelines, created once when the module is imported so that every
# call below reuses the same loaded models.
_SPACY_PIPELINE_NAME = "en_core_web_sm"
_SENTENCE_ENCODER_NAME = 'all-MiniLM-L6-v2'

nlp = spacy.load(_SPACY_PIPELINE_NAME)  # English NLP pipeline (named entities)
model = SentenceTransformer(_SENTENCE_ENCODER_NAME)  # sentence embedding model
def extract_text(file):
    """Return the text content of an uploaded file.

    Files whose name ends in ``.pdf`` (case-insensitive) are opened with
    PyMuPDF and the text of every page is joined with newlines. Any other
    file is read and treated as UTF-8 text.

    Args:
        file: File-like object exposing a ``name`` attribute and ``read()``.

    Returns:
        The extracted text as ``str``.

    Raises:
        UnicodeDecodeError: If a non-PDF file contains bytes that are not
            valid UTF-8.
    """
    # Compare case-insensitively so "REPORT.PDF" is not decoded as plain text.
    if file.name.lower().endswith(".pdf"):
        # The context manager closes the document even when a page fails to
        # parse, so the in-memory PDF is not leaked.
        with fitz.open(stream=file.read(), filetype="pdf") as doc:
            return "\n".join(page.get_text() for page in doc)

    data = file.read()
    # Text-mode streams already yield ``str``; only bytes need decoding.
    return data if isinstance(data, str) else data.decode("utf-8")
# spaCy entity label -> placeholder written into the anonymized text.
_ENTITY_PLACEHOLDERS = {
    "PERSON": "PERSON",
    "DATE": "DATE",
    "GPE": "LOCATION",
    "LOC": "LOCATION",
    "ORG": "ORG",
}

# Regex fallbacks for identifiers the entity recognizer is not asked to tag.
# Compiled once at import time instead of on every call.
_REGEX_PLACEHOLDERS = [
    (re.compile(r"\b[\w\.-]+@[\w\.-]+\.\w+\b"), "EMAIL"),  # Email
    (re.compile(r"https?://\S+|www\.\S+"), "URL"),  # URLs
    (re.compile(r"\b\d{10}\b"), "PHONE"),  # 10-digit phone numbers
    (re.compile(r"\b[A-Z]{2,}\d{6,}\b"), "ID"),  # Generic IDs (e.g., AA123456)
]


def _merge_overlapping_spans(spans):
    """Collapse overlapping ``(start, end, label)`` spans into disjoint ones.

    Spans are ordered by start offset, longest first on ties. A span that
    overlaps the previously kept one is absorbed into it: the kept span is
    extended to cover both and keeps its own label, so no sensitive character
    is left outside a placeholder.
    """
    merged = []
    for start, end, label in sorted(spans, key=lambda s: (s[0], -s[1])):
        if merged and start < merged[-1][1]:
            kept_start, kept_end, kept_label = merged[-1]
            merged[-1] = (kept_start, max(kept_end, end), kept_label)
        else:
            merged.append((start, end, label))
    return merged


def anonymize_text(text, *, nlp_model=None):
    """Replace personal identifiers in *text* with bracketed placeholders.

    Named entities (PERSON, DATE, GPE/LOC, ORG) come from the NLP pipeline;
    e-mail addresses, URLs, 10-digit phone numbers and generic IDs are found
    with regular expressions. Each hit is replaced by ``[LABEL]``.

    Overlapping hits (for example an entity detected inside an e-mail
    address) are merged before replacement. Replacing them one by one with
    offsets computed on the original text would corrupt the output.

    Args:
        text: The text to anonymize.
        nlp_model: Optional callable that takes the text and returns an object
            whose ``ents`` have ``start_char``, ``end_char`` and ``label_``.
            Defaults to the module-level ``nlp`` pipeline.

    Returns:
        The anonymized text.
    """
    if not text:
        return text

    pipeline = nlp if nlp_model is None else nlp_model
    doc = pipeline(text)

    # Entities detected by the pipeline, restricted to the labels we mask.
    spans = [
        (ent.start_char, ent.end_char, _ENTITY_PLACEHOLDERS[ent.label_])
        for ent in doc.ents
        if ent.label_ in _ENTITY_PLACEHOLDERS
    ]

    # Regex-based matches for things the pipeline misses.
    for pattern, label in _REGEX_PLACEHOLDERS:
        spans.extend(
            (match.start(), match.end(), label)
            for match in pattern.finditer(text)
        )

    # Rebuild the text in a single left-to-right pass over disjoint spans.
    pieces = []
    cursor = 0
    for start, end, label in _merge_overlapping_spans(spans):
        pieces.append(text[cursor:start])
        pieces.append(f"[{label}]")  # Brackets make placeholders easy to spot
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces)
def score_synopsis(article, synopsis):
    """Score how well *synopsis* summarizes *article* on a 0-100 scale.

    The total is the sum of three heuristic components:

    * ``content_coverage`` (0-50): cosine similarity between the sentence
      embeddings of the article and the synopsis, clamped to ``[0, 1]``.
    * ``clarity`` (0-25): ratio of unique words to total words.
    * ``coherence`` (0-25): five points per period, capped at 25.

    A synopsis with no words scores zero on every component.

    Args:
        article: Full article text.
        synopsis: Candidate summary text.

    Returns:
        A dict with keys ``total``, ``content_coverage``, ``clarity`` and
        ``coherence``; values are plain ``float`` rounded to two decimals.
    """
    words = synopsis.split()
    if not words:
        # Nothing was written, so skip the encoder: the embedding of an empty
        # string would otherwise still earn coverage points.
        return {
            "total": 0.0,
            "content_coverage": 0.0,
            "clarity": 0.0,
            "coherence": 0.0,
        }

    embeddings = model.encode([article, synopsis])
    similarity = float(cosine_similarity([embeddings[0]], [embeddings[1]])[0][0])
    # Cosine similarity can be negative (or overshoot 1 by rounding error);
    # clamp it so the component stays inside its 0-50 range.
    similarity = min(1.0, max(0.0, similarity))

    content_coverage = similarity * 50
    clarity = len(set(words)) / len(words) * 25
    # Same count as len(synopsis.split(".")) - 1: the number of periods.
    coherence = min(25, 5 * synopsis.count("."))
    total = content_coverage + clarity + coherence

    # Convert to built-in floats so the result is JSON-serializable.
    return {
        "total": round(float(total), 2),
        "content_coverage": round(float(content_coverage), 2),
        "clarity": round(float(clarity), 2),
        "coherence": round(float(coherence), 2),
    }