Spaces:
Sleeping
Sleeping
\ | |
import pandas as pd | |
import re, unicodedata | |
from html import unescape | |
# --- Filter configuration ---------------------------------------------------
# Inclusive bounds on the character count of a cleaned tagline.
MIN_LEN = 20
MAX_LEN = 60

# When True, taglines containing any non-ASCII character are discarded.
KEEP_ASCII_ONLY = False

# Minimum fraction of alphabetic characters (out of all characters) required.
MIN_ALPHA_RATIO = 0.60

# When True, taglines whose letters are mostly uppercase are discarded.
DROP_IF_ALL_CAPS = False

# Lower-case marketing phrases; a tagline containing any of these substrings
# is rejected. Hyphenated and spaced spellings are listed separately.
BUZZY = {
    "synergy",
    "cutting edge",
    "cutting-edge",
    "best in class",
    "best-in-class",
    "world class",
    "world-class",
    "state of the art",
    "state-of-the-art",
    "revolutionary",
    "disruptive platform",
    "next generation",
    "next-gen",
    "leading provider",
    "scalable solution",
}
# --- Pre-compiled patterns shared by the cleaning helpers --------------------
# An http(s):// or www. prefix followed by non-space characters.
URL_RE = re.compile(r"(https?://|www\.)\S+", re.IGNORECASE)
# A conventional local@domain.tld e-mail address.
EMAIL_RE = re.compile(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", re.IGNORECASE)
# A digit, at least six digits/dashes/spaces/parentheses, then a closing digit.
PHONE_RE = re.compile(r"(\+?\d[\d\-\s()]{6,}\d)")
# One or more whitespace characters.
WS_RE = re.compile(r"\s+")
# One or more characters that are neither word characters nor whitespace.
PUNCT_RE = re.compile(r"[^\w\s]+")
# Registered, copyright and trademark symbols.
TM_RE = re.compile(r"[®©™]")
def _nfkc(s):
    """Return *s* converted to Unicode NFKC normal form."""
    normalized = unicodedata.normalize("NFKC", s)
    return normalized
def _clean_text(s: str) -> str:
    """Normalise one raw text cell into a single-line, trimmed string.

    Steps, in order: coerce to ``str`` (missing values become ``""``),
    decode HTML entities, strip the symbols matched by ``TM_RE``, apply
    NFKC normalisation, then collapse every run of whitespace (newlines
    included) into one space and trim both ends.

    Args:
        s: Raw cell value. Non-string scalars are stringified; ``None`` and
            pandas missing markers (NaN, ``pd.NA``, ``NaT``) count as empty.

    Returns:
        The cleaned text, possibly the empty string.
    """
    if not isinstance(s, str):
        # Missing cells reach us from pandas as float NaN / pd.NA rather than
        # None; str() would turn them into the literal text "nan" / "<NA>".
        if s is None or (pd.api.types.is_scalar(s) and pd.isna(s)):
            return ""
        s = str(s)
    s = unescape(s)
    # Strip the symbols BEFORE NFKC: NFKC rewrites the trademark sign as the
    # two letters "TM", which TM_RE could no longer remove afterwards.
    s = TM_RE.sub("", s)
    s = _nfkc(s)
    # WS_RE matches "\n" and "\r" as well, so line breaks need no extra step.
    return WS_RE.sub(" ", s).strip()
def _alpha_ratio(s: str) -> float:
    """Return the share of characters in *s* that are alphabetic.

    The denominator is the full length of *s* (spaces, digits and
    punctuation included). An empty value yields ``0.0``.
    """
    if not s:
        return 0.0
    alpha_count = 0
    for ch in s:
        if ch.isalpha():
            alpha_count += 1
    return alpha_count / len(s)
def _looks_shouty(s: str) -> bool:
    """Return True when at least 85% of the letters in *s* are uppercase.

    Non-alphabetic characters are ignored; text without any letters is
    never considered shouty.
    """
    letter_count = 0
    upper_count = 0
    for ch in s:
        if not ch.isalpha():
            continue
        letter_count += 1
        if ch.isupper():
            upper_count += 1
    if letter_count == 0:
        return False
    return upper_count / letter_count >= 0.85
def _contains_buzzy(s: str) -> bool:
    """Return True when the lower-cased text contains any phrase in BUZZY.

    Matching is a plain substring test, so a phrase also matches inside a
    longer word.
    """
    lowered = s.lower()
    for phrase in BUZZY:
        if phrase in lowered:
            return True
    return False
def _has_junk(s: str) -> bool:
    """Return True when *s* contains a URL, an e-mail address or a phone number."""
    for pattern in (URL_RE, EMAIL_RE, PHONE_RE):
        if pattern.search(s):
            return True
    return False
def _ascii_only(s: str) -> bool:
    """Return True when every character of *s* is ASCII.

    The empty string counts as ASCII. Non-string values return False, as
    they did when the check relied on a failing ``encode`` call.
    """
    # str.isascii() answers the question directly, without using a broad
    # ``except Exception`` around an encode attempt as control flow.
    return isinstance(s, str) and s.isascii()
def _dupe_key(s: str) -> str:
    """Build the normalised key used to detect duplicate taglines.

    Lower-cases the text, replaces each run of punctuation (anything that is
    neither a word character nor whitespace) with a space, then collapses
    whitespace runs to single spaces and trims both ends.
    """
    depunctuated = PUNCT_RE.sub(" ", s.lower())
    return WS_RE.sub(" ", depunctuated).strip()
def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Clean, filter and de-duplicate a frame of taglines.

    The input is copied, never modified. Processing order:

    1. Add a ``description`` column (copied from ``tagline``) if absent.
    2. Clean both text columns with ``_clean_text``.
    3. Drop rows whose tagline is empty.
    4. Drop rows whose tagline or description contains a URL, e-mail
       address or phone number.
    5. Optionally drop non-ASCII taglines (``KEEP_ASCII_ONLY``).
    6. Drop taglines below ``MIN_ALPHA_RATIO`` or outside
       ``MIN_LEN``..``MAX_LEN`` (inclusive).
    7. Optionally drop mostly-uppercase taglines (``DROP_IF_ALL_CAPS``).
    8. Drop taglines containing a ``BUZZY`` phrase.
    9. Keep the first row of each duplicate group (per ``_dupe_key``) and
       reset the index.
    10. Fill empty descriptions with the tagline.

    Args:
        df: Frame with at least a ``tagline`` column.

    Returns:
        A new frame with a fresh 0..n-1 index; it may be empty.

    Raises:
        ValueError: If ``df`` has no ``tagline`` column.
    """
    if "tagline" not in df.columns:
        raise ValueError("Input must contain a 'tagline' column.")
    df = df.copy()
    if "description" not in df.columns:
        df["description"] = df["tagline"]
    df["tagline"] = df["tagline"].map(_clean_text)
    df["description"] = df["description"].map(_clean_text)
    df = df[df["tagline"].str.len() > 0]

    # Every predicate mask below is cast with .astype(bool). On an empty
    # Series, map() returns object dtype, which pandas does not treat as a
    # boolean mask: df[mask] would then select zero *columns* and the next
    # df["tagline"] lookup would raise KeyError once all rows are filtered out.
    junk = (
        df["tagline"].map(_has_junk).astype(bool)
        | df["description"].map(_has_junk).astype(bool)
    )
    df = df[~junk]
    if KEEP_ASCII_ONLY:
        df = df[df["tagline"].map(_ascii_only).astype(bool)]
    df = df[df["tagline"].map(_alpha_ratio) >= MIN_ALPHA_RATIO]
    df = df[df["tagline"].str.len().between(MIN_LEN, MAX_LEN)]
    if DROP_IF_ALL_CAPS:
        df = df[~df["tagline"].map(_looks_shouty).astype(bool)]
    df = df[~df["tagline"].map(_contains_buzzy).astype(bool)]

    # duplicated() marks every occurrence after the first, so the earliest
    # row of each duplicate group survives.
    key = df["tagline"].map(_dupe_key)
    df = df.loc[~key.duplicated()].reset_index(drop=True)

    # A description that cleaned down to nothing falls back to the tagline.
    empty_desc = df["description"].str.len() == 0
    df.loc[empty_desc, "description"] = df.loc[empty_desc, "tagline"]
    return df