Spaces:
Running
Running
# Names exported by a star-import of this module.
__all__ = [
    "mask_email_udf",
    "mask_name_udf",
    "mask_date_udf",
    "mask_ssn_udf",
    "mask_itin_udf",
    "mask_phone_udf",
]
from pyspark.sql.functions import udf | |
from pyspark.sql.types import StringType | |
import re | |
from datetime import datetime | |
# Masking functions
def mask_email(value):
    """Mask the local part of an e-mail address, keeping only the domain.

    Args:
        value: The e-mail address to mask, or ``None``.

    Returns:
        ``"***@"`` followed by everything after the last ``"@"`` in
        *value*, or ``None`` when *value* is falsy or contains no ``"@"``.
    """
    if not value or "@" not in value:
        return None
    # Split on the LAST "@" only: unpacking a plain split("@") into two
    # names raises ValueError as soon as the input holds more than one "@".
    _, _, domain = value.rpartition("@")
    return "***@" + domain
def mask_name(value):
    """Return the first character of *value* followed by ``"***"``.

    Falsy input (``None`` or an empty string) yields ``None``.
    """
    if not value:
        return None
    initial = value[0]
    return initial + "***"
def mask_date(value):
    """Mask the year and month of a ``%Y-%m-%d`` date string.

    Args:
        value: Date string in ``%Y-%m-%d`` format, or ``None``.

    Returns:
        ``"***-**-DD"`` where ``DD`` is the zero-padded day of the month,
        or ``None`` when *value* is not a string or cannot be parsed.
    """
    try:
        parsed = datetime.strptime(value, "%Y-%m-%d")
    except (TypeError, ValueError):
        # TypeError: value is not a str (e.g. None).
        # ValueError: wrong layout or an impossible calendar date.
        # Anything else (KeyboardInterrupt, SystemExit, ...) must propagate,
        # which the previous bare ``except:`` prevented.
        return None
    return parsed.strftime("***-**-%d")
def mask_ssn(value):
    """Mask an SSN-formatted string, keeping only its last four digits.

    Args:
        value: String in ``DDD-DD-DDDD`` form, or ``None``.

    Returns:
        ``"***-**-"`` plus the final four digits, or ``None`` when *value*
        is falsy or is not exactly in ``DDD-DD-DDDD`` form.
    """
    # fullmatch, not match: match() anchors only at the start, so input with
    # trailing characters used to pass validation and value[-4:] then
    # returned characters that were not the number's last four digits.
    if value and re.fullmatch(r"\d{3}-\d{2}-\d{4}", value):
        return "***-**-" + value[-4:]
    return None
def mask_itin(value):
    """Mask an ITIN-formatted string, keeping only its last four digits.

    Args:
        value: String in ``9DD-7D-DDDD`` form, or ``None``.

    Returns:
        ``"***-**-"`` plus the final four digits, or ``None`` when *value*
        is falsy or is not exactly in ``9DD-7D-DDDD`` form.
    """
    # NOTE(review): the pattern only accepts a middle group of 70-79; ITINs
    # may be issued with other middle-digit ranges -- confirm whether this
    # restriction is intended before widening it.
    #
    # fullmatch, not match: match() anchors only at the start, so input with
    # trailing characters used to pass validation and value[-4:] then
    # returned characters that were not the number's last four digits.
    if value and re.fullmatch(r"9\d{2}-7\d-\d{4}", value):
        return "***-**-" + value[-4:]
    return None
def mask_phone(value):
    """Mask a 10-digit phone number, keeping only its last four digits.

    Accepted layouts are three digits (optionally parenthesised), three
    digits and four digits, each group optionally separated by one ``-``,
    ``.`` or whitespace character.

    Args:
        value: Phone number string, or ``None``.

    Returns:
        ``"***-***-"`` plus the final four digits, or ``None`` when *value*
        is falsy or is not exactly in one of the accepted layouts.
    """
    # fullmatch, not match: match() anchors only at the start, so input such
    # as "555-123-4567 ext 89" used to pass validation and value[-4:] then
    # returned "t 89" instead of the number's last four digits.
    if value and re.fullmatch(r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", value):
        return "***-***-" + value[-4:]
    return None
# UDFs for Spark
def _string_udf(func):
    """Wrap *func* as a Spark UDF declared to return ``StringType``."""
    return udf(func, StringType())


mask_email_udf = _string_udf(mask_email)
mask_name_udf = _string_udf(mask_name)
mask_date_udf = _string_udf(mask_date)
mask_ssn_udf = _string_udf(mask_ssn)
mask_itin_udf = _string_udf(mask_itin)
mask_phone_udf = _string_udf(mask_phone)