File size: 3,742 Bytes
1e494e3
 
 
 
cd11250
ddaad57
1e494e3
 
cce759a
ddaad57
1e494e3
 
 
 
dd174fa
 
 
1e494e3
 
 
 
ddaad57
 
cce759a
cd11250
 
 
 
 
 
 
 
 
 
 
 
 
 
cce759a
1e494e3
 
dd174fa
 
cd11250
dd174fa
cd11250
 
 
 
 
 
dd174fa
 
1e494e3
 
fa5ad45
cce759a
1e494e3
 
cce759a
ddaad57
 
 
 
 
 
ab99a02
ddaad57
 
ab99a02
 
ddaad57
 
 
 
 
 
ab99a02
ddaad57
 
 
 
 
 
 
 
 
 
 
 
 
 
ab99a02
ddaad57
 
 
ab99a02
ddaad57
 
ab99a02
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import torch
from torch.nn.functional import softmax
import re
from predictor import predict, batch_predict  # Assuming batch_predict is in predictor module

app = FastAPI(
    title="Contact Information Detection API",
    description="API for detecting contact information in text, great thanks to xxparthparekhxx/ContactShieldAI for the model",
    version="1.0.0",
    docs_url="/"
)

def preprocess_text(text):
    # Remove all punctuation except for @ and . which are often used in email addresses
    return re.sub(r'[^\w\s@.]', '', text)

class TextInput(BaseModel):
    text: str

class BatchTextInput(BaseModel):
    texts: list[str]

def check_regex_patterns(text):
    patterns = [
        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',  # Email
        r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',  # Phone number
        r'\b\d{5}(?:[-\s]\d{4})?\b',  # ZIP code
        r'\b\d+\s+[\w\s]+(?:street|st|avenue|ave|road|rd|highway|hwy|square|sq|trail|trl|drive|dr|court|ct|park|parkway|pkwy|circle|cir|boulevard|blvd)\b\s*(?:[a-z]+\s*\d{1,3})?(?:,\s*(?:apt|bldg|dept|fl|hngr|lot|pier|rm|ste|unit|#)\s*[a-z0-9-]+)?(?:,\s*[a-z]+\s*[a-z]{2}\s*\d{5}(?:-\d{4})?)?',  # Street address
        r'(?:http|https)://(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,}(?:/[^\s]*)?'  # Website URL
    ]
    
    for pattern in patterns:
        if re.search(pattern, text, re.IGNORECASE):
            return True
    return False

@app.post("/detect_contact", summary="Detect contact information in text")
async def detect_contact(input: TextInput):
    try:
        preprocessed_text = preprocess_text(input.text)
        
        # First, check with regex patterns
        if check_regex_patterns(preprocessed_text):
            return {
                "text": input.text,
                "is_contact_info": True,
                "method": "regex"
            }
        
        # If no regex patterns match, use the model
        is_contact = predict(preprocessed_text)
        return {
            "text": input.text,
            "is_contact_info": is_contact == 1,
            "method": "model"
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/batch_detect_contact", summary="Detect contact information in batch of texts")
async def batch_detect_contact(inputs: BatchTextInput):
    try:
        # Preprocess all texts
        preprocessed_texts = [preprocess_text(text) for text in inputs.texts]

        # First, use regex to check patterns
        regex_results = [check_regex_patterns(text) for text in preprocessed_texts]


        # For texts where regex doesn't detect anything, use the model
        texts_for_model = [text for text, regex_match in zip(preprocessed_texts, regex_results) if not regex_match]
        if texts_for_model:
            model_results = batch_predict(texts_for_model)
        else:
            model_results = []

        # Prepare final results
        results = []
        model_idx = 0
        for i, text in enumerate(preprocessed_texts):
            if regex_results[i]:
                results.append({
                    "text": inputs.texts[i],
                    "is_contact_info": True,
                    "method": "regex"
                })
            else:
                is_contact = model_results[model_idx]
                results.append({
                    "text": inputs.texts[i],
                    "is_contact_info": bool(is_contact),  # Convert numpy bool
                    "method": "model"
                })
                model_idx += 1

        return results
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))