kokluch committed on
Commit
d316383
·
1 Parent(s): ea52f2e

Search for a past scan before requesting a new one.

Browse files
Files changed (2) hide show
  1. app.py +32 -10
  2. url_tools.py +12 -0
app.py CHANGED
@@ -5,7 +5,7 @@ from pydantic import BaseModel
5
  from enum import Enum
6
  from transformers import pipeline
7
  from phishing_datasets import submit_entry
8
- from url_tools import extract_urls, resolve_short_url
9
  from urlscan_client import UrlscanClient
10
  import requests
11
 
@@ -73,15 +73,37 @@ def predict(model: InputModel) -> OutputModel:
73
  print(f"Predict: {text}")
74
 
75
  urls = extract_urls(text)
76
- results = [urlscan.scan(url) for url in urls]
77
-
78
- for result in results:
79
- overall = result.get('verdicts', {}).get('overall', {})
80
- print(f"Checking verdict: {overall}")
81
- if overall.get('hasVerdicts') and overall.get('score') > 0:
82
- print("Match found. Submitting entry and returning JUNK.")
83
- submit_entry(model.query.sender, model.query.message.text)
84
- return OutputModel(action=ActionModel.JUNK, sub_action=SubActionModel.NONE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  label = pipe(text)
87
  if label[0]['label'] == 'LABEL_1':
 
5
  from enum import Enum
6
  from transformers import pipeline
7
  from phishing_datasets import submit_entry
8
+ from url_tools import extract_urls, resolve_short_url, extract_domain_from_url
9
  from urlscan_client import UrlscanClient
10
  import requests
11
 
 
73
  print(f"Predict: {text}")
74
 
75
  urls = extract_urls(text)
76
+
77
+ if urls:
78
+ print("Searching for past scans")
79
+ search_results = [urlscan.search(f"domain:{extract_domain_from_url(url)}") for url in urls]
80
+
81
+ scan_results = []
82
+ for search_result in search_results:
83
+ results = search_result.get('results', [])
84
+ for result in results:
85
+ result_uuid = result.get('_id', str)
86
+ scan_result = urlscan.get_result(result_uuid)
87
+ scan_results.append(scan_result)
88
+
89
+ if not scan_results:
90
+ print("Scanning...")
91
+ scan_results = [urlscan.scan(url) for url in urls]
92
+
93
+ for result in scan_results:
94
+ overall = result.get('verdicts', {}).get('overall', {})
95
+ print(f"Checking overall verdict: {overall}")
96
+ if overall.get('hasVerdicts'):
97
+ score = overall.get('score')
98
+ print(f"Has verdicts score {score}")
99
+
100
+ if 0 < overall.get('score'):
101
+ print("Submitting entry and returning JUNK.")
102
+ submit_entry(model.query.sender, model.query.message.text)
103
+ return OutputModel(action=ActionModel.JUNK, sub_action=SubActionModel.NONE)
104
+ # elif overall.get('score') < 0:
105
+ # print("Returning ALLOW.")
106
+ # return OutputModel(action=ActionModel.ALLOW, sub_action=SubActionModel.NONE)
107
 
108
  label = pipe(text)
109
  if label[0]['label'] == 'LABEL_1':
url_tools.py CHANGED
@@ -7,6 +7,18 @@ def extract_urls(text: str):
7
  url_pattern = r"""(?:(?:https?:\/\/|www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})(?:\/[^\s<>"']*)?"""
8
  return re.findall(url_pattern, text)
9
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  def normalize_url(url: str) -> str:
12
  """Ensure the URL has a scheme and is normalized."""
 
7
  url_pattern = r"""(?:(?:https?:\/\/|www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})(?:\/[^\s<>"']*)?"""
8
  return re.findall(url_pattern, text)
9
 
10
+ def extract_domain_from_url(url: str) -> str:
11
+ """
12
+ Extracts the domain (netloc) from a given URL.
13
+
14
+ Parameters:
15
+ url (str): The full URL.
16
+
17
+ Returns:
18
+ str: The domain (e.g., 'example.com').
19
+ """
20
+ parsed = urlparse(url)
21
+ return parsed.netloc
22
 
23
  def normalize_url(url: str) -> str:
24
  """Ensure the URL has a scheme and is normalized."""