Spaces:
Running
Running
Search for a past scan before requesting a new one.
Browse files- app.py +32 -10
- url_tools.py +12 -0
app.py
CHANGED
@@ -5,7 +5,7 @@ from pydantic import BaseModel
|
|
5 |
from enum import Enum
|
6 |
from transformers import pipeline
|
7 |
from phishing_datasets import submit_entry
|
8 |
-
from url_tools import extract_urls, resolve_short_url
|
9 |
from urlscan_client import UrlscanClient
|
10 |
import requests
|
11 |
|
@@ -73,15 +73,37 @@ def predict(model: InputModel) -> OutputModel:
|
|
73 |
print(f"Predict: {text}")
|
74 |
|
75 |
urls = extract_urls(text)
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
|
86 |
label = pipe(text)
|
87 |
if label[0]['label'] == 'LABEL_1':
|
|
|
5 |
from enum import Enum
|
6 |
from transformers import pipeline
|
7 |
from phishing_datasets import submit_entry
|
8 |
+
from url_tools import extract_urls, resolve_short_url, extract_domain_from_url
|
9 |
from urlscan_client import UrlscanClient
|
10 |
import requests
|
11 |
|
|
|
73 |
print(f"Predict: {text}")
|
74 |
|
75 |
urls = extract_urls(text)
|
76 |
+
|
77 |
+
if urls:
|
78 |
+
print("Searching for past scans")
|
79 |
+
search_results = [urlscan.search(f"domain:{extract_domain_from_url(url)}") for url in urls]
|
80 |
+
|
81 |
+
scan_results = []
|
82 |
+
for search_result in search_results:
|
83 |
+
results = search_result.get('results', [])
|
84 |
+
for result in results:
|
85 |
+
result_uuid = result.get('_id', str)
|
86 |
+
scan_result = urlscan.get_result(result_uuid)
|
87 |
+
scan_results.append(scan_result)
|
88 |
+
|
89 |
+
if not scan_results:
|
90 |
+
print("Scanning...")
|
91 |
+
scan_results = [urlscan.scan(url) for url in urls]
|
92 |
+
|
93 |
+
for result in scan_results:
|
94 |
+
overall = result.get('verdicts', {}).get('overall', {})
|
95 |
+
print(f"Checking overall verdict: {overall}")
|
96 |
+
if overall.get('hasVerdicts'):
|
97 |
+
score = overall.get('score')
|
98 |
+
print(f"Has verdicts score {score}")
|
99 |
+
|
100 |
+
if 0 < overall.get('score'):
|
101 |
+
print("Submitting entry and returning JUNK.")
|
102 |
+
submit_entry(model.query.sender, model.query.message.text)
|
103 |
+
return OutputModel(action=ActionModel.JUNK, sub_action=SubActionModel.NONE)
|
104 |
+
# elif overall.get('score') < 0:
|
105 |
+
# print("Returning ALLOW.")
|
106 |
+
# return OutputModel(action=ActionModel.ALLOW, sub_action=SubActionModel.NONE)
|
107 |
|
108 |
label = pipe(text)
|
109 |
if label[0]['label'] == 'LABEL_1':
|
url_tools.py
CHANGED
@@ -7,6 +7,18 @@ def extract_urls(text: str):
|
|
7 |
url_pattern = r"""(?:(?:https?:\/\/|www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})(?:\/[^\s<>"']*)?"""
|
8 |
return re.findall(url_pattern, text)
|
9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
def normalize_url(url: str) -> str:
|
12 |
"""Ensure the URL has a scheme and is normalized."""
|
|
|
7 |
url_pattern = r"""(?:(?:https?:\/\/|www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})(?:\/[^\s<>"']*)?"""
|
8 |
return re.findall(url_pattern, text)
|
9 |
|
10 |
+
def extract_domain_from_url(url: str) -> str:
|
11 |
+
"""
|
12 |
+
Extracts the domain (netloc) from a given URL.
|
13 |
+
|
14 |
+
Parameters:
|
15 |
+
url (str): The full URL.
|
16 |
+
|
17 |
+
Returns:
|
18 |
+
str: The domain (e.g., 'example.com').
|
19 |
+
"""
|
20 |
+
parsed = urlparse(url)
|
21 |
+
return parsed.netloc
|
22 |
|
23 |
def normalize_url(url: str) -> str:
|
24 |
"""Ensure the URL has a scheme and is normalized."""
|