kokluch committed on
Commit
d316383
·
1 Parent(s): ea52f2e

Search for a past scan before requesting a new one.

Browse files
Files changed (2) hide show
  1. app.py +32 -10
  2. url_tools.py +12 -0
app.py CHANGED
@@ -5,7 +5,7 @@ from pydantic import BaseModel
5
  from enum import Enum
6
  from transformers import pipeline
7
  from phishing_datasets import submit_entry
8
- from url_tools import extract_urls, resolve_short_url
9
  from urlscan_client import UrlscanClient
10
  import requests
11
 
@@ -73,15 +73,37 @@ def predict(model: InputModel) -> OutputModel:
73
  print(f"Predict: {text}")
74
 
75
  urls = extract_urls(text)
76
- results = [urlscan.scan(url) for url in urls]
77
-
78
- for result in results:
79
- overall = result.get('verdicts', {}).get('overall', {})
80
- print(f"Checking verdict: {overall}")
81
- if overall.get('hasVerdicts') and overall.get('score') > 0:
82
- print("Match found. Submitting entry and returning JUNK.")
83
- submit_entry(model.query.sender, model.query.message.text)
84
- return OutputModel(action=ActionModel.JUNK, sub_action=SubActionModel.NONE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  label = pipe(text)
87
  if label[0]['label'] == 'LABEL_1':
 
5
  from enum import Enum
6
  from transformers import pipeline
7
  from phishing_datasets import submit_entry
8
+ from url_tools import extract_urls, resolve_short_url, extract_domain_from_url
9
  from urlscan_client import UrlscanClient
10
  import requests
11
 
 
73
  print(f"Predict: {text}")
74
 
75
  urls = extract_urls(text)
76
+
77
+ if urls:
78
+ print("Searching for past scans")
79
+ search_results = [urlscan.search(f"domain:{extract_domain_from_url(url)}") for url in urls]
80
+
81
+ scan_results = []
82
+ for search_result in search_results:
83
+ results = search_result.get('results', [])
84
+ for result in results:
85
+ result_uuid = result.get('_id', str)
86
+ scan_result = urlscan.get_result(result_uuid)
87
+ scan_results.append(scan_result)
88
+
89
+ if not scan_results:
90
+ print("Scanning...")
91
+ scan_results = [urlscan.scan(url) for url in urls]
92
+
93
+ for result in scan_results:
94
+ overall = result.get('verdicts', {}).get('overall', {})
95
+ print(f"Checking overall verdict: {overall}")
96
+ if overall.get('hasVerdicts'):
97
+ score = overall.get('score')
98
+ print(f"Has verdicts score {score}")
99
+
100
+ if 0 < overall.get('score'):
101
+ print("Submitting entry and returning JUNK.")
102
+ submit_entry(model.query.sender, model.query.message.text)
103
+ return OutputModel(action=ActionModel.JUNK, sub_action=SubActionModel.NONE)
104
+ # elif overall.get('score') < 0:
105
+ # print("Returning ALLOW.")
106
+ # return OutputModel(action=ActionModel.ALLOW, sub_action=SubActionModel.NONE)
107
 
108
  label = pipe(text)
109
  if label[0]['label'] == 'LABEL_1':
url_tools.py CHANGED
@@ -7,6 +7,18 @@ def extract_urls(text: str):
7
  url_pattern = r"""(?:(?:https?:\/\/|www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})(?:\/[^\s<>"']*)?"""
8
  return re.findall(url_pattern, text)
9
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  def normalize_url(url: str) -> str:
12
  """Ensure the URL has a scheme and is normalized."""
 
7
  url_pattern = r"""(?:(?:https?:\/\/|www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})(?:\/[^\s<>"']*)?"""
8
  return re.findall(url_pattern, text)
9
 
10
+ def extract_domain_from_url(url: str) -> str:
11
+ """
12
+ Extracts the domain (netloc) from a given URL.
13
+
14
+ Parameters:
15
+ url (str): The full URL.
16
+
17
+ Returns:
18
+ str: The domain (e.g., 'example.com').
19
+ """
20
+ parsed = urlparse(url)
21
+ return parsed.netloc
22
 
23
  def normalize_url(url: str) -> str:
24
  """Ensure the URL has a scheme and is normalized."""