Azzan Dwi Riski
committed on
Commit
·
277db83
1
Parent(s):
589504e
Update code to show the processed image and text in the interface
Browse files
app.py
CHANGED
@@ -263,11 +263,12 @@ def predict_single_url(url):
|
|
263 |
|
264 |
screenshot_path = take_screenshot(url)
|
265 |
if not screenshot_path:
|
266 |
-
return f"Error: Failed to take screenshot for {url}", None
|
267 |
|
268 |
-
|
|
|
269 |
|
270 |
-
if not
|
271 |
print(f"No OCR text found for {url}. Using Image-Only Model.")
|
272 |
image = Image.open(screenshot_path)
|
273 |
image_tensor = transform(image).unsqueeze(0).to(device)
|
@@ -283,10 +284,10 @@ def predict_single_url(url):
|
|
283 |
confidence = image_probs[0].item() if is_gambling else 1 - image_probs[0].item()
|
284 |
print(f"[Image-Only] URL: {url}")
|
285 |
print(f"Prediction: {label} | Confidence: {confidence:.2f}\n")
|
286 |
-
return label, f"Confidence: {confidence:.2f}"
|
287 |
|
288 |
else:
|
289 |
-
image_tensor, input_ids, attention_mask = prepare_data_for_model(screenshot_path,
|
290 |
|
291 |
with torch.no_grad():
|
292 |
fused_logits, image_logits, text_logits, weights = fusion_model(image_tensor, input_ids, attention_mask)
|
@@ -306,15 +307,15 @@ def predict_single_url(url):
|
|
306 |
print(f"Text Model Prediction Probability: {text_probs[0]:.2f}")
|
307 |
print(f"Fusion Final Prediction: {label} | Confidence: {confidence:.2f}\n")
|
308 |
|
309 |
-
return label, f"Confidence: {confidence:.2f}"
|
310 |
|
311 |
def predict_batch_urls(file_obj):
|
312 |
results = []
|
313 |
content = file_obj.read().decode('utf-8')
|
314 |
urls = [line.strip() for line in content.splitlines() if line.strip()]
|
315 |
for url in urls:
|
316 |
-
label, confidence = predict_single_url(url)
|
317 |
-
results.append({"url": url, "label": label, "confidence": confidence})
|
318 |
|
319 |
df = pd.DataFrame(results)
|
320 |
print(f"Batch prediction completed for {len(urls)} URLs.")
|
@@ -329,10 +330,26 @@ with gr.Blocks() as app:
|
|
329 |
with gr.Tab("Single URL"):
|
330 |
url_input = gr.Textbox(label="Enter Website URL")
|
331 |
predict_button = gr.Button("Predict")
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
336 |
|
337 |
with gr.Tab("Batch URLs"):
|
338 |
file_input = gr.File(label="Upload .txt file with URLs (one per line)")
|
|
|
263 |
|
264 |
screenshot_path = take_screenshot(url)
|
265 |
if not screenshot_path:
|
266 |
+
return f"Error: Failed to take screenshot for {url}", None, None, None, None
|
267 |
|
268 |
+
raw_text = extract_text_from_image(screenshot_path)
|
269 |
+
cleaned_text = clean_text(raw_text) if raw_text.strip() else ""
|
270 |
|
271 |
+
if not raw_text.strip(): # Jika text kosong
|
272 |
print(f"No OCR text found for {url}. Using Image-Only Model.")
|
273 |
image = Image.open(screenshot_path)
|
274 |
image_tensor = transform(image).unsqueeze(0).to(device)
|
|
|
284 |
confidence = image_probs[0].item() if is_gambling else 1 - image_probs[0].item()
|
285 |
print(f"[Image-Only] URL: {url}")
|
286 |
print(f"Prediction: {label} | Confidence: {confidence:.2f}\n")
|
287 |
+
return label, f"Confidence: {confidence:.2f}", screenshot_path, raw_text, cleaned_text
|
288 |
|
289 |
else:
|
290 |
+
image_tensor, input_ids, attention_mask = prepare_data_for_model(screenshot_path, raw_text)
|
291 |
|
292 |
with torch.no_grad():
|
293 |
fused_logits, image_logits, text_logits, weights = fusion_model(image_tensor, input_ids, attention_mask)
|
|
|
307 |
print(f"Text Model Prediction Probability: {text_probs[0]:.2f}")
|
308 |
print(f"Fusion Final Prediction: {label} | Confidence: {confidence:.2f}\n")
|
309 |
|
310 |
+
return label, f"Confidence: {confidence:.2f}", screenshot_path, raw_text, cleaned_text
|
311 |
|
312 |
def predict_batch_urls(file_obj):
|
313 |
results = []
|
314 |
content = file_obj.read().decode('utf-8')
|
315 |
urls = [line.strip() for line in content.splitlines() if line.strip()]
|
316 |
for url in urls:
|
317 |
+
label, confidence, screenshot_path, raw_text, cleaned_text = predict_single_url(url)
|
318 |
+
results.append({"url": url, "label": label, "confidence": confidence, "screenshot_path": screenshot_path, "raw_text": raw_text, "cleaned_text": cleaned_text})
|
319 |
|
320 |
df = pd.DataFrame(results)
|
321 |
print(f"Batch prediction completed for {len(urls)} URLs.")
|
|
|
330 |
with gr.Tab("Single URL"):
|
331 |
url_input = gr.Textbox(label="Enter Website URL")
|
332 |
predict_button = gr.Button("Predict")
|
333 |
+
|
334 |
+
with gr.Row():
|
335 |
+
with gr.Column():
|
336 |
+
label_output = gr.Label()
|
337 |
+
confidence_output = gr.Textbox(label="Confidence", interactive=False)
|
338 |
+
|
339 |
+
with gr.Column():
|
340 |
+
screenshot_output = gr.Image(label="Screenshot", type="filepath")
|
341 |
+
|
342 |
+
with gr.Row():
|
343 |
+
with gr.Column():
|
344 |
+
raw_text_output = gr.Textbox(label="Raw OCR Text", lines=5)
|
345 |
+
with gr.Column():
|
346 |
+
cleaned_text_output = gr.Textbox(label="Cleaned Text", lines=5)
|
347 |
+
|
348 |
+
predict_button.click(
|
349 |
+
fn=predict_single_url,
|
350 |
+
inputs=url_input,
|
351 |
+
outputs=[label_output, confidence_output, screenshot_output, raw_text_output, cleaned_text_output]
|
352 |
+
)
|
353 |
|
354 |
with gr.Tab("Batch URLs"):
|
355 |
file_input = gr.File(label="Upload .txt file with URLs (one per line)")
|