Spaces:
Running
Running
Add 3 files
Browse files
- README.md +6 -4
- index.html +1418 -19
- prompts.txt +1 -0
README.md
CHANGED
@@ -1,10 +1,12 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
colorFrom: pink
|
5 |
-
colorTo:
|
6 |
sdk: static
|
7 |
pinned: false
|
|
|
|
|
8 |
---
|
9 |
|
10 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: smart-web-crawler
|
3 |
+
emoji: 🐳
|
4 |
colorFrom: pink
|
5 |
+
colorTo: pink
|
6 |
sdk: static
|
7 |
pinned: false
|
8 |
+
tags:
|
9 |
+
- deepsite
|
10 |
---
|
11 |
|
12 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
index.html
CHANGED
@@ -1,19 +1,1418 @@
|
|
1 |
-
<!
|
2 |
-
<html>
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html lang="en">
|
3 |
+
<head>
|
4 |
+
<meta charset="UTF-8">
|
5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
6 |
+
<title>Smart Web Crawler</title>
|
7 |
+
<script src="https://cdn.tailwindcss.com"></script>
|
8 |
+
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
|
9 |
+
<style>
|
10 |
+
.gradient-bg {
|
11 |
+
background: linear-gradient(135deg, #6b73ff 0%, #000dff 100%);
|
12 |
+
}
|
13 |
+
.crawl-animation {
|
14 |
+
animation: crawlPulse 2s infinite;
|
15 |
+
}
|
16 |
+
@keyframes crawlPulse {
|
17 |
+
0% { transform: translateY(0); }
|
18 |
+
50% { transform: translateY(-5px); }
|
19 |
+
100% { transform: translateY(0); }
|
20 |
+
}
|
21 |
+
.progress-bar {
|
22 |
+
transition: width 0.3s ease;
|
23 |
+
}
|
24 |
+
.result-card:hover {
|
25 |
+
transform: translateY(-5px);
|
26 |
+
box-shadow: 0 10px 25px rgba(0, 0, 255, 0.1);
|
27 |
+
}
|
28 |
+
.code-block {
|
29 |
+
font-family: 'Courier New', monospace;
|
30 |
+
background-color: #2d3748;
|
31 |
+
color: #f7fafc;
|
32 |
+
}
|
33 |
+
.tab-active {
|
34 |
+
border-bottom: 3px solid #3b82f6;
|
35 |
+
color: #3b82f6;
|
36 |
+
font-weight: 600;
|
37 |
+
}
|
38 |
+
.fade-in {
|
39 |
+
animation: fadeIn 0.5s ease-in;
|
40 |
+
}
|
41 |
+
@keyframes fadeIn {
|
42 |
+
from { opacity: 0; }
|
43 |
+
to { opacity: 1; }
|
44 |
+
}
|
45 |
+
.progress-step {
|
46 |
+
position: relative;
|
47 |
+
padding-left: 2rem;
|
48 |
+
}
|
49 |
+
.progress-step:before {
|
50 |
+
content: '';
|
51 |
+
position: absolute;
|
52 |
+
left: 0.5rem;
|
53 |
+
top: 0;
|
54 |
+
bottom: 0;
|
55 |
+
width: 2px;
|
56 |
+
background-color: #e5e7eb;
|
57 |
+
}
|
58 |
+
.progress-step:first-child:before {
|
59 |
+
top: 1rem;
|
60 |
+
}
|
61 |
+
.progress-step:last-child:before {
|
62 |
+
bottom: calc(100% - 1rem);
|
63 |
+
}
|
64 |
+
.progress-step.completed .step-icon {
|
65 |
+
background-color: #10b981;
|
66 |
+
color: white;
|
67 |
+
}
|
68 |
+
.progress-step.active .step-icon {
|
69 |
+
background-color: #3b82f6;
|
70 |
+
color: white;
|
71 |
+
}
|
72 |
+
.progress-step.pending .step-icon {
|
73 |
+
background-color: #e5e7eb;
|
74 |
+
color: #6b7280;
|
75 |
+
}
|
76 |
+
.progress-step.error .step-icon {
|
77 |
+
background-color: #ef4444;
|
78 |
+
color: white;
|
79 |
+
}
|
80 |
+
.log-entry.error {
|
81 |
+
color: #ef4444;
|
82 |
+
}
|
83 |
+
.log-entry.warning {
|
84 |
+
color: #f59e0b;
|
85 |
+
}
|
86 |
+
.log-entry.success {
|
87 |
+
color: #10b981;
|
88 |
+
}
|
89 |
+
.log-entry.info {
|
90 |
+
color: #3b82f6;
|
91 |
+
}
|
92 |
+
.progress-multi {
|
93 |
+
height: 6px;
|
94 |
+
border-radius: 3px;
|
95 |
+
}
|
96 |
+
</style>
|
97 |
+
</head>
|
98 |
+
<body class="bg-gray-50 min-h-screen">
|
99 |
+
<div class="gradient-bg text-white py-8 px-4 shadow-lg">
|
100 |
+
<div class="container mx-auto">
|
101 |
+
<div class="flex items-center justify-between">
|
102 |
+
<div>
|
103 |
+
<h1 class="text-3xl font-bold flex items-center">
|
104 |
+
<i class="fas fa-spider mr-3 crawl-animation"></i> Smart Web Crawler
|
105 |
+
</h1>
|
106 |
+
<p class="mt-2 opacity-90">Extract and organize web content into structured knowledge</p>
|
107 |
+
</div>
|
108 |
+
<div class="hidden md:block">
|
109 |
+
<div class="flex space-x-2">
|
110 |
+
<span class="px-3 py-1 bg-blue-400 rounded-full text-xs font-semibold">AI-Powered</span>
|
111 |
+
<span class="px-3 py-1 bg-purple-400 rounded-full text-xs font-semibold">Multi-Format</span>
|
112 |
+
<span class="px-3 py-1 bg-green-400 rounded-full text-xs font-semibold">Smart Filtering</span>
|
113 |
+
</div>
|
114 |
+
</div>
|
115 |
+
</div>
|
116 |
+
</div>
|
117 |
+
</div>
|
118 |
+
|
119 |
+
<div class="container mx-auto px-4 py-8">
|
120 |
+
<div class="bg-white rounded-xl shadow-lg overflow-hidden mb-8">
|
121 |
+
<div class="p-6">
|
122 |
+
<h2 class="text-xl font-semibold text-gray-800 mb-4">Crawler Configuration</h2>
|
123 |
+
|
124 |
+
<div class="grid grid-cols-1 md:grid-cols-2 gap-6">
|
125 |
+
<div>
|
126 |
+
<label class="block text-sm font-medium text-gray-700 mb-1">Start URL</label>
|
127 |
+
<div class="flex">
|
128 |
+
<input type="text" id="baseUrl" placeholder="https://example.com"
|
129 |
+
class="flex-1 px-4 py-2 border border-gray-300 rounded-l-lg focus:ring-blue-500 focus:border-blue-500">
|
130 |
+
<button id="validateUrlBtn" class="px-4 py-2 bg-blue-600 text-white rounded-r-lg hover:bg-blue-700">
|
131 |
+
<i class="fas fa-check"></i>
|
132 |
+
</button>
|
133 |
+
</div>
|
134 |
+
<p id="urlError" class="text-red-500 text-xs mt-1 hidden">Please enter a valid URL starting with http:// or https://</p>
|
135 |
+
</div>
|
136 |
+
|
137 |
+
<div>
|
138 |
+
<label class="block text-sm font-medium text-gray-700 mb-1">Output Format</label>
|
139 |
+
<div class="flex space-x-4">
|
140 |
+
<label class="inline-flex items-center">
|
141 |
+
<input type="radio" name="outputFormat" value="json" checked class="h-4 w-4 text-blue-600 focus:ring-blue-500">
|
142 |
+
<span class="ml-2">JSON</span>
|
143 |
+
</label>
|
144 |
+
<label class="inline-flex items-center">
|
145 |
+
<input type="radio" name="outputFormat" value="md" class="h-4 w-4 text-blue-600 focus:ring-blue-500">
|
146 |
+
<span class="ml-2">Markdown</span>
|
147 |
+
</label>
|
148 |
+
</div>
|
149 |
+
</div>
|
150 |
+
|
151 |
+
<div>
|
152 |
+
<label class="block text-sm font-medium text-gray-700 mb-1">Max Depth</label>
|
153 |
+
<input type="number" id="maxDepth" min="1" max="10" value="3"
|
154 |
+
class="w-full px-4 py-2 border border-gray-300 rounded-lg focus:ring-blue-500 focus:border-blue-500">
|
155 |
+
</div>
|
156 |
+
|
157 |
+
<div>
|
158 |
+
<label class="block text-sm font-medium text-gray-700 mb-1">Max Concurrent Requests</label>
|
159 |
+
<input type="number" id="maxConcurrent" min="1" max="50" value="20"
|
160 |
+
class="w-full px-4 py-2 border border-gray-300 rounded-lg focus:ring-blue-500 focus:border-blue-500">
|
161 |
+
</div>
|
162 |
+
</div>
|
163 |
+
|
164 |
+
<div class="mt-6">
|
165 |
+
<h3 class="text-md font-medium text-gray-700 mb-3">Content to Extract</h3>
|
166 |
+
<div class="flex flex-wrap gap-4">
|
167 |
+
<label class="inline-flex items-center">
|
168 |
+
<input type="checkbox" id="extractText" checked class="h-4 w-4 text-blue-600 rounded focus:ring-blue-500">
|
169 |
+
<span class="ml-2">Text Content</span>
|
170 |
+
</label>
|
171 |
+
<label class="inline-flex items-center">
|
172 |
+
<input type="checkbox" id="extractCode" checked class="h-4 w-4 text-blue-600 rounded focus:ring-blue-500">
|
173 |
+
<span class="ml-2">Code Blocks</span>
|
174 |
+
</label>
|
175 |
+
<label class="inline-flex items-center">
|
176 |
+
<input type="checkbox" id="extractTables" checked class="h-4 w-4 text-blue-600 rounded focus:ring-blue-500">
|
177 |
+
<span class="ml-2">Tables</span>
|
178 |
+
</label>
|
179 |
+
<label class="inline-flex items-center">
|
180 |
+
<input type="checkbox" id="extractLists" checked class="h-4 w-4 text-blue-600 rounded focus:ring-blue-500">
|
181 |
+
<span class="ml-2">Lists</span>
|
182 |
+
</label>
|
183 |
+
</div>
|
184 |
+
</div>
|
185 |
+
|
186 |
+
<div class="mt-6">
|
187 |
+
<label class="inline-flex items-center">
|
188 |
+
<input type="checkbox" id="useLLMFilter" class="h-4 w-4 text-blue-600 rounded focus:ring-blue-500">
|
189 |
+
<span class="ml-2 font-medium">Enable AI Content Filtering</span>
|
190 |
+
</label>
|
191 |
+
<div id="llmSettings" class="mt-3 pl-6 hidden">
|
192 |
+
<label class="block text-sm font-medium text-gray-700 mb-1">Minimum Quality Score (0-100)</label>
|
193 |
+
<input type="number" id="minLLMScore" min="0" max="100" value="50"
|
194 |
+
class="w-24 px-4 py-2 border border-gray-300 rounded-lg focus:ring-blue-500 focus:border-blue-500">
|
195 |
+
</div>
|
196 |
+
</div>
|
197 |
+
|
198 |
+
<div class="mt-8 flex justify-center">
|
199 |
+
<button id="startCrawlBtn"
|
200 |
+
class="px-8 py-3 bg-blue-600 text-white rounded-lg font-semibold hover:bg-blue-700 transition-all flex items-center">
|
201 |
+
<i class="fas fa-play mr-2"></i> Start Crawling
|
202 |
+
</button>
|
203 |
+
</div>
|
204 |
+
</div>
|
205 |
+
</div>
|
206 |
+
|
207 |
+
<div id="progressSection" class="hidden">
|
208 |
+
<div class="bg-white rounded-xl shadow-lg overflow-hidden mb-8">
|
209 |
+
<div class="p-6">
|
210 |
+
<div class="flex justify-between items-center mb-4">
|
211 |
+
<h2 class="text-xl font-semibold text-gray-800">Crawling Progress</h2>
|
212 |
+
<button id="stopCrawlBtn"
|
213 |
+
class="px-4 py-2 bg-red-500 text-white rounded-lg text-sm hover:bg-red-600">
|
214 |
+
<i class="fas fa-stop mr-1"></i> Stop
|
215 |
+
</button>
|
216 |
+
</div>
|
217 |
+
|
218 |
+
<!-- Progress Steps -->
|
219 |
+
<div class="mb-6">
|
220 |
+
<div class="flex space-x-4 mb-4">
|
221 |
+
<div class="progress-step pending" id="step1">
|
222 |
+
<div class="flex items-center">
|
223 |
+
<div class="step-icon w-6 h-6 rounded-full flex items-center justify-center mr-2">
|
224 |
+
<i class="fas fa-link text-xs"></i>
|
225 |
+
</div>
|
226 |
+
<span class="text-sm">URL Validation</span>
|
227 |
+
</div>
|
228 |
+
</div>
|
229 |
+
<div class="progress-step pending" id="step2">
|
230 |
+
<div class="flex items-center">
|
231 |
+
<div class="step-icon w-6 h-6 rounded-full flex items-center justify-center mr-2">
|
232 |
+
<i class="fas fa-sitemap text-xs"></i>
|
233 |
+
</div>
|
234 |
+
<span class="text-sm">Site Mapping</span>
|
235 |
+
</div>
|
236 |
+
</div>
|
237 |
+
<div class="progress-step pending" id="step3">
|
238 |
+
<div class="flex items-center">
|
239 |
+
<div class="step-icon w-6 h-6 rounded-full flex items-center justify-center mr-2">
|
240 |
+
<i class="fas fa-file-alt text-xs"></i>
|
241 |
+
</div>
|
242 |
+
<span class="text-sm">Content Extraction</span>
|
243 |
+
</div>
|
244 |
+
</div>
|
245 |
+
<div class="progress-step pending" id="step4">
|
246 |
+
<div class="flex items-center">
|
247 |
+
<div class="step-icon w-6 h-6 rounded-full flex items-center justify-center mr-2">
|
248 |
+
<i class="fas fa-robot text-xs"></i>
|
249 |
+
</div>
|
250 |
+
<span class="text-sm">AI Analysis</span>
|
251 |
+
</div>
|
252 |
+
</div>
|
253 |
+
</div>
|
254 |
+
|
255 |
+
<!-- Multi-level progress bars -->
|
256 |
+
<div class="space-y-2 mb-2">
|
257 |
+
<div>
|
258 |
+
<div class="flex justify-between text-xs text-gray-600 mb-1">
|
259 |
+
<span>URL Discovery</span>
|
260 |
+
<span id="urlDiscoveryPercent">0%</span>
|
261 |
+
</div>
|
262 |
+
<div class="w-full bg-gray-200 rounded-full h-1.5">
|
263 |
+
<div id="urlDiscoveryBar" class="progress-multi bg-blue-400 h-1.5 rounded-full" style="width: 0%"></div>
|
264 |
+
</div>
|
265 |
+
</div>
|
266 |
+
<div>
|
267 |
+
<div class="flex justify-between text-xs text-gray-600 mb-1">
|
268 |
+
<span>Content Extraction</span>
|
269 |
+
<span id="contentExtractionPercent">0%</span>
|
270 |
+
</div>
|
271 |
+
<div class="w-full bg-gray-200 rounded-full h-1.5">
|
272 |
+
<div id="contentExtractionBar" class="progress-multi bg-green-400 h-1.5 rounded-full" style="width: 0%"></div>
|
273 |
+
</div>
|
274 |
+
</div>
|
275 |
+
<div>
|
276 |
+
<div class="flex justify-between text-xs text-gray-600 mb-1">
|
277 |
+
<span>AI Processing</span>
|
278 |
+
<span id="aiProcessingPercent">0%</span>
|
279 |
+
</div>
|
280 |
+
<div class="w-full bg-gray-200 rounded-full h-1.5">
|
281 |
+
<div id="aiProcessingBar" class="progress-multi bg-purple-400 h-1.5 rounded-full" style="width: 0%"></div>
|
282 |
+
</div>
|
283 |
+
</div>
|
284 |
+
</div>
|
285 |
+
|
286 |
+
<!-- Main progress bar -->
|
287 |
+
<div class="mb-4">
|
288 |
+
<div class="flex justify-between text-sm text-gray-600 mb-1">
|
289 |
+
<span>Overall Progress: <span id="overallPercent">0%</span></span>
|
290 |
+
<span>Time Elapsed: <span id="timeElapsed">00:00</span></span>
|
291 |
+
</div>
|
292 |
+
<div class="w-full bg-gray-200 rounded-full h-2.5">
|
293 |
+
<div id="progressBar" class="progress-bar bg-blue-600 h-2.5 rounded-full" style="width: 0%"></div>
|
294 |
+
</div>
|
295 |
+
</div>
|
296 |
+
</div>
|
297 |
+
|
298 |
+
<div class="bg-gray-50 p-4 rounded-lg">
|
299 |
+
<div class="flex items-center mb-2">
|
300 |
+
<div class="w-8 h-8 rounded-full bg-blue-100 flex items-center justify-center mr-3">
|
301 |
+
<i class="fas fa-spider text-blue-600"></i>
|
302 |
+
</div>
|
303 |
+
<div class="flex-1">
|
304 |
+
<p class="text-sm font-medium">Currently Crawling:</p>
|
305 |
+
<p id="currentUrl" class="text-sm text-gray-600 truncate">Waiting to start...</p>
|
306 |
+
</div>
|
307 |
+
</div>
|
308 |
+
|
309 |
+
<div class="flex items-center">
|
310 |
+
<div class="w-8 h-8 rounded-full bg-purple-100 flex items-center justify-center mr-3">
|
311 |
+
<i class="fas fa-robot text-purple-600"></i>
|
312 |
+
</div>
|
313 |
+
<div class="flex-1">
|
314 |
+
<p class="text-sm font-medium">AI Analysis:</p>
|
315 |
+
<p id="aiAnalysis" class="text-sm text-gray-600">Ready to evaluate content quality</p>
|
316 |
+
</div>
|
317 |
+
</div>
|
318 |
+
</div>
|
319 |
+
</div>
|
320 |
+
</div>
|
321 |
+
|
322 |
+
<div class="grid grid-cols-1 lg:grid-cols-3 gap-6">
|
323 |
+
<div class="lg:col-span-2">
|
324 |
+
<div class="bg-white rounded-xl shadow-lg overflow-hidden">
|
325 |
+
<div class="p-6">
|
326 |
+
<h2 class="text-xl font-semibold text-gray-800 mb-4">Crawling Log</h2>
|
327 |
+
<div id="crawlLog" class="h-96 overflow-y-auto bg-gray-50 p-4 rounded-lg font-mono text-sm space-y-2">
|
328 |
+
<div class="text-gray-500">System ready. Waiting for crawl to start...</div>
|
329 |
+
</div>
|
330 |
+
</div>
|
331 |
+
</div>
|
332 |
+
</div>
|
333 |
+
|
334 |
+
<div>
|
335 |
+
<div class="bg-white rounded-xl shadow-lg overflow-hidden">
|
336 |
+
<div class="p-6">
|
337 |
+
<h2 class="text-xl font-semibold text-gray-800 mb-4">Statistics</h2>
|
338 |
+
<div class="space-y-4">
|
339 |
+
<div class="flex items-center justify-between p-3 bg-blue-50 rounded-lg">
|
340 |
+
<div>
|
341 |
+
<p class="text-xs text-gray-500">Total URLs</p>
|
342 |
+
<p id="totalUrls" class="text-lg font-semibold">0</p>
|
343 |
+
</div>
|
344 |
+
<div class="p-2 bg-blue-100 rounded-full">
|
345 |
+
<i class="fas fa-link text-blue-600"></i>
|
346 |
+
</div>
|
347 |
+
</div>
|
348 |
+
|
349 |
+
<div class="flex items-center justify-between p-3 bg-green-50 rounded-lg">
|
350 |
+
<div>
|
351 |
+
<p class="text-xs text-gray-500">Valid Content</p>
|
352 |
+
<p id="validContent" class="text-lg font-semibold">0</p>
|
353 |
+
</div>
|
354 |
+
<div class="p-2 bg-green-100 rounded-full">
|
355 |
+
<i class="fas fa-check-circle text-green-600"></i>
|
356 |
+
</div>
|
357 |
+
</div>
|
358 |
+
|
359 |
+
<div class="flex items-center justify-between p-3 bg-purple-50 rounded-lg">
|
360 |
+
<div>
|
361 |
+
<p class="text-xs text-gray-500">AI Approved</p>
|
362 |
+
<p id="aiApproved" class="text-lg font-semibold">0</p>
|
363 |
+
</div>
|
364 |
+
<div class="p-2 bg-purple-100 rounded-full">
|
365 |
+
<i class="fas fa-star text-purple-600"></i>
|
366 |
+
</div>
|
367 |
+
</div>
|
368 |
+
|
369 |
+
<div class="flex items-center justify-between p-3 bg-yellow-50 rounded-lg">
|
370 |
+
<div>
|
371 |
+
<p class="text-xs text-gray-500">Avg. Score</p>
|
372 |
+
<p id="avgScore" class="text-lg font-semibold">0</p>
|
373 |
+
</div>
|
374 |
+
<div class="p-2 bg-yellow-100 rounded-full">
|
375 |
+
<i class="fas fa-chart-line text-yellow-600"></i>
|
376 |
+
</div>
|
377 |
+
</div>
|
378 |
+
|
379 |
+
<div class="flex items-center justify-between p-3 bg-red-50 rounded-lg">
|
380 |
+
<div>
|
381 |
+
<p class="text-xs text-gray-500">Errors</p>
|
382 |
+
<p id="errorCount" class="text-lg font-semibold">0</p>
|
383 |
+
</div>
|
384 |
+
<div class="p-2 bg-red-100 rounded-full">
|
385 |
+
<i class="fas fa-exclamation-triangle text-red-600"></i>
|
386 |
+
</div>
|
387 |
+
</div>
|
388 |
+
</div>
|
389 |
+
</div>
|
390 |
+
</div>
|
391 |
+
</div>
|
392 |
+
</div>
|
393 |
+
</div>
|
394 |
+
|
395 |
+
<div id="resultsSection" class="hidden mt-8">
|
396 |
+
<div class="bg-white rounded-xl shadow-lg overflow-hidden">
|
397 |
+
<div class="p-6">
|
398 |
+
<div class="flex justify-between items-center mb-6">
|
399 |
+
<h2 class="text-xl font-semibold text-gray-800">Crawl Results</h2>
|
400 |
+
<div class="flex space-x-2">
|
401 |
+
<button id="downloadResultsBtn" class="px-4 py-2 bg-green-600 text-white rounded-lg text-sm hover:bg-green-700">
|
402 |
+
<i class="fas fa-download mr-1"></i> Download
|
403 |
+
</button>
|
404 |
+
<button id="clearResultsBtn" class="px-4 py-2 bg-gray-200 text-gray-700 rounded-lg text-sm hover:bg-gray-300">
|
405 |
+
<i class="fas fa-trash mr-1"></i> Clear
|
406 |
+
</button>
|
407 |
+
</div>
|
408 |
+
</div>
|
409 |
+
|
410 |
+
<div class="border-b border-gray-200">
|
411 |
+
<div class="flex space-x-4">
|
412 |
+
<button id="tabSummary" class="tab-active px-4 py-2 text-sm font-medium">Summary</button>
|
413 |
+
<button id="tabContent" class="px-4 py-2 text-sm font-medium text-gray-500 hover:text-gray-700">Content</button>
|
414 |
+
<button id="tabJson" class="px-4 py-2 text-sm font-medium text-gray-500 hover:text-gray-700">JSON View</button>
|
415 |
+
</div>
|
416 |
+
</div>
|
417 |
+
|
418 |
+
<div id="summaryTab" class="py-4">
|
419 |
+
<div class="grid grid-cols-1 md:grid-cols-3 gap-6 mb-6">
|
420 |
+
<div class="bg-blue-50 p-4 rounded-lg">
|
421 |
+
<h3 class="font-medium text-blue-800 mb-2">Crawl Overview</h3>
|
422 |
+
<ul class="space-y-2 text-sm">
|
423 |
+
<li class="flex justify-between">
|
424 |
+
<span class="text-gray-600">Start URL:</span>
|
425 |
+
<span id="summaryStartUrl" class="font-medium">-</span>
|
426 |
+
</li>
|
427 |
+
<li class="flex justify-between">
|
428 |
+
<span class="text-gray-600">Total Pages:</span>
|
429 |
+
<span id="summaryTotalPages" class="font-medium">0</span>
|
430 |
+
</li>
|
431 |
+
<li class="flex justify-between">
|
432 |
+
<span class="text-gray-600">Duration:</span>
|
433 |
+
<span id="summaryDuration" class="font-medium">0s</span>
|
434 |
+
</li>
|
435 |
+
<li class="flex justify-between">
|
436 |
+
<span class="text-gray-600">Errors:</span>
|
437 |
+
<span id="summaryErrors" class="font-medium">0</span>
|
438 |
+
</li>
|
439 |
+
</ul>
|
440 |
+
</div>
|
441 |
+
|
442 |
+
<div class="bg-purple-50 p-4 rounded-lg">
|
443 |
+
<h3 class="font-medium text-purple-800 mb-2">Content Analysis</h3>
|
444 |
+
<ul class="space-y-2 text-sm">
|
445 |
+
<li class="flex justify-between">
|
446 |
+
<span class="text-gray-600">Text Paragraphs:</span>
|
447 |
+
<span id="summaryText" class="font-medium">0</span>
|
448 |
+
</li>
|
449 |
+
<li class="flex justify-between">
|
450 |
+
<span class="text-gray-600">Code Blocks:</span>
|
451 |
+
<span id="summaryCode" class="font-medium">0</span>
|
452 |
+
</li>
|
453 |
+
<li class="flex justify-between">
|
454 |
+
<span class="text-gray-600">Tables:</span>
|
455 |
+
<span id="summaryTables" class="font-medium">0</span>
|
456 |
+
</li>
|
457 |
+
</ul>
|
458 |
+
</div>
|
459 |
+
|
460 |
+
<div class="bg-green-50 p-4 rounded-lg">
|
461 |
+
<h3 class="font-medium text-green-800 mb-2">Quality Metrics</h3>
|
462 |
+
<ul class="space-y-2 text-sm">
|
463 |
+
<li class="flex justify-between">
|
464 |
+
<span class="text-gray-600">Avg. Quality Score:</span>
|
465 |
+
<span id="summaryAvgScore" class="font-medium">0</span>
|
466 |
+
</li>
|
467 |
+
<li class="flex justify-between">
|
468 |
+
<span class="text-gray-600">Highest Score:</span>
|
469 |
+
<span id="summaryHighScore" class="font-medium">0</span>
|
470 |
+
</li>
|
471 |
+
<li class="flex justify-between">
|
472 |
+
<span class="text-gray-600">Lowest Score:</span>
|
473 |
+
<span id="summaryLowScore" class="font-medium">0</span>
|
474 |
+
</li>
|
475 |
+
</ul>
|
476 |
+
</div>
|
477 |
+
</div>
|
478 |
+
|
479 |
+
<div class="mb-6">
|
480 |
+
<h3 class="font-medium text-gray-800 mb-3">Top Keywords</h3>
|
481 |
+
<div id="keywordCloud" class="flex flex-wrap gap-2">
|
482 |
+
<span class="px-3 py-1 bg-gray-100 rounded-full text-sm">No keywords extracted yet</span>
|
483 |
+
</div>
|
484 |
+
</div>
|
485 |
+
|
486 |
+
<div>
|
487 |
+
<h3 class="font-medium text-gray-800 mb-3">Best Content</h3>
|
488 |
+
<div id="topContent" class="space-y-4">
|
489 |
+
<div class="p-4 bg-gray-50 rounded-lg text-sm text-gray-600">
|
490 |
+
No content has been evaluated yet. Run a crawl to see results.
|
491 |
+
</div>
|
492 |
+
</div>
|
493 |
+
</div>
|
494 |
+
</div>
|
495 |
+
|
496 |
+
<div id="contentTab" class="py-4 hidden">
|
497 |
+
<div class="mb-4">
|
498 |
+
<div class="relative">
|
499 |
+
<input type="text" id="contentSearch" placeholder="Search content..."
|
500 |
+
class="w-full pl-10 pr-4 py-2 border border-gray-300 rounded-lg focus:ring-blue-500 focus:border-blue-500">
|
501 |
+
<div class="absolute inset-y-0 left-0 pl-3 flex items-center pointer-events-none">
|
502 |
+
<i class="fas fa-search text-gray-400"></i>
|
503 |
+
</div>
|
504 |
+
</div>
|
505 |
+
</div>
|
506 |
+
|
507 |
+
<div id="contentResults" class="space-y-6">
|
508 |
+
<!-- Content cards will be added here dynamically -->
|
509 |
+
</div>
|
510 |
+
|
511 |
+
<div id="contentPagination" class="flex justify-center mt-6 hidden">
|
512 |
+
<nav class="inline-flex rounded-md shadow">
|
513 |
+
<button class="px-3 py-1 rounded-l-md border border-gray-300 bg-white text-sm font-medium text-gray-700 hover:bg-gray-50">
|
514 |
+
Previous
|
515 |
+
</button>
|
516 |
+
<button class="px-3 py-1 border-t border-b border-gray-300 bg-white text-sm font-medium text-blue-600 hover:bg-gray-50">
|
517 |
+
1
|
518 |
+
</button>
|
519 |
+
<button class="px-3 py-1 border border-gray-300 bg-white text-sm font-medium text-gray-700 hover:bg-gray-50 rounded-r-md">
|
520 |
+
Next
|
521 |
+
</button>
|
522 |
+
</nav>
|
523 |
+
</div>
|
524 |
+
</div>
|
525 |
+
|
526 |
+
<div id="jsonTab" class="py-4 hidden">
|
527 |
+
<div class="bg-gray-800 rounded-lg p-4">
|
528 |
+
<div class="flex justify-between items-center mb-3">
|
529 |
+
<span class="text-gray-300 font-mono text-sm">output.json</span>
|
530 |
+
<button id="copyJsonBtn" class="px-3 py-1 bg-gray-700 text-gray-300 rounded text-sm hover:bg-gray-600">
|
531 |
+
<i class="fas fa-copy mr-1"></i> Copy
|
532 |
+
</button>
|
533 |
+
</div>
|
534 |
+
<pre id="jsonViewer" class="text-gray-300 font-mono text-sm overflow-x-auto p-4 bg-gray-900 rounded">{
|
535 |
+
"message": "Run a crawl to see the JSON output here"
|
536 |
+
}</pre>
|
537 |
+
</div>
|
538 |
+
</div>
|
539 |
+
</div>
|
540 |
+
</div>
|
541 |
+
</div>
|
542 |
+
</div>
|
543 |
+
|
544 |
+
<footer class="bg-gray-100 py-6 mt-12">
|
545 |
+
<div class="container mx-auto px-4 text-center text-gray-600 text-sm">
|
546 |
+
<p>Smart Web Crawler - Extract and organize web content into structured knowledge</p>
|
547 |
+
<p class="mt-2">© 2023 AI Web Tools. All rights reserved.</p>
|
548 |
+
</div>
|
549 |
+
</footer>
|
550 |
+
|
551 |
+
<script>
|
552 |
+
// Global variables
// crawlData: single mutable record for one crawl run. Every counter starts at 0,
// every list starts empty, and both timestamps start as null.
// NOTE(review): the code that fills these fields is not visible in this part of
// the file; the field descriptions below are inferred from names and initial
// values only — confirm against the functions that update them.
|
553 |
+
let crawlData = {
|
554 |
+
startUrl: '',
|
555 |
+
startTime: null,
|
556 |
+
endTime: null,
|
557 |
+
pagesCrawled: 0,
|
558 |
+
contentSaved: 0,
|
559 |
+
totalUrls: 0,
|
560 |
+
validContent: 0,
|
561 |
+
aiApproved: 0,
|
562 |
+
// presumably the running sum behind the "Avg. Score" statistic — TODO confirm
|
563 |
+
totalScore: 0,
|
564 |
+
errorCount: 0,
|
565 |
+
results: [],
|
566 |
+
keywords: [],
|
567 |
+
// Per-content-type counters; the four keys mirror the four
|
568 |
+
// "Content to Extract" checkboxes (text, code, tables, lists).
stats: {
|
568 |
+
text: 0,
|
569 |
+
code: 0,
|
570 |
+
tables: 0,
|
571 |
+
lists: 0
|
572 |
+
},
|
573 |
+
logEntries: []
|
574 |
+
};
|
574 |
+
|
575 |
+
// Run-state variables shared by the whole script. They are declared with `let`
// because they are meant to be reassigned; the code that reassigns them is not
// visible in this part of the file.
let isCrawling = false; // no crawl is in progress at page load
|
576 |
+
let crawlInterval; // left undefined here; presumably a timer handle for the crawl loop — TODO confirm
|
577 |
+
let timerInterval; // left undefined here; presumably a timer handle for the elapsed-time display — TODO confirm
|
578 |
+
let elapsedSeconds = 0; // starts at zero; presumably backs the "Time Elapsed" readout — TODO confirm
|
579 |
+
|
580 |
+
// DOM elements
// Cached references to elements of the progress panel, crawl log and statistics
// sidebar, looked up once by id. The script tag sits after the markup, so the
// elements that exist are already in the document when these lines run.
// progressSection and resultsSection both start with the `hidden` class.
//
// NOTE(review): the visible markup contains no element with id "pagesCrawled"
// or "contentSaved", so getElementById returns null for those two constants.
// Any later `pagesCrawled.textContent = ...` style access would throw a
// TypeError. Either add the elements to the page or drop the lookups — check
// how they are used further down the script.
|
581 |
+
const progressSection = document.getElementById('progressSection');
|
582 |
+
const resultsSection = document.getElementById('resultsSection');
|
583 |
+
const startCrawlBtn = document.getElementById('startCrawlBtn');
|
584 |
+
const stopCrawlBtn = document.getElementById('stopCrawlBtn');
|
585 |
+
const crawlLog = document.getElementById('crawlLog');
|
586 |
+
const currentUrl = document.getElementById('currentUrl');
|
587 |
+
const aiAnalysis = document.getElementById('aiAnalysis');
|
588 |
+
const pagesCrawled = document.getElementById('pagesCrawled'); // null: no such id in the visible markup
|
589 |
+
const contentSaved = document.getElementById('contentSaved'); // null: no such id in the visible markup
|
590 |
+
const progressBar = document.getElementById('progressBar');
|
591 |
+
const totalUrls = document.getElementById('totalUrls');
|
592 |
+
const validContent = document.getElementById('validContent');
|
593 |
+
const aiApproved = document.getElementById('aiApproved');
|
594 |
+
const avgScore = document.getElementById('avgScore');
|
595 |
+
const errorCount = document.getElementById('errorCount');
|
596 |
+
const timeElapsed = document.getElementById('timeElapsed');
|
597 |
+
const overallPercent = document.getElementById('overallPercent');
|
598 |
+
|
599 |
+
// Progress bars
// References to the three per-phase progress bars (URL Discovery, Content
// Extraction, AI Processing) and their percentage labels. In the markup each
// *Bar element is the inner fill div with an inline `width: 0%`, and each
// *Percent element is the span that initially reads "0%".
|
600 |
+
const urlDiscoveryBar = document.getElementById('urlDiscoveryBar');
|
601 |
+
const contentExtractionBar = document.getElementById('contentExtractionBar');
|
602 |
+
const aiProcessingBar = document.getElementById('aiProcessingBar');
|
603 |
+
const urlDiscoveryPercent = document.getElementById('urlDiscoveryPercent');
|
604 |
+
const contentExtractionPercent = document.getElementById('contentExtractionPercent');
|
605 |
+
const aiProcessingPercent = document.getElementById('aiProcessingPercent');
|
606 |
+
|
607 |
+
// Progress steps
// The four step indicators, in page order: URL Validation, Site Mapping,
// Content Extraction, AI Analysis. Each starts with the classes
// `progress-step pending`; the stylesheet also defines `completed`, `active`
// and `error` variants that recolor the nested `.step-icon`.
|
608 |
+
const step1 = document.getElementById('step1');
|
609 |
+
const step2 = document.getElementById('step2');
|
610 |
+
const step3 = document.getElementById('step3');
|
611 |
+
const step4 = document.getElementById('step4');
|
612 |
+
|
613 |
+
// Configuration elements
|
614 |
+
const baseUrlInput = document.getElementById('baseUrl');
|
615 |
+
const maxDepthInput = document.getElementById('maxDepth');
|
616 |
+
const maxConcurrentInput = document.getElementById('maxConcurrent');
|
617 |
+
const extractTextCheckbox = document.getElementById('extractText');
|
618 |
+
const extractCodeCheckbox = document.getElementById('extractCode');
|
619 |
+
const extractTablesCheckbox = document.getElementById('extractTables');
|
620 |
+
const extractListsCheckbox = document.getElementById('extractLists');
|
621 |
+
const useLLMFilterCheckbox = document.getElementById('useLLMFilter');
|
622 |
+
const minLLMScoreInput = document.getElementById('minLLMScore');
|
623 |
+
const llmSettingsDiv = document.getElementById('llmSettings');
|
624 |
+
const validateUrlBtn = document.getElementById('validateUrlBtn');
|
625 |
+
|
626 |
+
// Results elements
|
627 |
+
const summaryStartUrl = document.getElementById('summaryStartUrl');
|
628 |
+
const summaryTotalPages = document.getElementById('summaryTotalPages');
|
629 |
+
const summaryDuration = document.getElementById('summaryDuration');
|
630 |
+
const summaryErrors = document.getElementById('summaryErrors');
|
631 |
+
const summaryText = document.getElementById('summaryText');
|
632 |
+
const summaryCode = document.getElementById('summaryCode');
|
633 |
+
const summaryTables = document.getElementById('summaryTables');
|
634 |
+
const summaryAvgScore = document.getElementById('summaryAvgScore');
|
635 |
+
const summaryHighScore = document.getElementById('summaryHighScore');
|
636 |
+
const summaryLowScore = document.getElementById('summaryLowScore');
|
637 |
+
const keywordCloud = document.getElementById('keywordCloud');
|
638 |
+
const topContent = document.getElementById('topContent');
|
639 |
+
const contentResults = document.getElementById('contentResults');
|
640 |
+
const jsonViewer = document.getElementById('jsonViewer');
|
641 |
+
const downloadResultsBtn = document.getElementById('downloadResultsBtn');
|
642 |
+
const clearResultsBtn = document.getElementById('clearResultsBtn');
|
643 |
+
const copyJsonBtn = document.getElementById('copyJsonBtn');
|
644 |
+
const tabSummary = document.getElementById('tabSummary');
|
645 |
+
const tabContent = document.getElementById('tabContent');
|
646 |
+
const tabJson = document.getElementById('tabJson');
|
647 |
+
|
648 |
+
// Initialize UI
|
649 |
+
document.addEventListener('DOMContentLoaded', () => {
  // The LLM settings panel is shown only while the filter checkbox is ticked.
  useLLMFilterCheckbox.addEventListener('change', (event) => {
    const panelDisplay = event.currentTarget.checked ? 'block' : 'none';
    llmSettingsDiv.style.display = panelDisplay;
  });

  // Pre-fill the start URL field.
  baseUrlInput.value = 'https://example.com';

  // Click wiring, registered in the order listed: [element, handler].
  const clickBindings = [
    [validateUrlBtn, validateUrl],
    [startCrawlBtn, startCrawling],
    [stopCrawlBtn, stopCrawling],
    [downloadResultsBtn, downloadResults],
    [clearResultsBtn, clearResults],
    [copyJsonBtn, copyJson],
    [tabSummary, () => switchTab('summary')],
    [tabContent, () => switchTab('content')],
    [tabJson, () => switchTab('json')],
  ];

  clickBindings.forEach(([element, handler]) => {
    element.addEventListener('click', handler);
  });
});
|
669 |
+
|
670 |
+
// Validate URL input
|
671 |
+
/**
 * Validates the start URL typed into the base-URL field and mirrors the
 * outcome in the form: the `urlError` hint is shown and the input gets a red
 * border when the value is rejected, and both are cleared when it is accepted.
 *
 * A value is accepted only when it starts with `http://` or `https://` AND
 * parses as an absolute URL. The prefix check alone used to let through
 * values such as a bare "https://" or a host containing spaces.
 *
 * @returns {boolean} true when the URL can be used as a crawl start point.
 */
function validateUrl() {
  const candidate = baseUrlInput.value.trim();
  const urlError = document.getElementById('urlError');

  let isValid = false;
  if (/^https?:\/\//i.test(candidate)) {
    try {
      const parsed = new URL(candidate);
      isValid = parsed.protocol === 'http:' || parsed.protocol === 'https:';
    } catch (err) {
      // new URL() throws a TypeError for unparsable input; that just means "invalid" here.
      isValid = false;
    }
  }

  if (isValid) {
    urlError.classList.add('hidden');
    baseUrlInput.classList.remove('border-red-500');
  } else {
    urlError.classList.remove('hidden');
    baseUrlInput.classList.add('border-red-500');
  }

  return isValid;
}
|
685 |
+
|
686 |
+
// Start crawling simulation
|
687 |
+
/**
 * Starts a new simulated crawl from the URL in the base-URL field.
 *
 * Does nothing when validateUrl() rejects the input. Otherwise it replaces
 * `crawlData` with a fresh session record, resets every progress indicator,
 * restarts the elapsed-time timer, reveals the progress panel, and schedules
 * simulateCrawlStep() once per second.
 *
 * Fixes over the previous version:
 *  - the clock starts at 00:00 (calling updateTimer() right after the reset
 *    advanced it to 00:01 immediately);
 *  - a crawl interval left over from an earlier run is cleared before a new
 *    one is installed;
 *  - the delayed "URL validated" update is skipped if the crawl was stopped
 *    or replaced before it fired.
 */
function startCrawling() {
  if (!validateUrl()) return;

  // Fresh session record
  crawlData = {
    startUrl: baseUrlInput.value.trim(),
    startTime: new Date(),
    endTime: null,
    pagesCrawled: 0,
    contentSaved: 0,
    totalUrls: 0,
    validContent: 0,
    aiApproved: 0,
    totalScore: 0,
    errorCount: 0,
    results: [],
    keywords: [],
    stats: {
      text: 0,
      code: 0,
      tables: 0,
      lists: 0
    },
    logEntries: []
  };
  // Identity of this run, used to detect a stop/restart in the delayed callback below.
  const session = crawlData;

  // Reset progress bars and labels
  progressBar.style.width = '0%';
  urlDiscoveryBar.style.width = '0%';
  contentExtractionBar.style.width = '0%';
  aiProcessingBar.style.width = '0%';
  urlDiscoveryPercent.textContent = '0%';
  contentExtractionPercent.textContent = '0%';
  aiProcessingPercent.textContent = '0%';
  overallPercent.textContent = '0%';

  // Reset progress steps
  step1.className = 'progress-step active';
  step2.className = 'progress-step pending';
  step3.className = 'progress-step pending';
  step4.className = 'progress-step pending';

  // Restart the elapsed-time clock from zero
  clearInterval(timerInterval);
  elapsedSeconds = 0;
  timeElapsed.textContent = '00:00';
  timerInterval = setInterval(updateTimer, 1000);

  // Show progress, hide any previous results
  progressSection.classList.remove('hidden');
  resultsSection.classList.add('hidden');

  // Update UI state
  startCrawlBtn.disabled = true;
  isCrawling = true;

  // Clear the on-screen log
  crawlLog.innerHTML = '';

  // Drive the simulation once per second (never stack two intervals)
  clearInterval(crawlInterval);
  crawlInterval = setInterval(simulateCrawlStep, 1000);

  // Initial log lines
  addLogEntry('Starting crawl from: ' + crawlData.startUrl, 'info');
  addLogEntry('Configuration: Max Depth=' + maxDepthInput.value +
    ', Max Concurrent=' + maxConcurrentInput.value, 'info');

  currentUrl.textContent = crawlData.startUrl;

  // Simulate the URL validation step finishing shortly after the start
  setTimeout(() => {
    if (!isCrawling || crawlData !== session) return;
    step1.className = 'progress-step completed';
    step2.className = 'progress-step active';
    addLogEntry('URL validated successfully', 'success');
    updateProgressBar('urlDiscovery', 10);
  }, 500);
}
|
765 |
+
|
766 |
+
// Update timer display
|
767 |
+
/**
 * Advances the elapsed-time counter by one second and renders it into the
 * `timeElapsed` element as zero-padded "MM:SS".
 */
function updateTimer() {
  elapsedSeconds += 1;

  const twoDigits = (value) => String(value).padStart(2, '0');
  const mm = twoDigits(Math.floor(elapsedSeconds / 60));
  const ss = twoDigits(elapsedSeconds % 60);

  timeElapsed.textContent = mm + ':' + ss;
}
|
773 |
+
|
774 |
+
// Update progress bars
|
775 |
+
/**
 * Sets one progress bar and its percentage label.
 *
 * For any type other than 'overall', the overall bar is then recomputed as
 * the rounded mean of the three phase labels.
 *
 * The incoming value is rounded to a whole number and clamped to 0..100
 * (non-numeric input counts as 0), so fractional or out-of-range values can
 * no longer reach the bar width or the label. Labels are read back with an
 * explicit radix, and an unparsable label counts as 0.
 *
 * @param {string} type - 'urlDiscovery' | 'contentExtraction' | 'aiProcessing' | 'overall';
 *   any other value is ignored.
 * @param {number} percent - Desired fill level in percent.
 */
function updateProgressBar(type, percent) {
  let bar;
  let percentElement;

  switch (type) {
    case 'urlDiscovery':
      bar = urlDiscoveryBar;
      percentElement = urlDiscoveryPercent;
      break;
    case 'contentExtraction':
      bar = contentExtractionBar;
      percentElement = contentExtractionPercent;
      break;
    case 'aiProcessing':
      bar = aiProcessingBar;
      percentElement = aiProcessingPercent;
      break;
    case 'overall':
      bar = progressBar;
      percentElement = overallPercent;
      break;
    default:
      return;
  }

  const numeric = Number(percent);
  const safePercent = Number.isFinite(numeric)
    ? Math.min(100, Math.max(0, Math.round(numeric)))
    : 0;

  bar.style.width = `${safePercent}%`;
  percentElement.textContent = `${safePercent}%`;

  if (type === 'overall') return;

  // The labels double as the stored state of each phase.
  const readPercent = (element) => {
    const parsed = Number.parseInt(element.textContent, 10);
    return Number.isNaN(parsed) ? 0 : parsed;
  };

  const phaseTotal = readPercent(urlDiscoveryPercent) +
    readPercent(contentExtractionPercent) +
    readPercent(aiProcessingPercent);

  updateProgressBar('overall', Math.round(phaseTotal / 3));
}
|
811 |
+
|
812 |
+
// Stop crawling
|
813 |
+
/**
 * Ends the running crawl: cancels the crawl and timer intervals, stamps
 * `crawlData.endTime`, re-enables the start button, logs the outcome, marks
 * step 4 as completed or failed depending on `crawlData.errorCount`, and
 * schedules processResults() after a short delay.
 *
 * Used both as the Stop button's click handler and by simulateCrawlStep()
 * when the simulation decides it is finished.
 *
 * Returns immediately when no crawl is running. Without that guard, a stray
 * or repeated Stop click overwrote `endTime`, logged extra "stopped" and
 * "completed" lines, and re-rendered the results.
 */
function stopCrawling() {
  if (!isCrawling) return;

  clearInterval(crawlInterval);
  clearInterval(timerInterval);
  isCrawling = false;
  crawlData.endTime = new Date();

  // Update UI
  startCrawlBtn.disabled = false;
  addLogEntry('Crawl stopped by user', 'warning');
  aiAnalysis.textContent = 'Crawl stopped - analyzing results';

  // Final step reflects whether any errors were recorded
  if (crawlData.errorCount > 0) {
    step4.className = 'progress-step error';
    addLogEntry('Crawl completed with errors', 'error');
  } else {
    step4.className = 'progress-step completed';
    addLogEntry('Crawl completed successfully', 'success');
  }

  // Render the results after a short delay
  setTimeout(processResults, 500);
}
|
836 |
+
|
837 |
+
// Simulate a crawl step
|
838 |
+
/**
 * Runs one tick of the crawl simulation (scheduled once per second by
 * startCrawling). Each tick does exactly one of:
 *   1. finish the crawl (10% chance once more than 5 pages were crawled);
 *   2. record a simulated request error (10% chance);
 *   3. "crawl" a page: discover 1-3 URLs, maybe find valid content, maybe save
 *      it, score it when the LLM filter is on, and append it to the results.
 *
 * Fixes over the previous version:
 *  - `crawlData.totalScore` now sums only the scores of AI-approved items, so
 *    `totalScore / aiApproved` is a real average (it used to sum every score
 *    while dividing by the approved count, which could exceed 100);
 *  - the minimum-score field is parsed with an explicit radix, and an empty or
 *    non-numeric threshold is treated as 0 instead of rejecting everything.
 */
function simulateCrawlStep() {
  if (!isCrawling) return;

  const pickRandom = (options) => options[Math.floor(Math.random() * options.length)];

  // Randomly decide if we're done
  if (Math.random() < 0.1 && crawlData.pagesCrawled > 5) {
    stopCrawling();
    return;
  }

  // Randomly generate errors (10% chance); an error tick crawls nothing
  if (Math.random() < 0.1) {
    const errorType = pickRandom([
      'Connection timeout',
      'SSL certificate error',
      '404 Not Found',
      '403 Forbidden',
      '500 Server Error'
    ]);
    const failedUrl = generateFakeUrl(crawlData.startUrl);

    crawlData.errorCount++;
    errorCount.textContent = crawlData.errorCount;

    addLogEntry(`Error crawling ${failedUrl}: ${errorType}`, 'error');

    // Randomly fail a step once several errors have piled up
    if (crawlData.errorCount > 2 && Math.random() < 0.3) {
      const failedStep = pickRandom([step2, step3, step4]);
      failedStep.className = 'progress-step error';
      addLogEntry(`Step failed: ${failedStep.querySelector('span').textContent}`, 'error');
    }

    return;
  }

  // Simulate finding new URLs
  const newUrls = Math.floor(Math.random() * 3) + 1;
  crawlData.totalUrls += newUrls;
  totalUrls.textContent = crawlData.totalUrls;

  // URL discovery progresses over the first five pages
  if (crawlData.pagesCrawled < 5) {
    const progress = Math.min(100, 10 + (crawlData.pagesCrawled / 5) * 90);
    updateProgressBar('urlDiscovery', progress);
  }

  // Simulate crawling a page
  crawlData.pagesCrawled++;
  pagesCrawled.textContent = crawlData.pagesCrawled;

  const fakeUrl = generateFakeUrl(crawlData.startUrl);
  currentUrl.textContent = fakeUrl;

  // Simulate AI analysis status text
  aiAnalysis.textContent = pickRandom([
    "Analyzing content structure...",
    "Evaluating content quality...",
    "Checking for relevant information...",
    "Identifying key concepts...",
    "Filtering low-quality content..."
  ]);

  // Randomly decide if we found valid content
  if (Math.random() > 0.3) {
    crawlData.validContent++;
    validContent.textContent = crawlData.validContent;

    // Simulate content being saved
    if (Math.random() > 0.5) {
      crawlData.contentSaved++;
      contentSaved.textContent = crawlData.contentSaved;

      // Content extraction is considered full after 10 saved items
      const contentProgress = Math.min(100, (crawlData.contentSaved / 10) * 100);
      updateProgressBar('contentExtraction', contentProgress);

      // Activate content extraction step if not already
      if (step3.className.includes('pending')) {
        step2.className = 'progress-step completed';
        step3.className = 'progress-step active';
        addLogEntry('Site mapping complete, starting content extraction', 'success');
      }

      // Generate fake content
      const type = pickRandom(['text', 'code', 'table', 'list']);

      // Update per-type stats
      if (type === 'text') crawlData.stats.text++;
      if (type === 'code') crawlData.stats.code++;
      if (type === 'table') crawlData.stats.tables++;
      if (type === 'list') crawlData.stats.lists++;

      // Generate a fake score only when the LLM filter is enabled (otherwise it stays 0)
      let score = 0;
      if (useLLMFilterCheckbox.checked) {
        score = Math.floor(Math.random() * 41) + 60; // 60-100

        const parsedMinimum = Number.parseInt(minLLMScoreInput.value, 10);
        const minimumScore = Number.isNaN(parsedMinimum) ? 0 : parsedMinimum;

        if (score >= minimumScore) {
          crawlData.aiApproved++;
          // Keep numerator and denominator of the average in step.
          crawlData.totalScore += score;
          aiApproved.textContent = crawlData.aiApproved;

          // AI processing is considered full after 5 approved items
          const aiProgress = Math.min(100, (crawlData.aiApproved / 5) * 100);
          updateProgressBar('aiProcessing', aiProgress);

          // Activate AI processing step if not already
          if (step4.className.includes('pending')) {
            step3.className = 'progress-step completed';
            step4.className = 'progress-step active';
            addLogEntry('Content extraction complete, starting AI analysis', 'success');
          }
        }
      }

      // Average score of the approved items
      if (crawlData.aiApproved > 0) {
        const avg = Math.round(crawlData.totalScore / crawlData.aiApproved);
        avgScore.textContent = avg;
      }

      addLogEntry(`Saved ${type} content from ${fakeUrl}` +
        (useLLMFilterCheckbox.checked ? ` (AI Score: ${score})` : ''), 'success');

      // Add to results
      const result = {
        url: fakeUrl,
        type: type,
        content: generateFakeContent(type),
        score: score,
        keywords: generateFakeKeywords()
      };

      crawlData.results.push(result);

      // Collect keywords not seen before
      result.keywords.forEach((keyword) => {
        if (!crawlData.keywords.includes(keyword)) {
          crawlData.keywords.push(keyword);
        }
      });
    }
  }

  // Occasionally add a random chatter line to the log
  if (Math.random() > 0.7) {
    const message = pickRandom([
      {msg: `Found ${newUrls} new URLs to crawl`, type: 'info'},
      {msg: "Processing page content...", type: 'info'},
      {msg: "Extracting text paragraphs...", type: 'info'},
      {msg: "Identifying code blocks...", type: 'info'},
      {msg: "Parsing table structures...", type: 'info'},
      {msg: "Waiting for server response...", type: 'warning'},
      {msg: "Rate limit approaching, slowing down requests", type: 'warning'}
    ]);
    addLogEntry(message.msg, message.type);
  }
}
|
1003 |
+
|
1004 |
+
// Add log entry with type
|
1005 |
+
/**
 * Appends a timestamped line to the on-screen crawl log, scrolls the log to
 * the bottom, and records the same line in `crawlData.logEntries`.
 *
 * The entry is assembled from DOM nodes with text content rather than an
 * HTML string: messages embed the user-typed start URL, so interpolating
 * them into innerHTML allowed markup injection into the page.
 *
 * @param {string} message - Plain-text message to display.
 * @param {string} [type='info'] - Used as a CSS class on the entry; the callers
 *   in this file pass 'info', 'success', 'warning' or 'error'.
 */
function addLogEntry(message, type = 'info') {
  const timeStr = new Date().toLocaleTimeString();

  const entry = document.createElement('div');
  entry.className = `log-entry ${type} fade-in`;

  const stamp = document.createElement('span');
  stamp.className = 'text-gray-500';
  stamp.textContent = `[${timeStr}]`;

  entry.appendChild(stamp);
  entry.appendChild(document.createTextNode(` ${message}`));

  crawlLog.appendChild(entry);
  crawlLog.scrollTop = crawlLog.scrollHeight;

  // Keep a copy in the session record
  crawlData.logEntries.push({
    time: timeStr,
    message: message,
    type: type
  });
}
|
1021 |
+
|
1022 |
+
// Process results after crawl completes
|
1023 |
+
/**
 * Reveals the results panel and fills it from `crawlData`: summary counters,
 * duration, score statistics, then the keyword cloud, top content, full
 * content list and JSON viewer (each via its own update function).
 *
 * Score statistics (average / highest / lowest) are all derived from the same
 * set: results that actually carry a score (> 0; unscored results hold 0).
 * The average used to be `totalScore / aiApproved`, which mixed a sum over one
 * set with a count of another and could land outside [lowest, highest]. When
 * the LLM filter is off or nothing was scored, the three fields show 'N/A'.
 *
 * Duration is shown as 0s when either timestamp is missing.
 */
function processResults() {
  // Show results section
  resultsSection.classList.remove('hidden');

  // Summary counters
  summaryStartUrl.textContent = crawlData.startUrl;
  summaryTotalPages.textContent = crawlData.pagesCrawled;
  summaryErrors.textContent = crawlData.errorCount;

  const hasTimestamps = Boolean(crawlData.startTime) && Boolean(crawlData.endTime);
  const duration = hasTimestamps
    ? Math.round((crawlData.endTime - crawlData.startTime) / 1000)
    : 0;
  summaryDuration.textContent = duration + 's';

  summaryText.textContent = crawlData.stats.text;
  summaryCode.textContent = crawlData.stats.code;
  summaryTables.textContent = crawlData.stats.tables;

  // Score statistics over the scored results only
  const scores = crawlData.results
    .map((result) => result.score)
    .filter((score) => score > 0);

  if (useLLMFilterCheckbox.checked && scores.length > 0) {
    const total = scores.reduce((sum, score) => sum + score, 0);
    summaryAvgScore.textContent = Math.round(total / scores.length);
    summaryHighScore.textContent = Math.max(...scores);
    summaryLowScore.textContent = Math.min(...scores);
  } else {
    summaryAvgScore.textContent = 'N/A';
    summaryHighScore.textContent = 'N/A';
    summaryLowScore.textContent = 'N/A';
  }

  // Dependent views
  updateKeywordCloud();
  updateTopContent();
  updateContentResults();
  updateJsonViewer();
}
|
1070 |
+
|
1071 |
+
// Update keyword cloud
|
1072 |
+
/**
 * Re-renders the keyword cloud from `crawlData.keywords`.
 *
 * Shows a placeholder chip when there are no keywords; otherwise shows up to
 * 12 keywords picked at random, each with a random size and colour class.
 *
 * The random pick uses a Fisher-Yates shuffle on a copy of the list. The
 * previous `sort(() => 0.5 - Math.random())` gives sort() an inconsistent
 * comparator, which produces a biased order that varies between engines.
 */
function updateKeywordCloud() {
  keywordCloud.innerHTML = '';

  if (crawlData.keywords.length === 0) {
    keywordCloud.innerHTML = '<span class="px-3 py-1 bg-gray-100 rounded-full text-sm">No keywords extracted</span>';
    return;
  }

  const sizes = ['text-xs', 'text-sm', 'text-base', 'text-lg'];
  const colors = [
    'bg-blue-100 text-blue-800',
    'bg-green-100 text-green-800',
    'bg-purple-100 text-purple-800',
    'bg-yellow-100 text-yellow-800',
    'bg-red-100 text-red-800',
    'bg-indigo-100 text-indigo-800'
  ];
  const pickRandom = (options) => options[Math.floor(Math.random() * options.length)];

  // Unbiased shuffle of a copy; crawlData.keywords itself keeps its order.
  const shuffled = [...crawlData.keywords];
  for (let i = shuffled.length - 1; i > 0; i -= 1) {
    const j = Math.floor(Math.random() * (i + 1));
    [shuffled[i], shuffled[j]] = [shuffled[j], shuffled[i]];
  }

  shuffled.slice(0, 12).forEach((keyword) => {
    const chip = document.createElement('span');
    chip.className = `px-3 py-1 rounded-full ${pickRandom(sizes)} ${pickRandom(colors)} font-medium`;
    chip.textContent = keyword;
    keywordCloud.appendChild(chip);
  });
}
|
1104 |
+
|
1105 |
+
// Update top content
|
1106 |
+
/**
 * Re-renders the "top content" cards: the three highest-scoring results, or a
 * placeholder when there are none.
 *
 * Fixes over the previous version:
 *  - every dynamic value (URL, keywords, preview text, score) is HTML-escaped
 *    before it goes into innerHTML; the URL derives from user input and the
 *    content may contain markup;
 *  - links opened with target="_blank" carry rel="noopener noreferrer";
 *  - the "..." suffix is added only when the preview was actually cut.
 */
function updateTopContent() {
  const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

  topContent.innerHTML = '';

  if (crawlData.results.length === 0) {
    topContent.innerHTML = `
<div class="p-4 bg-gray-50 rounded-lg text-sm text-gray-600">
No content has been evaluated yet. Run a crawl to see results.
</div>
`;
    return;
  }

  // Highest score first; with the LLM filter off every score is 0 and order is kept.
  const top = [...crawlData.results]
    .sort((a, b) => b.score - a.score)
    .slice(0, 3);
  const showScores = useLLMFilterCheckbox.checked;

  top.forEach((result) => {
    const card = document.createElement('div');
    card.className = 'result-card bg-white border border-gray-200 rounded-lg p-4 hover:shadow-md transition-all';

    let contentPreview = '';
    if (result.type === 'text') {
      const text = String(result.content);
      contentPreview = text.length > 150 ? `${text.substring(0, 150)}...` : text;
    } else if (result.type === 'code') {
      const lines = String(result.content).split('\n');
      contentPreview = lines.length > 1 ? `${lines[0]}...` : lines[0];
    } else if (result.type === 'table') {
      contentPreview = 'Table with ' + result.content.rows.length + ' rows';
    } else if (result.type === 'list') {
      contentPreview = 'List with ' + result.content.items.length + ' items';
    }

    const title = result.type.charAt(0).toUpperCase() + result.type.slice(1);
    const scoreBadge = showScores
      ? `<span class="px-2 py-1 bg-green-100 text-green-800 rounded-full text-xs">Score: ${escapeHtml(result.score)}</span>`
      : '';
    const keywordChips = result.keywords
      .slice(0, 2)
      .map((k) => `<span class="px-2 py-0.5 bg-gray-100 rounded-full text-xs">${escapeHtml(k)}</span>`)
      .join('');

    card.innerHTML = `
<div class="flex justify-between items-start mb-2">
<h4 class="font-medium text-blue-600">${escapeHtml(title)}</h4>
${scoreBadge}
</div>
<p class="text-sm text-gray-600 mb-3">${escapeHtml(contentPreview)}</p>
<div class="flex justify-between items-center">
<a href="${escapeHtml(result.url)}" target="_blank" rel="noopener noreferrer" class="text-xs text-blue-500 hover:underline">View source</a>
<div class="flex space-x-1">
${keywordChips}
</div>
</div>
`;

    topContent.appendChild(card);
  });
}
|
1154 |
+
|
1155 |
+
// Update content results
|
1156 |
+
/**
 * Re-renders the full list of extracted content, one card per entry in
 * `crawlData.results` (text, code, table or list), or a placeholder when the
 * list is empty.
 *
 * Every dynamic value is HTML-escaped before being placed into innerHTML.
 * Without escaping, extracted code that is itself HTML was parsed by the
 * browser instead of being shown as source, and URLs, keywords, table cells
 * and list items could inject markup. Links opened in a new tab carry
 * rel="noopener noreferrer".
 */
function updateContentResults() {
  const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

  contentResults.innerHTML = '';

  if (crawlData.results.length === 0) {
    contentResults.innerHTML = `
<div class="p-8 text-center text-gray-500">
<i class="fas fa-inbox text-4xl mb-2"></i>
<p>No content has been extracted yet.</p>
</div>
`;
    return;
  }

  const showScores = useLLMFilterCheckbox.checked;

  crawlData.results.forEach((result) => {
    const card = document.createElement('div');
    card.className = 'result-card bg-white border border-gray-200 rounded-lg p-4 hover:shadow-md transition-all fade-in';

    let contentDisplay = '';
    if (result.type === 'text') {
      contentDisplay = `<p class="text-gray-700">${escapeHtml(result.content)}</p>`;
    } else if (result.type === 'code') {
      contentDisplay = `
<div class="code-block rounded-lg p-3 overflow-x-auto">
<pre><code>${escapeHtml(result.content)}</code></pre>
</div>
`;
    } else if (result.type === 'table') {
      const headerCells = result.content.headers
        .map((h) => `<th class="px-4 py-2 text-left border">${escapeHtml(h)}</th>`)
        .join('');
      const bodyRows = result.content.rows
        .map((row) => `
<tr>
${row.map((cell) => `<td class="px-4 py-2 border">${escapeHtml(cell)}</td>`).join('')}
</tr>
`)
        .join('');
      contentDisplay = `
<div class="overflow-x-auto">
<table class="min-w-full border">
<thead>
<tr class="bg-gray-100">
${headerCells}
</tr>
</thead>
<tbody>
${bodyRows}
</tbody>
</table>
</div>
`;
    } else if (result.type === 'list') {
      contentDisplay = `
<ul class="list-disc pl-5 space-y-1">
${result.content.items.map((item) => `<li>${escapeHtml(item)}</li>`).join('')}
</ul>
`;
    }

    const title = result.type.charAt(0).toUpperCase() + result.type.slice(1);
    const safeUrl = escapeHtml(result.url);
    const scoreBadge = showScores
      ? `<span class="px-2 py-1 bg-green-100 text-green-800 rounded-full text-xs">Score: ${escapeHtml(result.score)}</span>`
      : '';
    const keywordChips = result.keywords
      .map((k) => `<span class="px-2 py-0.5 bg-gray-100 rounded-full text-xs">${escapeHtml(k)}</span>`)
      .join('');

    card.innerHTML = `
<div class="flex justify-between items-start mb-3">
<div>
<h4 class="font-medium text-blue-600">${escapeHtml(title)}</h4>
<a href="${safeUrl}" target="_blank" rel="noopener noreferrer" class="text-xs text-gray-500 hover:underline">${safeUrl}</a>
</div>
${scoreBadge}
</div>
${contentDisplay}
<div class="mt-3 pt-2 border-t border-gray-100">
<div class="flex flex-wrap gap-1">
${keywordChips}
</div>
</div>
`;

    contentResults.appendChild(card);
  });
}
|
1228 |
+
|
1229 |
+
// Update JSON viewer
|
1230 |
+
/**
 * Serialises the current crawl (metadata plus every result) as pretty-printed
 * JSON and shows it in the JSON viewer element.
 */
function updateJsonViewer() {
  const elapsedMs = crawlData.endTime - crawlData.startTime;

  // Table results are reduced to their headers and rows; other types pass through.
  const toEntry = (item) => {
    let body = item.content;
    if (item.type === 'table') {
      body = { headers: item.content.headers, rows: item.content.rows };
    }
    return {
      url: item.url,
      type: item.type,
      content: body,
      score: item.score,
      keywords: item.keywords
    };
  };

  const metadata = {
    source: crawlData.startUrl,
    pages: crawlData.pagesCrawled,
    duration: Math.round(elapsedMs / 1000) + 's',
    created: new Date().toISOString(),
    errors: crawlData.errorCount
  };

  const payload = { metadata: metadata, content: crawlData.results.map(toEntry) };
  jsonViewer.textContent = JSON.stringify(payload, null, 2);
}
|
1250 |
+
|
1251 |
+
// Switch between tabs
|
1252 |
+
/**
 * Shows one results tab and hides the other two.
 *
 * @param {string} tab - 'summary', 'content' or 'json'. The panel id is
 *   `<tab>Tab` and the tab button id is `tab<Tab>`.
 */
function switchTab(tab) {
  const tabNames = ['summary', 'content', 'json'];
  const panelOf = (name) => document.getElementById(name + 'Tab');
  const buttonOf = (name) =>
    document.getElementById('tab' + name.charAt(0).toUpperCase() + name.slice(1));

  // Reset: hide every panel, then put every button in its inactive look.
  tabNames.forEach((name) => panelOf(name).classList.add('hidden'));
  tabNames.forEach((name) => buttonOf(name).classList.remove('tab-active'));
  tabNames.forEach((name) => buttonOf(name).classList.add('text-gray-500'));

  // Activate the requested tab.
  panelOf(tab).classList.remove('hidden');
  const activeButton = buttonOf(tab);
  activeButton.classList.add('tab-active');
  activeButton.classList.remove('text-gray-500');
}
|
1269 |
+
|
1270 |
+
// Download results
|
1271 |
+
/**
 * Offers the crawl results as a file download.
 *
 * The format comes from the checked `outputFormat` radio button: 'json'
 * exports `crawlData.results` as pretty-printed JSON; any other value exports
 * the Markdown report from generateMarkdownOutput(). The radio value is also
 * used as the file extension.
 *
 * When no radio button is checked the export falls back to JSON; previously
 * that case threw a TypeError on `null.value`.
 */
function downloadResults() {
  const checkedFormat = document.querySelector('input[name="outputFormat"]:checked');
  const format = checkedFormat ? checkedFormat.value : 'json';

  const blob = format === 'json'
    ? new Blob([JSON.stringify(crawlData.results, null, 2)], { type: 'application/json' })
    : new Blob([generateMarkdownOutput()], { type: 'text/markdown' });

  // Trigger the download through a temporary, programmatically clicked link.
  const url = URL.createObjectURL(blob);
  const link = document.createElement('a');
  link.href = url;
  link.download = `crawl_results_${new Date().toISOString().slice(0, 10)}.${format}`;
  document.body.appendChild(link);
  link.click();
  document.body.removeChild(link);
  URL.revokeObjectURL(url);
}
|
1286 |
+
|
1287 |
+
// Generate markdown output
|
1288 |
+
/**
 * Builds the Markdown report for the current crawl: a header with run totals,
 * then one section per result (type, optional AI score, keywords, content),
 * each terminated by a horizontal rule.
 *
 * @returns {string} The complete Markdown document.
 */
function generateMarkdownOutput() {
  const parts = [
    `# Web Crawl Results\n\n`,
    `- **Source**: ${crawlData.startUrl}\n`,
    `- **Pages Crawled**: ${crawlData.pagesCrawled}\n`,
    `- **Content Saved**: ${crawlData.contentSaved}\n`,
    `- **Errors**: ${crawlData.errorCount}\n`,
    `- **Date**: ${new Date().toISOString()}\n\n`
  ];

  for (const entry of crawlData.results) {
    parts.push(`## ${entry.url}\n\n`);
    parts.push(`- **Type**: ${entry.type}\n`);
    if (useLLMFilterCheckbox.checked) {
      parts.push(`- **AI Score**: ${entry.score}\n`);
    }
    parts.push(`- **Keywords**: ${entry.keywords.join(', ')}\n\n`);

    switch (entry.type) {
      case 'text':
        parts.push(`${entry.content}\n\n`);
        break;
      case 'code':
        // Fenced code block
        parts.push(`\`\`\`\n${entry.content}\n\`\`\`\n\n`);
        break;
      case 'table': {
        const headers = entry.content.headers;
        parts.push(`| ${headers.join(' | ')} |\n`);
        parts.push(`| ${headers.map(() => '---').join(' | ')} |\n`);
        for (const row of entry.content.rows) {
          parts.push(`| ${row.join(' | ')} |\n`);
        }
        parts.push('\n');
        break;
      }
      case 'list':
        for (const item of entry.content.items) {
          parts.push(`- ${item}\n`);
        }
        parts.push('\n');
        break;
      default:
        break;
    }

    parts.push('---\n\n');
  }

  return parts.join('');
}
|
1327 |
+
|
1328 |
+
// Copy JSON to clipboard
|
1329 |
+
/**
 * Copies the JSON viewer's text to the clipboard and briefly switches the
 * copy button's label to "Copied!".
 *
 * Hardening over the previous version:
 *  - returns with a console error when the async Clipboard API is missing,
 *    instead of throwing on `navigator.clipboard`;
 *  - a rejected write (e.g. permission denied) is caught and logged rather
 *    than becoming an unhandled promise rejection;
 *  - the label swap is skipped if the button cannot be found.
 */
function copyJson() {
  if (!navigator.clipboard || typeof navigator.clipboard.writeText !== 'function') {
    console.error('Clipboard API is not available; cannot copy JSON');
    return;
  }

  navigator.clipboard.writeText(jsonViewer.textContent)
    .then(() => {
      const copyBtn = document.querySelector('#jsonTab button');
      if (!copyBtn) return;

      copyBtn.innerHTML = '<i class="fas fa-check mr-1"></i> Copied!';
      // Restore the normal label after two seconds.
      setTimeout(() => {
        copyBtn.innerHTML = '<i class="fas fa-copy mr-1"></i> Copy';
      }, 2000);
    })
    .catch((err) => {
      console.error('Failed to copy JSON to clipboard:', err);
    });
}
|
1339 |
+
|
1340 |
+
// Clear results
|
1341 |
+
/**
 * After user confirmation, discards the current crawl: replaces `crawlData`
 * with an empty session record, re-renders the keyword cloud, top content,
 * content list and JSON viewer, and resets the summary fields.
 */
function clearResults() {
  const confirmed = confirm('Are you sure you want to clear all results?');
  if (!confirmed) return;

  const emptyStats = { text: 0, code: 0, tables: 0, lists: 0 };
  crawlData = {
    startUrl: '',
    startTime: null,
    endTime: null,
    pagesCrawled: 0,
    contentSaved: 0,
    totalUrls: 0,
    validContent: 0,
    aiApproved: 0,
    totalScore: 0,
    errorCount: 0,
    results: [],
    keywords: [],
    stats: emptyStats,
    logEntries: []
  };

  // Re-render every view that depends on crawlData.
  updateKeywordCloud();
  updateTopContent();
  updateContentResults();
  updateJsonViewer();

  // Reset the summary: URL and duration have their own placeholders, the rest read "0".
  summaryStartUrl.textContent = '-';
  summaryTotalPages.textContent = '0';
  summaryDuration.textContent = '0s';
  const zeroedFields = [
    summaryErrors,
    summaryText,
    summaryCode,
    summaryTables,
    summaryAvgScore,
    summaryHighScore,
    summaryLowScore
  ];
  for (const fieldEl of zeroedFields) {
    fieldEl.textContent = '0';
  }
}
|
1383 |
+
|
1384 |
+
// Helper functions
|
1385 |
+
/**
 * Produces a plausible page URL below `baseUrl` for the crawl simulation:
 * a random path segment, a random extension ('', '.html', '.php' or '/'),
 * and, about 30% of the time, an `?id=<0-999>` query string.
 *
 * Trailing slashes on `baseUrl` are stripped first, so a base such as
 * "https://example.com/" no longer yields "https://example.com//about".
 *
 * @param {string} baseUrl - Start URL of the crawl.
 * @returns {string} The generated URL.
 */
function generateFakeUrl(baseUrl) {
  const paths = [
    'about', 'contact', 'products', 'services', 'blog',
    'article', 'docs', 'tutorial', 'guide', 'faq'
  ];
  const extensions = ['', '.html', '.php', '/'];
  const pickRandom = (options) => options[Math.floor(Math.random() * options.length)];

  const path = pickRandom(paths);
  const ext = pickRandom(extensions);
  const query = Math.random() > 0.7 ? '?id=' + Math.floor(Math.random() * 1000) : '';

  const root = String(baseUrl).replace(/\/+$/, '');
  return root + '/' + path + ext + query;
}
|
1398 |
+
|
1399 |
+
function generateFakeContent(type) {
|
1400 |
+
if (type === 'text') {
|
1401 |
+
const paragraphs = [
|
1402 |
+
"The quick brown fox jumps over the lazy dog. This sentence contains all the letters in the English alphabet.",
|
1403 |
+
"Web crawling is an essential technique for gathering information from websites. It involves systematically browsing the web to index and collect data.",
|
1404 |
+
"Artificial intelligence is transforming many industries by automating complex tasks and providing insights from large datasets.",
|
1405 |
+
"The future of technology lies in the convergence of AI, blockchain, and IoT, creating smarter and more connected systems.",
|
1406 |
+
"Responsive web design ensures that websites adapt to different screen sizes and devices, providing optimal viewing experiences."
|
1407 |
+
];
|
1408 |
+
return paragraphs[Math.floor(Math.random() * paragraphs.length)];
|
1409 |
+
} else if (type === 'code') {
|
1410 |
+
const languages = ['javascript', 'python', 'html', 'css', 'java'];
|
1411 |
+
const language = languages[Math.floor(Math.random() * languages.length)];
|
1412 |
+
|
1413 |
+
if (language === 'javascript') {
|
1414 |
+
return `function greet(name) {\n return "Hello, " + name + "!";\n}\n\nconst message = greet("World");\nconsole.log(message);`;
|
1415 |
+
} else if (language === 'python') {
|
1416 |
+
return `def factorial(n):\n if n == 0:\n return 1\n else:\n return n * factorial(n-1)\n\nprint(factorial(5))`;
|
1417 |
+
} else if (language === 'html') {
|
1418 |
+
return `<!DOCTYPE html>\n<html>\n<head>\n <title>Example</title>\n</head>\n<body>\n <h1>Hello World</h1>\n<p style="border-radius: 8px; text-align: center; font-size: 12px; color: #fff; margin-top: 16px;position: fixed; left: 8px; bottom: 8px; z-index: 10; background: rgba(0, 0, 0, 0.8); padding: 4px 8px;">Made with <img src="https://enzostvs-deepsite.hf.space/logo.svg" alt="DeepSite Logo" style="width: 16px; height: 16px; vertical-align: middle;display:inline-block;margin-right:3px;filter:brightness(0) invert(1);"><a href="https://enzostvs-deepsite.hf.space" style="color: #fff;text-decoration: underline;" target="_blank" >DeepSite</a> - 🧬 <a href="https://enzostvs-deepsite.hf.space?remix=gewei20/smart-web-crawler" style="color: #fff;text-decoration: underline;" target="_blank" >Remix</a></p></body>\n</html>
|
prompts.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
需要爬取运行提示,错误提示,进度条
|