gewei20 committed on
Commit
8773530
·
verified ·
1 Parent(s): 40013b0

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +268 -1375
index.html CHANGED
@@ -1,1418 +1,311 @@
1
  <!DOCTYPE html>
2
- <html lang="en">
3
  <head>
4
  <meta charset="UTF-8">
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
- <title>Smart Web Crawler</title>
7
  <script src="https://cdn.tailwindcss.com"></script>
8
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
 
 
 
 
9
  <style>
10
- .gradient-bg {
11
- background: linear-gradient(135deg, #6b73ff 0%, #000dff 100%);
12
- }
13
- .crawl-animation {
14
- animation: crawlPulse 2s infinite;
15
- }
16
- @keyframes crawlPulse {
17
- 0% { transform: translateY(0); }
18
- 50% { transform: translateY(-5px); }
19
- 100% { transform: translateY(0); }
20
- }
21
- .progress-bar {
22
- transition: width 0.3s ease;
23
- }
24
- .result-card:hover {
25
- transform: translateY(-5px);
26
- box-shadow: 0 10px 25px rgba(0, 0, 255, 0.1);
27
- }
28
- .code-block {
29
- font-family: 'Courier New', monospace;
30
- background-color: #2d3748;
31
- color: #f7fafc;
32
- }
33
- .tab-active {
34
- border-bottom: 3px solid #3b82f6;
35
- color: #3b82f6;
36
- font-weight: 600;
37
- }
38
- .fade-in {
39
- animation: fadeIn 0.5s ease-in;
40
- }
41
- @keyframes fadeIn {
42
- from { opacity: 0; }
43
- to { opacity: 1; }
44
- }
45
- .progress-step {
46
- position: relative;
47
- padding-left: 2rem;
48
- }
49
- .progress-step:before {
50
- content: '';
51
- position: absolute;
52
- left: 0.5rem;
53
- top: 0;
54
- bottom: 0;
55
- width: 2px;
56
- background-color: #e5e7eb;
57
- }
58
- .progress-step:first-child:before {
59
- top: 1rem;
60
- }
61
- .progress-step:last-child:before {
62
- bottom: calc(100% - 1rem);
63
- }
64
- .progress-step.completed .step-icon {
65
- background-color: #10b981;
66
- color: white;
67
- }
68
- .progress-step.active .step-icon {
69
- background-color: #3b82f6;
70
- color: white;
71
- }
72
- .progress-step.pending .step-icon {
73
- background-color: #e5e7eb;
74
- color: #6b7280;
75
- }
76
- .progress-step.error .step-icon {
77
- background-color: #ef4444;
78
- color: white;
79
- }
80
- .log-entry.error {
81
- color: #ef4444;
82
- }
83
- .log-entry.warning {
84
- color: #f59e0b;
85
- }
86
- .log-entry.success {
87
- color: #10b981;
88
- }
89
- .log-entry.info {
90
- color: #3b82f6;
91
- }
92
- .progress-multi {
93
- height: 6px;
94
- border-radius: 3px;
95
- }
96
  </style>
97
  </head>
98
- <body class="bg-gray-50 min-h-screen">
99
- <div class="gradient-bg text-white py-8 px-4 shadow-lg">
100
- <div class="container mx-auto">
101
- <div class="flex items-center justify-between">
102
- <div>
103
- <h1 class="text-3xl font-bold flex items-center">
104
- <i class="fas fa-spider mr-3 crawl-animation"></i> Smart Web Crawler
105
- </h1>
106
- <p class="mt-2 opacity-90">Extract and organize web content into structured knowledge</p>
107
- </div>
108
- <div class="hidden md:block">
109
- <div class="flex space-x-2">
110
- <span class="px-3 py-1 bg-blue-400 rounded-full text-xs font-semibold">AI-Powered</span>
111
- <span class="px-3 py-1 bg-purple-400 rounded-full text-xs font-semibold">Multi-Format</span>
112
- <span class="px-3 py-1 bg-green-400 rounded-full text-xs font-semibold">Smart Filtering</span>
113
- </div>
114
- </div>
115
- </div>
116
- </div>
117
- </div>
118
-
119
  <div class="container mx-auto px-4 py-8">
120
- <div class="bg-white rounded-xl shadow-lg overflow-hidden mb-8">
121
- <div class="p-6">
122
- <h2 class="text-xl font-semibold text-gray-800 mb-4">Crawler Configuration</h2>
123
-
124
- <div class="grid grid-cols-1 md:grid-cols-2 gap-6">
125
- <div>
126
- <label class="block text-sm font-medium text-gray-700 mb-1">Start URL</label>
127
- <div class="flex">
128
- <input type="text" id="baseUrl" placeholder="https://example.com"
129
- class="flex-1 px-4 py-2 border border-gray-300 rounded-l-lg focus:ring-blue-500 focus:border-blue-500">
130
- <button id="validateUrlBtn" class="px-4 py-2 bg-blue-600 text-white rounded-r-lg hover:bg-blue-700">
131
- <i class="fas fa-check"></i>
132
- </button>
133
- </div>
134
- <p id="urlError" class="text-red-500 text-xs mt-1 hidden">Please enter a valid URL starting with http:// or https://</p>
135
- </div>
136
-
137
- <div>
138
- <label class="block text-sm font-medium text-gray-700 mb-1">Output Format</label>
139
- <div class="flex space-x-4">
140
- <label class="inline-flex items-center">
141
- <input type="radio" name="outputFormat" value="json" checked class="h-4 w-4 text-blue-600 focus:ring-blue-500">
142
- <span class="ml-2">JSON</span>
143
- </label>
144
- <label class="inline-flex items-center">
145
- <input type="radio" name="outputFormat" value="md" class="h-4 w-4 text-blue-600 focus:ring-blue-500">
146
- <span class="ml-2">Markdown</span>
147
- </label>
148
- </div>
149
- </div>
150
-
151
- <div>
152
- <label class="block text-sm font-medium text-gray-700 mb-1">Max Depth</label>
153
- <input type="number" id="maxDepth" min="1" max="10" value="3"
154
- class="w-full px-4 py-2 border border-gray-300 rounded-lg focus:ring-blue-500 focus:border-blue-500">
155
- </div>
156
-
157
- <div>
158
- <label class="block text-sm font-medium text-gray-700 mb-1">Max Concurrent Requests</label>
159
- <input type="number" id="maxConcurrent" min="1" max="50" value="20"
160
- class="w-full px-4 py-2 border border-gray-300 rounded-lg focus:ring-blue-500 focus:border-blue-500">
161
- </div>
162
- </div>
163
-
164
- <div class="mt-6">
165
- <h3 class="text-md font-medium text-gray-700 mb-3">Content to Extract</h3>
166
- <div class="flex flex-wrap gap-4">
167
- <label class="inline-flex items-center">
168
- <input type="checkbox" id="extractText" checked class="h-4 w-4 text-blue-600 rounded focus:ring-blue-500">
169
- <span class="ml-2">Text Content</span>
170
- </label>
171
- <label class="inline-flex items-center">
172
- <input type="checkbox" id="extractCode" checked class="h-4 w-4 text-blue-600 rounded focus:ring-blue-500">
173
- <span class="ml-2">Code Blocks</span>
174
- </label>
175
- <label class="inline-flex items-center">
176
- <input type="checkbox" id="extractTables" checked class="h-4 w-4 text-blue-600 rounded focus:ring-blue-500">
177
- <span class="ml-2">Tables</span>
178
- </label>
179
- <label class="inline-flex items-center">
180
- <input type="checkbox" id="extractLists" checked class="h-4 w-4 text-blue-600 rounded focus:ring-blue-500">
181
- <span class="ml-2">Lists</span>
182
- </label>
183
- </div>
184
- </div>
185
-
186
- <div class="mt-6">
187
- <label class="inline-flex items-center">
188
- <input type="checkbox" id="useLLMFilter" class="h-4 w-4 text-blue-600 rounded focus:ring-blue-500">
189
- <span class="ml-2 font-medium">Enable AI Content Filtering</span>
190
- </label>
191
- <div id="llmSettings" class="mt-3 pl-6 hidden">
192
- <label class="block text-sm font-medium text-gray-700 mb-1">Minimum Quality Score (0-100)</label>
193
- <input type="number" id="minLLMScore" min="0" max="100" value="50"
194
- class="w-24 px-4 py-2 border border-gray-300 rounded-lg focus:ring-blue-500 focus:border-blue-500">
195
- </div>
196
- </div>
197
-
198
- <div class="mt-8 flex justify-center">
199
- <button id="startCrawlBtn"
200
- class="px-8 py-3 bg-blue-600 text-white rounded-lg font-semibold hover:bg-blue-700 transition-all flex items-center">
201
- <i class="fas fa-play mr-2"></i> Start Crawling
202
  </button>
203
  </div>
 
204
  </div>
205
- </div>
206
-
207
- <div id="progressSection" class="hidden">
208
- <div class="bg-white rounded-xl shadow-lg overflow-hidden mb-8">
209
- <div class="p-6">
210
- <div class="flex justify-between items-center mb-4">
211
- <h2 class="text-xl font-semibold text-gray-800">Crawling Progress</h2>
212
- <button id="stopCrawlBtn"
213
- class="px-4 py-2 bg-red-500 text-white rounded-lg text-sm hover:bg-red-600">
214
- <i class="fas fa-stop mr-1"></i> Stop
215
- </button>
216
- </div>
217
-
218
- <!-- Progress Steps -->
219
- <div class="mb-6">
220
- <div class="flex space-x-4 mb-4">
221
- <div class="progress-step pending" id="step1">
222
- <div class="flex items-center">
223
- <div class="step-icon w-6 h-6 rounded-full flex items-center justify-center mr-2">
224
- <i class="fas fa-link text-xs"></i>
225
- </div>
226
- <span class="text-sm">URL Validation</span>
227
- </div>
228
- </div>
229
- <div class="progress-step pending" id="step2">
230
- <div class="flex items-center">
231
- <div class="step-icon w-6 h-6 rounded-full flex items-center justify-center mr-2">
232
- <i class="fas fa-sitemap text-xs"></i>
233
- </div>
234
- <span class="text-sm">Site Mapping</span>
235
- </div>
236
- </div>
237
- <div class="progress-step pending" id="step3">
238
- <div class="flex items-center">
239
- <div class="step-icon w-6 h-6 rounded-full flex items-center justify-center mr-2">
240
- <i class="fas fa-file-alt text-xs"></i>
241
- </div>
242
- <span class="text-sm">Content Extraction</span>
243
- </div>
244
- </div>
245
- <div class="progress-step pending" id="step4">
246
- <div class="flex items-center">
247
- <div class="step-icon w-6 h-6 rounded-full flex items-center justify-center mr-2">
248
- <i class="fas fa-robot text-xs"></i>
249
- </div>
250
- <span class="text-sm">AI Analysis</span>
251
- </div>
252
- </div>
253
- </div>
254
-
255
- <!-- Multi-level progress bars -->
256
- <div class="space-y-2 mb-2">
257
- <div>
258
- <div class="flex justify-between text-xs text-gray-600 mb-1">
259
- <span>URL Discovery</span>
260
- <span id="urlDiscoveryPercent">0%</span>
261
- </div>
262
- <div class="w-full bg-gray-200 rounded-full h-1.5">
263
- <div id="urlDiscoveryBar" class="progress-multi bg-blue-400 h-1.5 rounded-full" style="width: 0%"></div>
264
- </div>
265
- </div>
266
- <div>
267
- <div class="flex justify-between text-xs text-gray-600 mb-1">
268
- <span>Content Extraction</span>
269
- <span id="contentExtractionPercent">0%</span>
270
- </div>
271
- <div class="w-full bg-gray-200 rounded-full h-1.5">
272
- <div id="contentExtractionBar" class="progress-multi bg-green-400 h-1.5 rounded-full" style="width: 0%"></div>
273
- </div>
274
- </div>
275
- <div>
276
- <div class="flex justify-between text-xs text-gray-600 mb-1">
277
- <span>AI Processing</span>
278
- <span id="aiProcessingPercent">0%</span>
279
- </div>
280
- <div class="w-full bg-gray-200 rounded-full h-1.5">
281
- <div id="aiProcessingBar" class="progress-multi bg-purple-400 h-1.5 rounded-full" style="width: 0%"></div>
282
- </div>
283
- </div>
284
- </div>
285
-
286
- <!-- Main progress bar -->
287
- <div class="mb-4">
288
- <div class="flex justify-between text-sm text-gray-600 mb-1">
289
- <span>Overall Progress: <span id="overallPercent">0%</span></span>
290
- <span>Time Elapsed: <span id="timeElapsed">00:00</span></span>
291
- </div>
292
- <div class="w-full bg-gray-200 rounded-full h-2.5">
293
- <div id="progressBar" class="progress-bar bg-blue-600 h-2.5 rounded-full" style="width: 0%"></div>
294
- </div>
295
- </div>
296
- </div>
297
-
298
- <div class="bg-gray-50 p-4 rounded-lg">
299
- <div class="flex items-center mb-2">
300
- <div class="w-8 h-8 rounded-full bg-blue-100 flex items-center justify-center mr-3">
301
- <i class="fas fa-spider text-blue-600"></i>
302
- </div>
303
- <div class="flex-1">
304
- <p class="text-sm font-medium">Currently Crawling:</p>
305
- <p id="currentUrl" class="text-sm text-gray-600 truncate">Waiting to start...</p>
306
- </div>
307
- </div>
308
-
309
- <div class="flex items-center">
310
- <div class="w-8 h-8 rounded-full bg-purple-100 flex items-center justify-center mr-3">
311
- <i class="fas fa-robot text-purple-600"></i>
312
- </div>
313
- <div class="flex-1">
314
- <p class="text-sm font-medium">AI Analysis:</p>
315
- <p id="aiAnalysis" class="text-sm text-gray-600">Ready to evaluate content quality</p>
316
- </div>
317
- </div>
318
- </div>
319
  </div>
320
- </div>
321
-
322
- <div class="grid grid-cols-1 lg:grid-cols-3 gap-6">
323
- <div class="lg:col-span-2">
324
- <div class="bg-white rounded-xl shadow-lg overflow-hidden">
325
- <div class="p-6">
326
- <h2 class="text-xl font-semibold text-gray-800 mb-4">Crawling Log</h2>
327
- <div id="crawlLog" class="h-96 overflow-y-auto bg-gray-50 p-4 rounded-lg font-mono text-sm space-y-2">
328
- <div class="text-gray-500">System ready. Waiting for crawl to start...</div>
329
- </div>
330
- </div>
331
- </div>
332
  </div>
333
-
334
- <div>
335
- <div class="bg-white rounded-xl shadow-lg overflow-hidden">
336
- <div class="p-6">
337
- <h2 class="text-xl font-semibold text-gray-800 mb-4">Statistics</h2>
338
- <div class="space-y-4">
339
- <div class="flex items-center justify-between p-3 bg-blue-50 rounded-lg">
340
- <div>
341
- <p class="text-xs text-gray-500">Total URLs</p>
342
- <p id="totalUrls" class="text-lg font-semibold">0</p>
343
- </div>
344
- <div class="p-2 bg-blue-100 rounded-full">
345
- <i class="fas fa-link text-blue-600"></i>
346
- </div>
347
- </div>
348
-
349
- <div class="flex items-center justify-between p-3 bg-green-50 rounded-lg">
350
- <div>
351
- <p class="text-xs text-gray-500">Valid Content</p>
352
- <p id="validContent" class="text-lg font-semibold">0</p>
353
- </div>
354
- <div class="p-2 bg-green-100 rounded-full">
355
- <i class="fas fa-check-circle text-green-600"></i>
356
- </div>
357
- </div>
358
-
359
- <div class="flex items-center justify-between p-3 bg-purple-50 rounded-lg">
360
- <div>
361
- <p class="text-xs text-gray-500">AI Approved</p>
362
- <p id="aiApproved" class="text-lg font-semibold">0</p>
363
- </div>
364
- <div class="p-2 bg-purple-100 rounded-full">
365
- <i class="fas fa-star text-purple-600"></i>
366
- </div>
367
- </div>
368
-
369
- <div class="flex items-center justify-between p-3 bg-yellow-50 rounded-lg">
370
- <div>
371
- <p class="text-xs text-gray-500">Avg. Score</p>
372
- <p id="avgScore" class="text-lg font-semibold">0</p>
373
- </div>
374
- <div class="p-2 bg-yellow-100 rounded-full">
375
- <i class="fas fa-chart-line text-yellow-600"></i>
376
- </div>
377
- </div>
378
-
379
- <div class="flex items-center justify-between p-3 bg-red-50 rounded-lg">
380
- <div>
381
- <p class="text-xs text-gray-500">Errors</p>
382
- <p id="errorCount" class="text-lg font-semibold">0</p>
383
- </div>
384
- <div class="p-2 bg-red-100 rounded-full">
385
- <i class="fas fa-exclamation-triangle text-red-600"></i>
386
- </div>
387
- </div>
388
- </div>
389
- </div>
390
  </div>
391
- </div>
392
  </div>
393
- </div>
394
-
395
- <div id="resultsSection" class="hidden mt-8">
396
- <div class="bg-white rounded-xl shadow-lg overflow-hidden">
397
- <div class="p-6">
398
- <div class="flex justify-between items-center mb-6">
399
- <h2 class="text-xl font-semibold text-gray-800">Crawl Results</h2>
400
- <div class="flex space-x-2">
401
- <button id="downloadResultsBtn" class="px-4 py-2 bg-green-600 text-white rounded-lg text-sm hover:bg-green-700">
402
- <i class="fas fa-download mr-1"></i> Download
403
- </button>
404
- <button id="clearResultsBtn" class="px-4 py-2 bg-gray-200 text-gray-700 rounded-lg text-sm hover:bg-gray-300">
405
- <i class="fas fa-trash mr-1"></i> Clear
406
- </button>
407
- </div>
408
- </div>
409
-
410
- <div class="border-b border-gray-200">
411
- <div class="flex space-x-4">
412
- <button id="tabSummary" class="tab-active px-4 py-2 text-sm font-medium">Summary</button>
413
- <button id="tabContent" class="px-4 py-2 text-sm font-medium text-gray-500 hover:text-gray-700">Content</button>
414
- <button id="tabJson" class="px-4 py-2 text-sm font-medium text-gray-500 hover:text-gray-700">JSON View</button>
415
- </div>
416
- </div>
417
-
418
- <div id="summaryTab" class="py-4">
419
- <div class="grid grid-cols-1 md:grid-cols-3 gap-6 mb-6">
420
- <div class="bg-blue-50 p-4 rounded-lg">
421
- <h3 class="font-medium text-blue-800 mb-2">Crawl Overview</h3>
422
- <ul class="space-y-2 text-sm">
423
- <li class="flex justify-between">
424
- <span class="text-gray-600">Start URL:</span>
425
- <span id="summaryStartUrl" class="font-medium">-</span>
426
- </li>
427
- <li class="flex justify-between">
428
- <span class="text-gray-600">Total Pages:</span>
429
- <span id="summaryTotalPages" class="font-medium">0</span>
430
- </li>
431
- <li class="flex justify-between">
432
- <span class="text-gray-600">Duration:</span>
433
- <span id="summaryDuration" class="font-medium">0s</span>
434
- </li>
435
- <li class="flex justify-between">
436
- <span class="text-gray-600">Errors:</span>
437
- <span id="summaryErrors" class="font-medium">0</span>
438
- </li>
439
- </ul>
440
- </div>
441
-
442
- <div class="bg-purple-50 p-4 rounded-lg">
443
- <h3 class="font-medium text-purple-800 mb-2">Content Analysis</h3>
444
- <ul class="space-y-2 text-sm">
445
- <li class="flex justify-between">
446
- <span class="text-gray-600">Text Paragraphs:</span>
447
- <span id="summaryText" class="font-medium">0</span>
448
- </li>
449
- <li class="flex justify-between">
450
- <span class="text-gray-600">Code Blocks:</span>
451
- <span id="summaryCode" class="font-medium">0</span>
452
- </li>
453
- <li class="flex justify-between">
454
- <span class="text-gray-600">Tables:</span>
455
- <span id="summaryTables" class="font-medium">0</span>
456
- </li>
457
- </ul>
458
- </div>
459
-
460
- <div class="bg-green-50 p-4 rounded-lg">
461
- <h3 class="font-medium text-green-800 mb-2">Quality Metrics</h3>
462
- <ul class="space-y-2 text-sm">
463
- <li class="flex justify-between">
464
- <span class="text-gray-600">Avg. Quality Score:</span>
465
- <span id="summaryAvgScore" class="font-medium">0</span>
466
- </li>
467
- <li class="flex justify-between">
468
- <span class="text-gray-600">Highest Score:</span>
469
- <span id="summaryHighScore" class="font-medium">0</span>
470
- </li>
471
- <li class="flex justify-between">
472
- <span class="text-gray-600">Lowest Score:</span>
473
- <span id="summaryLowScore" class="font-medium">0</span>
474
- </li>
475
- </ul>
476
- </div>
477
- </div>
478
-
479
- <div class="mb-6">
480
- <h3 class="font-medium text-gray-800 mb-3">Top Keywords</h3>
481
- <div id="keywordCloud" class="flex flex-wrap gap-2">
482
- <span class="px-3 py-1 bg-gray-100 rounded-full text-sm">No keywords extracted yet</span>
483
- </div>
484
- </div>
485
-
486
- <div>
487
- <h3 class="font-medium text-gray-800 mb-3">Best Content</h3>
488
- <div id="topContent" class="space-y-4">
489
- <div class="p-4 bg-gray-50 rounded-lg text-sm text-gray-600">
490
- No content has been evaluated yet. Run a crawl to see results.
491
- </div>
492
- </div>
493
- </div>
494
- </div>
495
-
496
- <div id="contentTab" class="py-4 hidden">
497
- <div class="mb-4">
498
- <div class="relative">
499
- <input type="text" id="contentSearch" placeholder="Search content..."
500
- class="w-full pl-10 pr-4 py-2 border border-gray-300 rounded-lg focus:ring-blue-500 focus:border-blue-500">
501
- <div class="absolute inset-y-0 left-0 pl-3 flex items-center pointer-events-none">
502
- <i class="fas fa-search text-gray-400"></i>
503
- </div>
504
- </div>
505
- </div>
506
-
507
- <div id="contentResults" class="space-y-6">
508
- <!-- Content cards will be added here dynamically -->
509
- </div>
510
-
511
- <div id="contentPagination" class="flex justify-center mt-6 hidden">
512
- <nav class="inline-flex rounded-md shadow">
513
- <button class="px-3 py-1 rounded-l-md border border-gray-300 bg-white text-sm font-medium text-gray-700 hover:bg-gray-50">
514
- Previous
515
- </button>
516
- <button class="px-3 py-1 border-t border-b border-gray-300 bg-white text-sm font-medium text-blue-600 hover:bg-gray-50">
517
- 1
518
- </button>
519
- <button class="px-3 py-1 border border-gray-300 bg-white text-sm font-medium text-gray-700 hover:bg-gray-50 rounded-r-md">
520
- Next
521
- </button>
522
- </nav>
523
- </div>
524
- </div>
525
-
526
- <div id="jsonTab" class="py-4 hidden">
527
- <div class="bg-gray-800 rounded-lg p-4">
528
- <div class="flex justify-between items-center mb-3">
529
- <span class="text-gray-300 font-mono text-sm">output.json</span>
530
- <button id="copyJsonBtn" class="px-3 py-1 bg-gray-700 text-gray-300 rounded text-sm hover:bg-gray-600">
531
- <i class="fas fa-copy mr-1"></i> Copy
532
- </button>
533
- </div>
534
- <pre id="jsonViewer" class="text-gray-300 font-mono text-sm overflow-x-auto p-4 bg-gray-900 rounded">{
535
- "message": "Run a crawl to see the JSON output here"
536
- }</pre>
537
- </div>
538
- </div>
539
  </div>
 
 
 
 
 
 
 
 
540
  </div>
541
- </div>
 
 
 
 
542
  </div>
543
-
544
- <footer class="bg-gray-100 py-6 mt-12">
545
- <div class="container mx-auto px-4 text-center text-gray-600 text-sm">
546
- <p>Smart Web Crawler - Extract and organize web content into structured knowledge</p>
547
- <p class="mt-2">© 2023 AI Web Tools. All rights reserved.</p>
548
- </div>
549
- </footer>
550
-
551
  <script>
552
// ---- Shared crawl state ----

// True between startCrawling() and stopCrawling().
let isCrawling = false;
// Handle of the setInterval that drives simulateCrawlStep.
let crawlInterval;
// Handle of the setInterval that drives updateTimer.
let timerInterval;
// Seconds counted by updateTimer since the current crawl began.
let elapsedSeconds = 0;

// Record of the current crawl; startCrawling() replaces it with a fresh
// object of the same shape on every run.
let crawlData = {
    startUrl: '',
    startTime: null,
    endTime: null,
    pagesCrawled: 0,
    contentSaved: 0,
    totalUrls: 0,
    validContent: 0,
    aiApproved: 0,
    totalScore: 0,
    errorCount: 0,
    results: [],
    keywords: [],
    stats: { text: 0, code: 0, tables: 0, lists: 0 },
    logEntries: [],
};
579
-
580
// DOM elements
const progressSection = document.getElementById('progressSection');
const resultsSection = document.getElementById('resultsSection');
const startCrawlBtn = document.getElementById('startCrawlBtn');
const stopCrawlBtn = document.getElementById('stopCrawlBtn');
const crawlLog = document.getElementById('crawlLog');
const currentUrl = document.getElementById('currentUrl');
const aiAnalysis = document.getElementById('aiAnalysis');
// The markup that accompanies this script has no #pagesCrawled or
// #contentSaved element, so getElementById returns null for both and the
// first `pagesCrawled.textContent = ...` write would throw a TypeError.
// Fall back to a detached element so such writes are harmless; if the
// elements are ever added to the page they are picked up automatically.
const pagesCrawled = document.getElementById('pagesCrawled') || document.createElement('span');
const contentSaved = document.getElementById('contentSaved') || document.createElement('span');
const progressBar = document.getElementById('progressBar');
const totalUrls = document.getElementById('totalUrls');
const validContent = document.getElementById('validContent');
const aiApproved = document.getElementById('aiApproved');
const avgScore = document.getElementById('avgScore');
const errorCount = document.getElementById('errorCount');
const timeElapsed = document.getElementById('timeElapsed');
const overallPercent = document.getElementById('overallPercent');

// Progress bars (bar element + its percentage label)
const urlDiscoveryBar = document.getElementById('urlDiscoveryBar');
const contentExtractionBar = document.getElementById('contentExtractionBar');
const aiProcessingBar = document.getElementById('aiProcessingBar');
const urlDiscoveryPercent = document.getElementById('urlDiscoveryPercent');
const contentExtractionPercent = document.getElementById('contentExtractionPercent');
const aiProcessingPercent = document.getElementById('aiProcessingPercent');

// Progress steps
const step1 = document.getElementById('step1');
const step2 = document.getElementById('step2');
const step3 = document.getElementById('step3');
const step4 = document.getElementById('step4');

// Configuration elements
const baseUrlInput = document.getElementById('baseUrl');
const maxDepthInput = document.getElementById('maxDepth');
const maxConcurrentInput = document.getElementById('maxConcurrent');
const extractTextCheckbox = document.getElementById('extractText');
const extractCodeCheckbox = document.getElementById('extractCode');
const extractTablesCheckbox = document.getElementById('extractTables');
const extractListsCheckbox = document.getElementById('extractLists');
const useLLMFilterCheckbox = document.getElementById('useLLMFilter');
const minLLMScoreInput = document.getElementById('minLLMScore');
const llmSettingsDiv = document.getElementById('llmSettings');
const validateUrlBtn = document.getElementById('validateUrlBtn');

// Results elements
const summaryStartUrl = document.getElementById('summaryStartUrl');
const summaryTotalPages = document.getElementById('summaryTotalPages');
const summaryDuration = document.getElementById('summaryDuration');
const summaryErrors = document.getElementById('summaryErrors');
const summaryText = document.getElementById('summaryText');
const summaryCode = document.getElementById('summaryCode');
const summaryTables = document.getElementById('summaryTables');
const summaryAvgScore = document.getElementById('summaryAvgScore');
const summaryHighScore = document.getElementById('summaryHighScore');
const summaryLowScore = document.getElementById('summaryLowScore');
const keywordCloud = document.getElementById('keywordCloud');
const topContent = document.getElementById('topContent');
const contentResults = document.getElementById('contentResults');
const jsonViewer = document.getElementById('jsonViewer');
const downloadResultsBtn = document.getElementById('downloadResultsBtn');
const clearResultsBtn = document.getElementById('clearResultsBtn');
const copyJsonBtn = document.getElementById('copyJsonBtn');
const tabSummary = document.getElementById('tabSummary');
const tabContent = document.getElementById('tabContent');
const tabJson = document.getElementById('tabJson');
647
-
648
// Initialize UI: wire every control to its handler once the document is parsed.
document.addEventListener('DOMContentLoaded', () => {
    // The minimum-score field is only shown while AI filtering is ticked.
    useLLMFilterCheckbox.addEventListener('change', (event) => {
        const filterEnabled = event.currentTarget.checked;
        llmSettingsDiv.style.display = filterEnabled ? 'block' : 'none';
    });

    // Pre-fill the Start URL field with a sample address.
    baseUrlInput.value = 'https://example.com';

    // Click handlers, registered in the same order as the controls appear.
    const clickBindings = [
        [validateUrlBtn, validateUrl],
        [startCrawlBtn, startCrawling],
        [stopCrawlBtn, stopCrawling],
        [downloadResultsBtn, downloadResults],
        [clearResultsBtn, clearResults],
        [copyJsonBtn, copyJson],
        [tabSummary, () => switchTab('summary')],
        [tabContent, () => switchTab('content')],
        [tabJson, () => switchTab('json')],
    ];
    for (const [control, onClick] of clickBindings) {
        control.addEventListener('click', onClick);
    }
});
669
 
670
/**
 * Validate the Start URL field and update its error styling.
 *
 * The trimmed value must start with `http://` or `https://` AND parse as a
 * URL with a non-empty host. The prefix test alone used to accept values
 * such as `https://` that cannot be crawled.
 *
 * Side effects: shows or hides the #urlError message and adds or removes the
 * red border on the input.
 *
 * @returns {boolean} true when the field holds a usable http(s) URL.
 */
function validateUrl() {
    const candidate = baseUrlInput.value.trim();
    const urlError = document.getElementById('urlError');

    const hasHttpPrefix = candidate.startsWith('http://') || candidate.startsWith('https://');

    let isValid = false;
    if (hasHttpPrefix) {
        try {
            isValid = new URL(candidate).hostname !== '';
        } catch (err) {
            // new URL() throws on unparseable input; for this form that just
            // means "not a valid URL", so report it through the UI instead.
            isValid = false;
        }
    }

    if (isValid) {
        urlError.classList.add('hidden');
        baseUrlInput.classList.remove('border-red-500');
    } else {
        urlError.classList.remove('hidden');
        baseUrlInput.classList.add('border-red-500');
    }
    return isValid;
}
685
-
686
/**
 * Start a new simulated crawl from the URL in the Start URL field.
 *
 * Returns immediately when validateUrl() rejects the input. Otherwise it
 * replaces crawlData with a fresh record, zeroes the progress bars and step
 * indicators, restarts the elapsed-time ticker, reveals the progress section,
 * hides the results section, disables the Start button, empties the log and
 * schedules simulateCrawlStep once per second.
 */
function startCrawling() {
    if (!validateUrl()) return;

    // Reset data
    crawlData = {
        startUrl: baseUrlInput.value.trim(),
        startTime: new Date(),
        endTime: null,
        pagesCrawled: 0,
        contentSaved: 0,
        totalUrls: 0,
        validContent: 0,
        aiApproved: 0,
        totalScore: 0,
        errorCount: 0,
        results: [],
        keywords: [],
        stats: { text: 0, code: 0, tables: 0, lists: 0 },
        logEntries: [],
    };

    // Reset progress bars and their labels
    for (const bar of [progressBar, urlDiscoveryBar, contentExtractionBar, aiProcessingBar]) {
        bar.style.width = '0%';
    }
    for (const label of [urlDiscoveryPercent, contentExtractionPercent, aiProcessingPercent, overallPercent]) {
        label.textContent = '0%';
    }

    // Reset progress steps: the first is in flight, the rest are waiting
    step1.className = 'progress-step active';
    step2.className = 'progress-step pending';
    step3.className = 'progress-step pending';
    step4.className = 'progress-step pending';

    // Reset timer. The display is written directly: calling updateTimer()
    // here would increment first, showing 00:01 at start and leaving the
    // clock one second ahead for the whole run.
    clearInterval(timerInterval);
    elapsedSeconds = 0;
    timeElapsed.textContent = '00:00';
    timerInterval = setInterval(updateTimer, 1000);

    // Show progress section
    progressSection.classList.remove('hidden');
    resultsSection.classList.add('hidden');

    // Update UI
    startCrawlBtn.disabled = true;
    isCrawling = true;

    // Clear log
    crawlLog.innerHTML = '';

    // Simulate crawling
    crawlInterval = setInterval(simulateCrawlStep, 1000);

    // Add initial log
    addLogEntry(`Starting crawl from: ${crawlData.startUrl}`, 'info');
    addLogEntry(
        `Configuration: Max Depth=${maxDepthInput.value}, Max Concurrent=${maxConcurrentInput.value}`,
        'info'
    );

    // Update current URL
    currentUrl.textContent = crawlData.startUrl;

    // Simulate URL validation
    setTimeout(() => {
        // The crawl may have been stopped during the delay; do not overwrite
        // the step state that stopCrawling() has already finalized.
        if (!isCrawling) return;
        step1.className = 'progress-step completed';
        step2.className = 'progress-step active';
        addLogEntry('URL validated successfully', 'success');
        updateProgressBar('urlDiscovery', 10);
    }, 500);
}
765
-
766
/**
 * Advance the elapsed-time counter by one second and render it as MM:SS
 * (each part zero-padded to at least two digits) in the timeElapsed element.
 */
function updateTimer() {
    elapsedSeconds += 1;

    const twoDigits = (value) => String(value).padStart(2, '0');
    const wholeMinutes = Math.floor(elapsedSeconds / 60);
    const leftoverSeconds = elapsedSeconds % 60;

    timeElapsed.textContent = twoDigits(wholeMinutes) + ':' + twoDigits(leftoverSeconds);
}
773
-
774
/**
 * Set one progress bar and its percentage label, then refresh the overall bar.
 *
 * @param {string} type - 'urlDiscovery', 'contentExtraction', 'aiProcessing'
 *     or 'overall'. Any other value is ignored.
 * @param {number} percent - Desired percentage. It is rounded to a whole
 *     number and clamped to 0..100 so the bar can never overflow its track;
 *     a non-finite value is ignored.
 *
 * For every type except 'overall', the overall bar is then set to the rounded
 * mean of the three stage labels currently shown on the page.
 */
function updateProgressBar(type, percent) {
    const targets = {
        urlDiscovery: { bar: urlDiscoveryBar, label: urlDiscoveryPercent },
        contentExtraction: { bar: contentExtractionBar, label: contentExtractionPercent },
        aiProcessing: { bar: aiProcessingBar, label: aiProcessingPercent },
        overall: { bar: progressBar, label: overallPercent },
    };
    // hasOwnProperty guard so inherited keys such as 'toString' are rejected.
    if (!Object.prototype.hasOwnProperty.call(targets, type)) return;

    const requested = Number(percent);
    if (!Number.isFinite(requested)) return;

    const shown = Math.min(100, Math.max(0, Math.round(requested)));
    const { bar, label } = targets[type];
    bar.style.width = `${shown}%`;
    label.textContent = `${shown}%`;

    if (type === 'overall') return;

    // The stage labels are the source of truth for the overall figure; an
    // unreadable label counts as 0 instead of turning the average into NaN.
    const readPercent = (element) => Number.parseInt(element.textContent, 10) || 0;
    const stageTotal =
        readPercent(urlDiscoveryPercent) +
        readPercent(contentExtractionPercent) +
        readPercent(aiProcessingPercent);
    const overall = Math.round(stageTotal / 3);

    progressBar.style.width = `${overall}%`;
    overallPercent.textContent = `${overall}%`;
}
811
-
812
/**
 * Stop the running crawl simulation and hand over to result processing.
 *
 * Cancels the crawl and timer intervals, stamps crawlData.endTime, re-enables
 * the Start button, logs the outcome, marks the final step as completed or
 * failed depending on crawlData.errorCount, and schedules processResults
 * after a short delay.
 *
 * Calling it while no crawl is running is a no-op. Without that guard a
 * second click on Stop (or a click after the simulation stopped itself)
 * would overwrite endTime, duplicate the log lines and process the results
 * a second time.
 */
function stopCrawling() {
    if (!isCrawling) return;

    clearInterval(crawlInterval);
    clearInterval(timerInterval);
    isCrawling = false;
    crawlData.endTime = new Date();

    // Update UI
    startCrawlBtn.disabled = false;
    addLogEntry('Crawl stopped by user', 'warning');
    aiAnalysis.textContent = 'Crawl stopped - analyzing results';

    // Mark the final step as completed or error
    const finishedWithErrors = crawlData.errorCount > 0;
    if (finishedWithErrors) {
        step4.className = 'progress-step error';
        addLogEntry('Crawl completed with errors', 'error');
    } else {
        step4.className = 'progress-step completed';
        addLogEntry('Crawl completed successfully', 'success');
    }

    // Process results after a short delay
    setTimeout(processResults, 500);
}
836
 
837
- // Simulate a crawl step
838
- function simulateCrawlStep() {
839
- if (!isCrawling) return;
840
-
841
- // Randomly decide if we're done
842
- if (Math.random() < 0.1 && crawlData.pagesCrawled > 5) {
843
- stopCrawling();
844
- return;
845
- }
846
-
847
- // Randomly generate errors (10% chance)
848
- if (Math.random() < 0.1) {
849
- const errorTypes = [
850
- 'Connection timeout',
851
- 'SSL certificate error',
852
- '404 Not Found',
853
- '403 Forbidden',
854
- '500 Server Error'
855
- ];
856
- const errorType = errorTypes[Math.floor(Math.random() * errorTypes.length)];
857
- const fakeUrl = generateFakeUrl(crawlData.startUrl);
858
-
859
- crawlData.errorCount++;
860
- errorCount.textContent = crawlData.errorCount;
861
 
862
- addLogEntry(`Error crawling ${fakeUrl}: ${errorType}`, 'error');
863
-
864
- // Randomly fail a step if we have multiple errors
865
- if (crawlData.errorCount > 2 && Math.random() < 0.3) {
866
- const steps = [step2, step3, step4];
867
- const failedStep = steps[Math.floor(Math.random() * steps.length)];
868
- failedStep.className = 'progress-step error';
869
- addLogEntry(`Step failed: ${failedStep.querySelector('span').textContent}`, 'error');
870
- }
871
-
872
- return;
873
- }
874
-
875
- // Simulate finding new URLs
876
- const newUrls = Math.floor(Math.random() * 3) + 1;
877
- crawlData.totalUrls += newUrls;
878
- totalUrls.textContent = crawlData.totalUrls;
879
-
880
- // Update URL discovery progress
881
- if (crawlData.pagesCrawled < 5) {
882
- const progress = Math.min(100, 10 + (crawlData.pagesCrawled / 5) * 90);
883
- updateProgressBar('urlDiscovery', progress);
884
- }
885
-
886
- // Simulate crawling a page
887
- crawlData.pagesCrawled++;
888
- pagesCrawled.textContent = crawlData.pagesCrawled;
889
-
890
- // Simulate URL being crawled
891
- const fakeUrl = generateFakeUrl(crawlData.startUrl);
892
- currentUrl.textContent = fakeUrl;
893
-
894
- // Simulate AI analysis
895
- const aiMessages = [
896
- "Analyzing content structure...",
897
- "Evaluating content quality...",
898
- "Checking for relevant information...",
899
- "Identifying key concepts...",
900
- "Filtering low-quality content..."
901
- ];
902
- aiAnalysis.textContent = aiMessages[Math.floor(Math.random() * aiMessages.length)];
903
-
904
- // Randomly decide if we found valid content
905
- if (Math.random() > 0.3) {
906
- crawlData.validContent++;
907
- validContent.textContent = crawlData.validContent;
908
-
909
- // Simulate content being saved
910
- if (Math.random() > 0.5) {
911
- crawlData.contentSaved++;
912
- contentSaved.textContent = crawlData.contentSaved;
913
-
914
- // Update content extraction progress
915
- const contentProgress = Math.min(100, (crawlData.contentSaved / 10) * 100);
916
- updateProgressBar('contentExtraction', contentProgress);
917
-
918
- // Activate content extraction step if not already
919
- if (step3.className.includes('pending')) {
920
- step2.className = 'progress-step completed';
921
- step3.className = 'progress-step active';
922
- addLogEntry('Site mapping complete, starting content extraction', 'success');
923
- }
924
-
925
- // Generate fake content
926
- const contentTypes = ['text', 'code', 'table', 'list'];
927
- const type = contentTypes[Math.floor(Math.random() * contentTypes.length)];
928
-
929
- // Update stats
930
- if (type === 'text') crawlData.stats.text++;
931
- if (type === 'code') crawlData.stats.code++;
932
- if (type === 'table') crawlData.stats.tables++;
933
- if (type === 'list') crawlData.stats.lists++;
934
-
935
- // Generate fake score if LLM filter is enabled
936
- let score = 0;
937
- if (useLLMFilterCheckbox.checked) {
938
- score = Math.floor(Math.random() * 41) + 60; // 60-100
939
- crawlData.totalScore += score;
940
-
941
- if (score >= parseInt(minLLMScoreInput.value)) {
942
- crawlData.aiApproved++;
943
- aiApproved.textContent = crawlData.aiApproved;
944
-
945
- // Update AI processing progress
946
- const aiProgress = Math.min(100, (crawlData.aiApproved / 5) * 100);
947
- updateProgressBar('aiProcessing', aiProgress);
948
-
949
- // Activate AI processing step if not already
950
- if (step4.className.includes('pending')) {
951
- step3.className = 'progress-step completed';
952
- step4.className = 'progress-step active';
953
- addLogEntry('Content extraction complete, starting AI analysis', 'success');
954
- }
955
- }
956
- }
957
-
958
- // Calculate average score
959
- if (crawlData.aiApproved > 0) {
960
- const avg = Math.round(crawlData.totalScore / crawlData.aiApproved);
961
- avgScore.textContent = avg;
962
- }
963
-
964
- // Add log entry
965
- addLogEntry(`Saved ${type} content from ${fakeUrl}` +
966
- (useLLMFilterCheckbox.checked ? ` (AI Score: ${score})` : ''), 'success');
967
-
968
- // Add to results
969
- const result = {
970
- url: fakeUrl,
971
- type: type,
972
- content: generateFakeContent(type),
973
- score: score,
974
- keywords: generateFakeKeywords()
975
- };
976
-
977
- crawlData.results.push(result);
978
-
979
- // Add keywords to cloud
980
- result.keywords.forEach(keyword => {
981
- if (!crawlData.keywords.includes(keyword)) {
982
- crawlData.keywords.push(keyword);
983
- }
984
- });
985
  }
 
 
 
 
 
 
 
986
  }
987
-
988
- // Add random log entries
989
- if (Math.random() > 0.7) {
990
- const logMessages = [
991
- {msg: `Found ${newUrls} new URLs to crawl`, type: 'info'},
992
- {msg: "Processing page content...", type: 'info'},
993
- {msg: "Extracting text paragraphs...", type: 'info'},
994
- {msg: "Identifying code blocks...", type: 'info'},
995
- {msg: "Parsing table structures...", type: 'info'},
996
- {msg: "Waiting for server response...", type: 'warning'},
997
- {msg: "Rate limit approaching, slowing down requests", type: 'warning'}
998
- ];
999
- const message = logMessages[Math.floor(Math.random() * logMessages.length)];
1000
- addLogEntry(message.msg, message.type);
1001
- }
1002
- }
1003
-
1004
- // Add log entry with type
1005
- function addLogEntry(message, type = 'info') {
1006
- const now = new Date();
1007
- const timeStr = now.toLocaleTimeString();
1008
- const entry = document.createElement('div');
1009
- entry.className = `log-entry ${type} fade-in`;
1010
- entry.innerHTML = `<span class="text-gray-500">[${timeStr}]</span> ${message}`;
1011
- crawlLog.appendChild(entry);
1012
- crawlLog.scrollTop = crawlLog.scrollHeight;
1013
-
1014
- // Add to crawl data
1015
- crawlData.logEntries.push({
1016
- time: timeStr,
1017
- message: message,
1018
- type: type
1019
- });
1020
- }
1021
-
1022
- // Process results after crawl completes
1023
- function processResults() {
1024
- // Show results section
1025
- resultsSection.classList.remove('hidden');
1026
-
1027
- // Update summary
1028
- summaryStartUrl.textContent = crawlData.startUrl;
1029
- summaryTotalPages.textContent = crawlData.pagesCrawled;
1030
- summaryErrors.textContent = crawlData.errorCount;
1031
-
1032
- const duration = Math.round((crawlData.endTime - crawlData.startTime) / 1000);
1033
- summaryDuration.textContent = duration + 's';
1034
-
1035
- summaryText.textContent = crawlData.stats.text;
1036
- summaryCode.textContent = crawlData.stats.code;
1037
- summaryTables.textContent = crawlData.stats.tables;
1038
-
1039
- if (useLLMFilterCheckbox.checked && crawlData.aiApproved > 0) {
1040
- const avg = Math.round(crawlData.totalScore / crawlData.aiApproved);
1041
- summaryAvgScore.textContent = avg;
1042
-
1043
- // Find high and low scores
1044
- let high = 0, low = 100;
1045
- crawlData.results.forEach(result => {
1046
- if (result.score > high) high = result.score;
1047
- if (result.score < low) low = result.score;
1048
  });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1049
 
1050
- summaryHighScore.textContent = high;
1051
- summaryLowScore.textContent = low;
1052
- } else {
1053
- summaryAvgScore.textContent = 'N/A';
1054
- summaryHighScore.textContent = 'N/A';
1055
- summaryLowScore.textContent = 'N/A';
1056
- }
1057
-
1058
- // Update keyword cloud
1059
- updateKeywordCloud();
1060
-
1061
- // Update top content
1062
- updateTopContent();
1063
-
1064
- // Update content results
1065
- updateContentResults();
1066
-
1067
- // Update JSON viewer
1068
- updateJsonViewer();
1069
- }
1070
-
1071
- // Update keyword cloud
1072
- function updateKeywordCloud() {
1073
- keywordCloud.innerHTML = '';
1074
-
1075
- if (crawlData.keywords.length === 0) {
1076
- keywordCloud.innerHTML = '<span class="px-3 py-1 bg-gray-100 rounded-full text-sm">No keywords extracted</span>';
1077
- return;
1078
- }
1079
-
1080
- // Show up to 12 keywords with random sizes
1081
- const shuffled = [...crawlData.keywords].sort(() => 0.5 - Math.random());
1082
- const selected = shuffled.slice(0, Math.min(12, shuffled.length));
1083
-
1084
- selected.forEach(keyword => {
1085
- const sizes = ['text-xs', 'text-sm', 'text-base', 'text-lg'];
1086
- const size = sizes[Math.floor(Math.random() * sizes.length)];
1087
 
1088
- const colors = [
1089
- 'bg-blue-100 text-blue-800',
1090
- 'bg-green-100 text-green-800',
1091
- 'bg-purple-100 text-purple-800',
1092
- 'bg-yellow-100 text-yellow-800',
1093
- 'bg-red-100 text-red-800',
1094
- 'bg-indigo-100 text-indigo-800'
1095
- ];
1096
- const color = colors[Math.floor(Math.random() * colors.length)];
1097
 
1098
- const el = document.createElement('span');
1099
- el.className = `px-3 py-1 rounded-full ${size} ${color} font-medium`;
1100
- el.textContent = keyword;
1101
- keywordCloud.appendChild(el);
1102
- });
1103
- }
1104
-
1105
- // Update top content
1106
- function updateTopContent() {
1107
- topContent.innerHTML = '';
1108
-
1109
- if (crawlData.results.length === 0) {
1110
- topContent.innerHTML = `
1111
- <div class="p-4 bg-gray-50 rounded-lg text-sm text-gray-600">
1112
- No content has been evaluated yet. Run a crawl to see results.
1113
- </div>
1114
- `;
1115
- return;
1116
- }
1117
-
1118
- // Sort by score (if available) or just take first few
1119
- const sorted = [...crawlData.results].sort((a, b) => b.score - a.score);
1120
- const top = sorted.slice(0, Math.min(3, sorted.length));
1121
-
1122
- top.forEach(result => {
1123
- const card = document.createElement('div');
1124
- card.className = 'result-card bg-white border border-gray-200 rounded-lg p-4 hover:shadow-md transition-all';
1125
-
1126
- let contentPreview = '';
1127
- if (result.type === 'text') {
1128
- contentPreview = result.content.substring(0, 150) + '...';
1129
- } else if (result.type === 'code') {
1130
- contentPreview = result.content.split('\n')[0] + '...';
1131
- } else if (result.type === 'table') {
1132
- contentPreview = 'Table with ' + result.content.rows.length + ' rows';
1133
- } else if (result.type === 'list') {
1134
- contentPreview = 'List with ' + result.content.items.length + ' items';
1135
  }
1136
 
1137
- card.innerHTML = `
1138
- <div class="flex justify-between items-start mb-2">
1139
- <h4 class="font-medium text-blue-600">${result.type.charAt(0).toUpperCase() + result.type.slice(1)}</h4>
1140
- ${useLLMFilterCheckbox.checked ? `<span class="px-2 py-1 bg-green-100 text-green-800 rounded-full text-xs">Score: ${result.score}</span>` : ''}
1141
- </div>
1142
- <p class="text-sm text-gray-600 mb-3">${contentPreview}</p>
1143
- <div class="flex justify-between items-center">
1144
- <a href="${result.url}" target="_blank" class="text-xs text-blue-500 hover:underline">View source</a>
1145
- <div class="flex space-x-1">
1146
- ${result.keywords.slice(0, 2).map(k => `<span class="px-2 py-0.5 bg-gray-100 rounded-full text-xs">${k}</span>`).join('')}
1147
- </div>
1148
- </div>
1149
- `;
1150
-
1151
- topContent.appendChild(card);
1152
- });
1153
- }
1154
-
1155
- // Update content results
1156
- function updateContentResults() {
1157
- contentResults.innerHTML = '';
1158
-
1159
- if (crawlData.results.length === 0) {
1160
- contentResults.innerHTML = `
1161
- <div class="p-8 text-center text-gray-500">
1162
- <i class="fas fa-inbox text-4xl mb-2"></i>
1163
- <p>No content has been extracted yet.</p>
1164
- </div>
1165
- `;
1166
  return;
1167
  }
1168
-
1169
- crawlData.results.forEach(result => {
1170
- const card = document.createElement('div');
1171
- card.className = 'result-card bg-white border border-gray-200 rounded-lg p-4 hover:shadow-md transition-all fade-in';
1172
-
1173
- let contentDisplay = '';
1174
- if (result.type === 'text') {
1175
- contentDisplay = `<p class="text-gray-700">${result.content}</p>`;
1176
- } else if (result.type === 'code') {
1177
- contentDisplay = `
1178
- <div class="code-block rounded-lg p-3 overflow-x-auto">
1179
- <pre><code>${result.content}</code></pre>
1180
- </div>
1181
- `;
1182
- } else if (result.type === 'table') {
1183
- contentDisplay = `
1184
- <div class="overflow-x-auto">
1185
- <table class="min-w-full border">
1186
- <thead>
1187
- <tr class="bg-gray-100">
1188
- ${result.content.headers.map(h => `<th class="px-4 py-2 text-left border">${h}</th>`).join('')}
1189
- </tr>
1190
- </thead>
1191
- <tbody>
1192
- ${result.content.rows.map(row => `
1193
- <tr>
1194
- ${row.map(cell => `<td class="px-4 py-2 border">${cell}</td>`).join('')}
1195
- </tr>
1196
- `).join('')}
1197
- </tbody>
1198
- </table>
1199
- </div>
1200
- `;
1201
- } else if (result.type === 'list') {
1202
- contentDisplay = `
1203
- <ul class="list-disc pl-5 space-y-1">
1204
- ${result.content.items.map(item => `<li>${item}</li>`).join('')}
1205
- </ul>
1206
- `;
1207
- }
1208
-
1209
- card.innerHTML = `
1210
- <div class="flex justify-between items-start mb-3">
1211
- <div>
1212
- <h4 class="font-medium text-blue-600">${result.type.charAt(0).toUpperCase() + result.type.slice(1)}</h4>
1213
- <a href="${result.url}" target="_blank" class="text-xs text-gray-500 hover:underline">${result.url}</a>
1214
- </div>
1215
- ${useLLMFilterCheckbox.checked ? `<span class="px-2 py-1 bg-green-100 text-green-800 rounded-full text-xs">Score: ${result.score}</span>` : ''}
1216
- </div>
1217
- ${contentDisplay}
1218
- <div class="mt-3 pt-2 border-t border-gray-100">
1219
- <div class="flex flex-wrap gap-1">
1220
- ${result.keywords.map(k => `<span class="px-2 py-0.5 bg-gray-100 rounded-full text-xs">${k}</span>`).join('')}
1221
  </div>
1222
- </div>
1223
- `;
1224
-
1225
- contentResults.appendChild(card);
1226
- });
1227
- }
1228
-
1229
- // Update JSON viewer
1230
- function updateJsonViewer() {
1231
- const output = {
1232
- metadata: {
1233
- source: crawlData.startUrl,
1234
- pages: crawlData.pagesCrawled,
1235
- duration: Math.round((crawlData.endTime - crawlData.startTime) / 1000) + 's',
1236
- created: new Date().toISOString(),
1237
- errors: crawlData.errorCount
1238
- },
1239
- content: crawlData.results.map(r => ({
1240
- url: r.url,
1241
- type: r.type,
1242
- content: r.type === 'table' ? { headers: r.content.headers, rows: r.content.rows } : r.content,
1243
- score: r.score,
1244
- keywords: r.keywords
1245
- }))
1246
- };
1247
-
1248
- jsonViewer.textContent = JSON.stringify(output, null, 2);
1249
- }
1250
-
1251
- // Switch between tabs
1252
- function switchTab(tab) {
1253
- document.getElementById('summaryTab').classList.add('hidden');
1254
- document.getElementById('contentTab').classList.add('hidden');
1255
- document.getElementById('jsonTab').classList.add('hidden');
1256
-
1257
- document.getElementById('tabSummary').classList.remove('tab-active');
1258
- document.getElementById('tabContent').classList.remove('tab-active');
1259
- document.getElementById('tabJson').classList.remove('tab-active');
1260
-
1261
- document.getElementById('tabSummary').classList.add('text-gray-500');
1262
- document.getElementById('tabContent').classList.add('text-gray-500');
1263
- document.getElementById('tabJson').classList.add('text-gray-500');
1264
-
1265
- document.getElementById(tab + 'Tab').classList.remove('hidden');
1266
- document.getElementById('tab' + tab.charAt(0).toUpperCase() + tab.slice(1)).classList.add('tab-active');
1267
- document.getElementById('tab' + tab.charAt(0).toUpperCase() + tab.slice(1)).classList.remove('text-gray-500');
1268
- }
1269
-
1270
- // Download results
1271
- function downloadResults() {
1272
- const format = document.querySelector('input[name="outputFormat"]:checked').value;
1273
- const blob = format === 'json' ?
1274
- new Blob([JSON.stringify(crawlData.results, null, 2)], { type: 'application/json' }) :
1275
- new Blob([generateMarkdownOutput()], { type: 'text/markdown' });
1276
-
1277
- const url = URL.createObjectURL(blob);
1278
- const a = document.createElement('a');
1279
- a.href = url;
1280
- a.download = `crawl_results_${new Date().toISOString().slice(0, 10)}.${format}`;
1281
- document.body.appendChild(a);
1282
- a.click();
1283
- document.body.removeChild(a);
1284
- URL.revokeObjectURL(url);
1285
- }
1286
-
1287
- // Generate markdown output
1288
- function generateMarkdownOutput() {
1289
- let md = `# Web Crawl Results\n\n`;
1290
- md += `- **Source**: ${crawlData.startUrl}\n`;
1291
- md += `- **Pages Crawled**: ${crawlData.pagesCrawled}\n`;
1292
- md += `- **Content Saved**: ${crawlData.contentSaved}\n`;
1293
- md += `- **Errors**: ${crawlData.errorCount}\n`;
1294
- md += `- **Date**: ${new Date().toISOString()}\n\n`;
1295
-
1296
- crawlData.results.forEach(result => {
1297
- md += `## ${result.url}\n\n`;
1298
- md += `- **Type**: ${result.type}\n`;
1299
- if (useLLMFilterCheckbox.checked) {
1300
- md += `- **AI Score**: ${result.score}\n`;
1301
- }
1302
- md += `- **Keywords**: ${result.keywords.join(', ')}\n\n`;
1303
-
1304
- if (result.type === 'text') {
1305
- md += `${result.content}\n\n`;
1306
- } else if (result.type === 'code') {
1307
- md += `\`\`\`\n${result.content}\n\`\`\`\n\n`;
1308
- } else if (result.type === 'table') {
1309
- md += `| ${result.content.headers.join(' | ')} |\n`;
1310
- md += `| ${result.content.headers.map(() => '---').join(' | ')} |\n`;
1311
- result.content.rows.forEach(row => {
1312
- md += `| ${row.join(' | ')} |\n`;
1313
- });
1314
- md += '\n';
1315
- } else if (result.type === 'list') {
1316
- result.content.items.forEach(item => {
1317
- md += `- ${item}\n`;
1318
- });
1319
- md += '\n';
1320
- }
1321
-
1322
- md += '---\n\n';
1323
- });
1324
-
1325
- return md;
1326
- }
1327
-
1328
- // Copy JSON to clipboard
1329
- function copyJson() {
1330
- navigator.clipboard.writeText(jsonViewer.textContent)
1331
- .then(() => {
1332
- const copyBtn = document.querySelector('#jsonTab button');
1333
- copyBtn.innerHTML = '<i class="fas fa-check mr-1"></i> Copied!';
1334
- setTimeout(() => {
1335
- copyBtn.innerHTML = '<i class="fas fa-copy mr-1"></i> Copy';
1336
- }, 2000);
1337
  });
1338
- }
1339
-
1340
- // Clear results
1341
- function clearResults() {
1342
- if (confirm('Are you sure you want to clear all results?')) {
1343
- crawlData = {
1344
- startUrl: '',
1345
- startTime: null,
1346
- endTime: null,
1347
- pagesCrawled: 0,
1348
- contentSaved: 0,
1349
- totalUrls: 0,
1350
- validContent: 0,
1351
- aiApproved: 0,
1352
- totalScore: 0,
1353
- errorCount: 0,
1354
- results: [],
1355
- keywords: [],
1356
- stats: {
1357
- text: 0,
1358
- code: 0,
1359
- tables: 0,
1360
- lists: 0
1361
- },
1362
- logEntries: []
1363
- };
1364
-
1365
- updateKeywordCloud();
1366
- updateTopContent();
1367
- updateContentResults();
1368
- updateJsonViewer();
1369
-
1370
- // Reset summary
1371
- summaryStartUrl.textContent = '-';
1372
- summaryTotalPages.textContent = '0';
1373
- summaryDuration.textContent = '0s';
1374
- summaryErrors.textContent = '0';
1375
- summaryText.textContent = '0';
1376
- summaryCode.textContent = '0';
1377
- summaryTables.textContent = '0';
1378
- summaryAvgScore.textContent = '0';
1379
- summaryHighScore.textContent = '0';
1380
- summaryLowScore.textContent = '0';
1381
  }
1382
- }
1383
-
1384
- // Helper functions
1385
- function generateFakeUrl(baseUrl) {
1386
- const paths = [
1387
- 'about', 'contact', 'products', 'services', 'blog',
1388
- 'article', 'docs', 'tutorial', 'guide', 'faq'
1389
- ];
1390
- const extensions = ['', '.html', '.php', '/'];
1391
-
1392
- const path = paths[Math.floor(Math.random() * paths.length)];
1393
- const ext = extensions[Math.floor(Math.random() * extensions.length)];
1394
- const query = Math.random() > 0.7 ? '?id=' + Math.floor(Math.random() * 1000) : '';
1395
-
1396
- return baseUrl + '/' + path + ext + query;
1397
- }
1398
-
1399
- function generateFakeContent(type) {
1400
- if (type === 'text') {
1401
- const paragraphs = [
1402
- "The quick brown fox jumps over the lazy dog. This sentence contains all the letters in the English alphabet.",
1403
- "Web crawling is an essential technique for gathering information from websites. It involves systematically browsing the web to index and collect data.",
1404
- "Artificial intelligence is transforming many industries by automating complex tasks and providing insights from large datasets.",
1405
- "The future of technology lies in the convergence of AI, blockchain, and IoT, creating smarter and more connected systems.",
1406
- "Responsive web design ensures that websites adapt to different screen sizes and devices, providing optimal viewing experiences."
1407
- ];
1408
- return paragraphs[Math.floor(Math.random() * paragraphs.length)];
1409
- } else if (type === 'code') {
1410
- const languages = ['javascript', 'python', 'html', 'css', 'java'];
1411
- const language = languages[Math.floor(Math.random() * languages.length)];
1412
-
1413
- if (language === 'javascript') {
1414
- return `function greet(name) {\n return "Hello, " + name + "!";\n}\n\nconst message = greet("World");\nconsole.log(message);`;
1415
- } else if (language === 'python') {
1416
- return `def factorial(n):\n if n == 0:\n return 1\n else:\n return n * factorial(n-1)\n\nprint(factorial(5))`;
1417
- } else if (language === 'html') {
1418
- return `<!DOCTYPE html>\n<html>\n<head>\n <title>Example</title>\n</head>\n<body>\n <h1>Hello World</h1>\n<p style="border-radius: 8px; text-align: center; font-size: 12px; color: #fff; margin-top: 16px;position: fixed; left: 8px; bottom: 8px; z-index: 10; background: rgba(0, 0, 0, 0.8); padding: 4px 8px;">Made with <img src="https://enzostvs-deepsite.hf.space/logo.svg" alt="DeepSite Logo" style="width: 16px; height: 16px; vertical-align: middle;display:inline-block;margin-right:3px;filter:brightness(0) invert(1);"><a href="https://enzostvs-deepsite.hf.space" style="color: #fff;text-decoration: underline;" target="_blank" >DeepSite</a> - 🧬 <a href="https://enzostvs-deepsite.hf.space?remix=gewei20/smart-web-crawler" style="color: #fff;text-decoration: underline;" target="_blank" >Remix</a></p></body>\n</html>
 
1
  <!DOCTYPE html>
2
+ <html lang="zh-CN">
3
  <head>
4
  <meta charset="UTF-8">
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>交互式 Markdown 知识库处理器 (最终版)</title>
7
  <script src="https://cdn.tailwindcss.com"></script>
8
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
9
+ <link rel="preconnect" href="https://fonts.googleapis.com">
10
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
11
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700;800&display=swap" rel="stylesheet">
12
+ <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
13
  <style>
14
+ body { font-family: 'Inter', sans-serif; }
15
+ .gradient-bg { background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%); }
16
+ .status-dot { width: 10px; height: 10px; border-radius: 50%; }
17
+ .status-dot.red { background-color: #ef4444; }
18
+ .status-dot.yellow { background-color: #f59e0b; animation: pulse 2s infinite; }
19
+ .status-dot.green { background-color: #22c55e; }
20
+ @keyframes pulse { 0%, 100% { opacity: 1; } 50% { opacity: .5; } }
21
+ .prose h1, .prose h2, .prose h3 { font-weight: 700; }
22
+ .prose p { margin-bottom: 1em; line-height: 1.6; }
23
+ .prose ul { list-style-type: disc; margin-left: 1.5em; }
24
+ .prose code { background-color: #e5e7eb; padding: 0.2em 0.4em; border-radius: 3px; font-size: 85%; }
25
+ .prose pre > code { background-color: transparent; padding: 0; }
26
+ details > summary { list-style: none; cursor: pointer; }
27
+ details > summary::-webkit-details-marker { display: none; }
28
+ details[open] summary .fa-chevron-down { transform: rotate(180deg); }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  </style>
30
  </head>
31
+ <body class="gradient-bg min-h-screen text-gray-800">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  <div class="container mx-auto px-4 py-8">
33
+ <header class="text-center mb-12">
34
+ <h1 class="text-4xl md:text-5xl font-bold mb-4"><span class="highlight-text relative inline-block z-10">Markdown</span> 知识库处理器</h1>
35
+ <p class="text-xl text-gray-600 max-w-3xl mx-auto">✨ Gemini 增强版:将静态文档转变为可对话、会总结的智能知识库。</p>
36
+ </header>
37
+
38
+ <section class="mb-16 bg-white p-6 sm:p-8 rounded-2xl shadow-lg border border-gray-200">
39
+ <div class="flex justify-between items-center mb-6">
40
+ <h2 class="text-3xl font-bold"><i class="fas fa-bolt text-blue-500 mr-2"></i>知识库控制台</h2>
41
+ <div id="status-container" class="flex items-center space-x-2">
42
+ <div id="status-dot" class="status-dot red"></div>
43
+ <span id="status-text" class="text-gray-600 font-medium">服务未连接</span>
44
+ </div>
45
+ </div>
46
+
47
+ <div class="bg-yellow-50 border border-yellow-200 p-6 rounded-lg mb-8">
48
+ <h3 class="font-bold text-xl mb-4 text-yellow-800"><i class="fas fa-key mr-2"></i>API 密钥配置</h3>
49
+ <p class="text-gray-700 mb-4">请输入您的应用专属 API 密钥以授权访问。</p>
50
+ <div class="flex flex-col sm:flex-row gap-4">
51
+ <input type="password" id="apiKeyInput" class="w-full px-4 py-2 border-2 border-gray-300 rounded-lg focus:ring-2 focus:ring-yellow-500" placeholder="在此输入您的 API 密钥">
52
+ <button id="saveApiKeyButton" class="bg-yellow-500 hover:bg-yellow-600 text-white font-bold py-2 px-6 rounded-lg transition-colors shadow flex-shrink-0">
53
+ <i class="fas fa-save mr-2"></i>保存密钥
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  </button>
55
  </div>
56
+ <p id="apiKeyMessage" class="text-sm text-gray-600 mt-3 h-5"></p>
57
  </div>
58
+
59
+ <div class="bg-gray-50 p-6 rounded-lg mb-8 border">
60
+ <h3 class="font-bold text-xl mb-2">1. 构建知识库</h3>
61
+ <p class="text-gray-600 mb-4">输入 Markdown 文件夹的本地绝对路径。</p>
62
+ <div class="flex flex-col sm:flex-row gap-4">
63
+ <input type="text" id="folderPathInput" class="w-full px-4 py-2 border-2 border-gray-300 rounded-lg focus:ring-2 focus:ring-indigo-500" placeholder="例如: C:\Users\YourName\Documents\Notes">
64
+ <button id="buildButton" class="bg-indigo-600 hover:bg-indigo-700 text-white font-bold py-2 px-6 rounded-lg transition-colors shadow flex-shrink-0" disabled>
65
+ <i class="fas fa-hammer mr-2"></i>开始构建
66
+ </button>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  </div>
68
+ <div class="mt-2">
69
+ <input id="clearExistingCheckbox" type="checkbox" class="h-4 w-4 text-indigo-600 border-gray-300 rounded focus:ring-indigo-500">
70
+ <label for="clearExistingCheckbox" class="ml-2 text-sm text-gray-700">在构建前清空现有知识库</label>
 
 
 
 
 
 
 
 
 
71
  </div>
72
+ <p id="build-message" class="text-sm text-gray-500 mt-3 h-5"></p>
73
+ <details class="mt-4">
74
+ <summary class="font-medium text-indigo-600">
75
+ 高级构建设置 <i class="fas fa-chevron-down ml-1 text-sm transition-transform"></i>
76
+ </summary>
77
+ <div class="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-4 gap-4 pt-4 border-t mt-2">
78
+ <div><label for="chunkSizeInput" class="block text-sm font-medium text-gray-700">块大小</label><input type="number" id="chunkSizeInput" value="4096" class="mt-1 block w-full p-2 border border-gray-300 rounded-md"></div>
79
+ <div><label for="overlapInput" class="block text-sm font-medium text-gray-700">重叠大小</label><input type="number" id="overlapInput" value="400" class="mt-1 block w-full p-2 border border-gray-300 rounded-md"></div>
80
+ <div><label for="maxFilesInput" class="block text-sm font-medium text-gray-700">最大文件数</label><input type="number" id="maxFilesInput" value="500" class="mt-1 block w-full p-2 border border-gray-300 rounded-md"></div>
81
+ <div><label for="sampleModeInput" class="block text-sm font-medium text-gray-700">采样模式</label><select id="sampleModeInput" class="mt-1 block w-full p-2 border-gray-300 rounded-md"><option value="largest">最大的</option><option value="random">随机</option><option value="recent">最新的</option></select></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  </div>
83
+ </details>
84
  </div>
85
+
86
+ <div class="bg-gray-50 p-6 rounded-lg mb-8 border">
87
+ <h3 class="font-bold text-xl mb-2">2. 搜索知识库</h3>
88
+ <div class="relative">
89
+ <input type="text" id="searchInput" class="w-full pl-4 pr-12 py-3 border-2 border-gray-300 rounded-lg" placeholder="输入问题开始搜索..." disabled>
90
+ <button id="searchButton" class="absolute inset-y-0 right-0 px-4 text-gray-600" disabled><i class="fas fa-search text-xl"></i></button>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  </div>
92
+ <details class="mt-4">
93
+ <summary class="font-medium text-blue-600">
94
+ 搜索设置 <i class="fas fa-chevron-down ml-1 text-sm transition-transform"></i>
95
+ </summary>
96
+ <div class="flex items-center gap-8 pt-4 border-t mt-2">
97
+ <div><label for="topKInput" class="block text-sm font-medium text-gray-700">返回结果数</label><input type="number" id="topKInput" value="5" class="mt-1 block w-full p-2 border border-gray-300 rounded-md"></div>
98
+ </div>
99
+ </details>
100
  </div>
101
+
102
+ <div id="summarySection" class="hidden"><div class="flex justify-between items-center mb-4"><h3 class="font-bold text-xl">✨ AI 智能总结</h3><button id="summarizeButton" class="bg-gradient-to-r from-purple-500 to-blue-500 text-white font-bold py-2 px-4 rounded-lg"><i class="fas fa-magic-wand-sparkles mr-2"></i>生成智能总结</button></div><div id="summaryResultCard" class="bg-blue-50 border-l-4 border-blue-400 p-4 rounded-r-lg"></div></div>
103
+ <div id="loadingIndicator" class="hidden text-center mt-8"><i class="fas fa-spinner fa-spin text-3xl text-blue-500"></i><p class="mt-2">正在检索...</p></div>
104
+ <div id="searchResults" class="mt-8 grid grid-cols-1 md:grid-cols-2 gap-6"></div>
105
+ </section>
106
  </div>
107
+
 
 
 
 
 
 
 
108
  <script>
109
+ document.addEventListener('DOMContentLoaded', () => {
110
+ const API_BASE_URL = 'http://127.0.0.1:5000';
111
+ const el = id => document.getElementById(id);
112
+
113
+ const statusDot = el('status-dot'), statusText = el('status-text');
114
+ const apiKeyInput = el('apiKeyInput'), saveApiKeyButton = el('saveApiKeyButton'), apiKeyMessage = el('apiKeyMessage');
115
+ const folderPathInput = el('folderPathInput'), buildButton = el('buildButton'), buildMessage = el('build-message'), clearExistingCheckbox = el('clearExistingCheckbox');
116
+ const chunkSizeInput = el('chunkSizeInput'), overlapInput = el('overlapInput'), maxFilesInput = el('maxFilesInput'), sampleModeInput = el('sampleModeInput');
117
+ const searchInput = el('searchInput'), searchButton = el('searchButton'), topKInput = el('topKInput');
118
+ const summarySection = el('summarySection'), summarizeButton = el('summarizeButton'), summaryResultCard = el('summaryResultCard');
119
+ const loadingIndicator = el('loadingIndicator'), searchResultsContainer = el('searchResults');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
+ let lastSearchResults = [];
122
+ let statusInterval;
123
+
124
/**
 * Store the API key typed into the key field in localStorage and show a
 * short-lived status message next to the field.
 *
 * Reads `apiKeyInput.value` (trimmed). On success writes the key under
 * `knowledgeBaseApiKey`; an empty value or a storage failure produces a red
 * error message instead. Whatever message is shown is cleared after 3 s.
 */
const saveApiKey = () => {
    const key = apiKeyInput.value.trim();
    if (key) {
        try {
            localStorage.setItem('knowledgeBaseApiKey', key);
            apiKeyMessage.textContent = '密钥已保存到浏览器。';
            apiKeyMessage.style.color = 'green';
        } catch (error) {
            // setItem throws when storage is full or blocked by the browser;
            // report it instead of letting the click handler die silently.
            console.warn('Unable to persist API key:', error);
            apiKeyMessage.textContent = '保存失败：浏览器存储不可用。';
            apiKeyMessage.style.color = 'red';
        }
    } else {
        apiKeyMessage.textContent = '请输入有效的密钥。';
        apiKeyMessage.style.color = 'red';
    }
    setTimeout(() => {
        apiKeyMessage.textContent = '';
    }, 3000);
};
136
+
137
/**
 * Pre-fill the API key field from localStorage, if a key was saved earlier.
 *
 * When a key is found it is written to `apiKeyInput.value` and a notice is
 * shown in `apiKeyMessage` for 3 s. When no key is stored, or storage cannot
 * be read, the function does nothing visible.
 */
const loadApiKey = () => {
    let key = null;
    try {
        key = localStorage.getItem('knowledgeBaseApiKey');
    } catch (error) {
        // Reading storage can throw when the browser blocks it. This runs
        // during page start-up, so an escaping exception would also skip the
        // initialisation steps that follow the call.
        console.warn('Unable to read stored API key:', error);
        return;
    }
    if (!key) return;

    apiKeyInput.value = key;
    apiKeyMessage.textContent = '已从本地加载密钥。';
    setTimeout(() => {
        apiKeyMessage.textContent = '';
    }, 3000);
};
145
+
146
/**
 * Build the HTTP headers for a backend request.
 *
 * @param {boolean} [isGetRequest=false] When true, no `Content-Type` header
 *   is added; otherwise the request is declared as JSON.
 * @returns {Object} Headers object; includes `X-API-Key` only when a key is
 *   stored under `knowledgeBaseApiKey` in localStorage. A warning is logged
 *   when no key is stored.
 */
const getAuthHeaders = (isGetRequest = false) => {
    const storedKey = localStorage.getItem('knowledgeBaseApiKey');
    const headers = isGetRequest ? {} : { 'Content-Type': 'application/json' };

    if (!storedKey) {
        console.warn("API Key not found in localStorage.");
        return headers;
    }

    headers['X-API-Key'] = storedKey;
    return headers;
};
 
 
159
 
160
/**
 * Fetch `/status` from the backend and mirror it in the UI.
 *
 * The response is expected to carry `message`, `is_built` and `is_building`.
 * Search controls are enabled only when the index is built and no build is
 * running; the build button is enabled whenever no build is running. The
 * status dot is yellow while building, otherwise green (built) or red.
 *
 * On any failure the UI is switched to a "connection failed" state, all
 * controls are disabled and the periodic poll is cancelled.
 */
const updateStatus = async () => {
    try {
        const response = await fetch(`${API_BASE_URL}/status`);
        if (!response.ok) {
            throw new Error('Network response was not ok');
        }
        const { message, is_built: isBuilt, is_building: isBuilding } = await response.json();

        statusText.textContent = message;
        statusDot.className = 'status-dot';

        const searchLocked = !isBuilt || Boolean(isBuilding);
        searchInput.disabled = searchLocked;
        searchButton.disabled = searchLocked;
        buildButton.disabled = Boolean(isBuilding);

        if (!isBuilding) {
            buildButton.innerHTML = '<i class="fas fa-hammer mr-2"></i>开始构建';
            statusDot.classList.add(isBuilt ? 'green' : 'red');
            return;
        }
        statusDot.classList.add('yellow');
        buildButton.innerHTML = '<i class="fas fa-spinner fa-spin mr-2"></i>构建中...';
    } catch (error) {
        statusText.textContent = '服务连接失败';
        statusDot.className = 'status-dot red';
        for (const control of [searchInput, searchButton, buildButton]) {
            control.disabled = true;
        }
        // Stop polling once the service is unreachable.
        if (statusInterval) {
            clearInterval(statusInterval);
        }
    }
};
191
+
192
/**
 * Send a knowledge-base build request for the folder entered in the form.
 *
 * Reads the folder path, the "clear existing" checkbox and the numeric
 * tuning fields, POSTs them as JSON to `/build`, shows the outcome in
 * `buildMessage`, and refreshes the status display on success.
 * An empty folder path is rejected locally without contacting the server.
 */
const handleBuild = async () => {
    const folderPath = folderPathInput.value.trim();
    if (!folderPath) {
        buildMessage.textContent = '错误：文件夹路径不能为空。';
        return;
    }

    // Parse an integer field, falling back only when the value is missing,
    // not a number, or below the smallest acceptable value. A plain
    // `parseInt(...) || fallback` would also discard a legitimate 0.
    const readInt = (input, fallback, min) => {
        const parsed = Number.parseInt(input.value, 10);
        return Number.isNaN(parsed) || parsed < min ? fallback : parsed;
    };

    const buildParams = {
        folder_path: folderPath,
        clear_existing: clearExistingCheckbox.checked,
        chunk_size: readInt(chunkSizeInput, 4096, 1),
        overlap: readInt(overlapInput, 400, 0), // 0 = no overlap between chunks
        max_files: readInt(maxFilesInput, 500, 1),
        sample_mode: sampleModeInput.value,
    };

    buildMessage.textContent = '已发送构建请求...';
    try {
        const response = await fetch(`${API_BASE_URL}/build`, {
            method: 'POST',
            headers: getAuthHeaders(),
            body: JSON.stringify(buildParams),
        });
        // Same authorization feedback as the search and summarize handlers.
        if (response.status === 403) {
            throw new Error('授权失败。请检查 API 密钥是否正确。');
        }
        // An error page may not be JSON; do not let the parse failure hide
        // the real problem behind a SyntaxError message.
        const result = (await response.json().catch(() => null)) ?? {};
        if (!response.ok) {
            throw new Error(result.error || '构建请求失败');
        }
        if (result.message) {
            buildMessage.textContent = result.message;
        }
        updateStatus();
    } catch (error) {
        buildMessage.textContent = `错误: ${error.message}`;
    }
};
218
+
219
/**
 * Run a search for the text in the search box and render the outcome.
 *
 * Clears the previous results and summary, shows the loading indicator,
 * then GETs `/search` with `query` and `top_k`. On success the result list
 * is remembered in `lastSearchResults`, rendered through `displayResults`,
 * and the summary panel is revealed when at least one result came back.
 * On failure an error notice replaces the results. Blank queries are ignored.
 */
const performSearch = async () => {
    const query = searchInput.value.trim();
    if (!query) return;

    summarySection.classList.add('hidden');
    summaryResultCard.innerHTML = '';
    searchResultsContainer.innerHTML = '';
    loadingIndicator.classList.remove('hidden');

    const searchUrl = new URL(`${API_BASE_URL}/search`);
    searchUrl.searchParams.append('query', query);
    searchUrl.searchParams.append('top_k', topKInput.value || 5);

    try {
        const response = await fetch(searchUrl, { method: 'GET', headers: getAuthHeaders(true) });
        if (response.status === 403) {
            throw new Error('授权失败。请检查 API 密钥是否正确。');
        }

        // The body of an error response is not guaranteed to be JSON.
        const payload = await response.json().catch(() => null);
        if (!response.ok) {
            throw new Error(payload?.error || '搜索失败');
        }
        // The renderer maps over the payload, so anything but an array is a
        // malformed response rather than an empty result set.
        if (!Array.isArray(payload)) {
            throw new Error('搜索失败');
        }

        lastSearchResults = payload;
        displayResults(payload, query);

        if (payload.length > 0) {
            summarySection.classList.remove('hidden');
        }
    } catch (error) {
        // The message can contain server-supplied text, so it is inserted as
        // plain text instead of being interpolated into markup.
        const notice = document.createElement('p');
        notice.className = 'text-center text-red-500 md:col-span-2';
        notice.textContent = `搜索出错: ${error.message}`;
        searchResultsContainer.innerHTML = '';
        searchResultsContainer.appendChild(notice);
    } finally {
        loadingIndicator.classList.add('hidden');
    }
};
252
+
253
/**
 * Render search results as cards inside the results container.
 *
 * @param {Array<Object>} results Result objects; each may carry `content`,
 *   a numeric `distance`, and `metadata` with `file_name` / `source`.
 * @param {string} query The query text, echoed in the "nothing found" notice.
 *
 * Every value taken from the results or the query is HTML-escaped before it
 * is placed into markup, including the `title` attribute.
 */
const displayResults = (results, query) => {
    // Escapes the characters that are significant in HTML text and in
    // double- or single-quoted attribute values; null/undefined become ''.
    const escapeHtml = (value) => String(value ?? '')
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    if (!results || results.length === 0) {
        searchResultsContainer.innerHTML = `<p class="text-center text-gray-500 md:col-span-2">未找到与 "${escapeHtml(query)}" 相关的结果。</p>`;
        return;
    }

    searchResultsContainer.innerHTML = results.map((result) => {
        // A missing distance is treated as the worst score (2.0 -> 0).
        const distance = typeof result?.distance === 'number' ? result.distance : 2.0;
        // Maps distance onto a 0..1 style score, floored at 0.
        // NOTE(review): assumes distances fall in [0, 2] - confirm with the backend.
        const similarity = Math.max(0, 1 - distance / 2);
        const fileName = result?.metadata?.file_name || '未知文件';
        const sourcePath = result?.metadata?.source || fileName;
        return `
            <div class="bg-white border border-gray-200 rounded-lg p-4 transition-all hover:shadow-md">
                <div class="flex justify-between items-center mb-3">
                    <h4 class="font-bold text-blue-700 truncate pr-4" title="${escapeHtml(sourcePath)}">${escapeHtml(fileName)}</h4>
                    <span class="text-xs font-medium bg-blue-100 text-blue-800 py-1 px-2 rounded-full flex-shrink-0">相似度: ${similarity.toFixed(4)}</span>
                </div>
                <p class="text-gray-600 text-sm break-words">${escapeHtml(result?.content)}</p>
            </div>`;
    }).join('');
};
274
+
275
/**
 * Ask the backend to summarize the most recent search results.
 *
 * Does nothing when there are no stored results. Otherwise disables the
 * button, shows a placeholder, POSTs the current search text and
 * `lastSearchResults` to `/summarize`, and renders the returned Markdown
 * summary. Failures are shown inside the summary card. The button is always
 * re-enabled afterwards with a "regenerate" label.
 */
const handleSummarize = async () => {
    if (lastSearchResults.length === 0) return;

    summarizeButton.disabled = true;
    summarizeButton.innerHTML = '<i class="fas fa-spinner fa-spin mr-2"></i>AI 正在思考...';
    summaryResultCard.innerHTML = '<p class="text-gray-600">请稍候，正在为您生成总结...</p>';

    try {
        const response = await fetch(`${API_BASE_URL}/summarize`, {
            method: 'POST',
            headers: getAuthHeaders(),
            body: JSON.stringify({ query: searchInput.value, results: lastSearchResults }),
        });
        if (response.status === 403) {
            throw new Error('授权失败。请检查 API 密钥。');
        }

        // The body of an error response is not guaranteed to be JSON.
        const data = (await response.json().catch(() => null)) ?? {};
        if (!response.ok) {
            throw new Error(data.error || '总结生成失败');
        }
        if (typeof data.summary !== 'string') {
            // Fail with the UI's own message, not a parser error.
            throw new Error('总结生成失败');
        }

        // NOTE(review): the Markdown renderer's output is inserted as HTML.
        // If `marked` is the marked.js library, it does not sanitize its
        // output; summaries derived from untrusted documents should pass
        // through an HTML sanitizer before this assignment.
        summaryResultCard.innerHTML = marked.parse(data.summary);
        summaryResultCard.classList.add('prose');
    } catch (error) {
        // The message can contain server-supplied text, so it is inserted as
        // plain text instead of being interpolated into markup.
        const notice = document.createElement('p');
        notice.className = 'text-red-500';
        notice.textContent = `生成总结时出错: ${error.message}`;
        summaryResultCard.innerHTML = '';
        summaryResultCard.appendChild(notice);
    } finally {
        summarizeButton.disabled = false;
        summarizeButton.innerHTML = '<i class="fas fa-magic-wand-sparkles mr-2"></i>重新生成总结';
    }
};
298
+
299
// Wire the controls to their handlers.
saveApiKeyButton.addEventListener('click', saveApiKey);
buildButton.addEventListener('click', handleBuild);
searchButton.addEventListener('click', performSearch);
// Enter in the search box starts a search. The check runs on keydown so that
// an Enter used to commit an input-method (IME) composition can be told
// apart and ignored; held-down auto-repeat is ignored as well.
searchInput.addEventListener('keydown', (event) => {
    if (event.key !== 'Enter' || event.repeat) return;
    if (event.isComposing || event.keyCode === 229) return;
    performSearch();
});
summarizeButton.addEventListener('click', handleSummarize);

// Initial state: restore the saved key, show the service status right away,
// then keep polling it every 5 seconds.
loadApiKey();
updateStatus();
statusInterval = setInterval(updateStatus, 5000);
308
+ });
309
+ </script>
310
+ </body>
311
+ </html>