import io
import json
import os
import re
import time

import requests
import gradio as gr
import pandas as pd
from difflib import get_close_matches
from functools import partial
from itertools import islice
from multiprocessing.pool import ThreadPool
from pathlib import Path
from queue import Queue, Empty
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Set, TypeVar, Union

from huggingface_hub import InferenceClient, create_repo, DatasetCard
from huggingface_hub.utils import HfHubHTTPError
# --- Configuration ---
model_id = "microsoft/Phi-3-mini-4k-instruct"
client = InferenceClient(model_id)
save_dataset_hf_token = os.environ.get("SAVE_DATASET_HF_TOKEN")

MAX_TOTAL_NB_ITEMS = 100
MAX_NB_ITEMS_PER_GENERATION_CALL = 10
NUM_ROWS = 100
NUM_VARIANTS = 10
NAMESPACE = "infinite-dataset-hub"
URL = "https://huggingface.co/spaces/infinite-dataset-hub/infinite-dataset-hub"
# --- Prompt Templates ---
GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY = (
    "A Machine Learning Practitioner is looking for a dataset that matches '{search_query}'. "
    f"Generate a list of {MAX_NB_ITEMS_PER_GENERATION_CALL} names of quality datasets that don't exist but sound plausible and would "
    "be helpful. Feel free to reuse words from the query '{search_query}' to name the datasets. "
    "Every dataset should be about '{search_query}' and have descriptive tags/keywords including the ML task name associated with the dataset (classification, regression, anomaly detection, etc.). Use the following format:\n1. DatasetName1 (tag1, tag2, tag3)\n2. DatasetName2 (tag1, tag2, tag3)"
)
GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS = (
    "An ML practitioner is looking for a dataset CSV after the query '{search_query}'. "
    "Generate the first 5 rows of a plausible and quality CSV for the dataset '{dataset_name}'. "
    "You can get inspiration from related keywords '{tags}' but most importantly the dataset should correspond to the query '{search_query}'. "
    "Focus on quality text content and use a 'label' or 'labels' column if it makes sense (invent labels, avoid reusing the keywords, be accurate while labelling texts). "
    "Reply using a short description of the dataset with title **Dataset Description:** followed by the CSV content in a code block and with title **CSV Content Preview:**."
)
GENERATE_MORE_ROWS = "Can you give me 10 additional samples in CSV format as well? Use the same CSV header '{csv_header}'."
GENERATE_VARIANTS_WITH_RARITY_AND_LABEL = "Focus on generating samples for the label '{label}' and ideally generate {rarity} samples."
GENERATE_VARIANTS_WITH_RARITY = "Focus on generating {rarity} samples."
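# Illustrative usage (a sketch; the queries and header below are made up).
# Note the mix of one f-string line (interpolated at import time) with
# str.format placeholders (interpolated per request):
#   prompt = GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY.format(search_query="solar panel efficiency")
#   followup = GENERATE_MORE_ROWS.format(csv_header="id,text,label")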
# --- Default Datasets for Landing Page ---
landing_page_datasets_generated_text = """
1. NewsEventsPredict (classification, media, trend)
2. FinancialForecast (economy, stocks, regression)
3. HealthMonitor (science, real-time, anomaly detection)
4. SportsAnalysis (classification, performance, player tracking)
5. SciLiteracyTools (language modeling, science literacy, text classification)
6. RetailSalesAnalyzer (consumer behavior, sales trend, segmentation)
7. SocialSentimentEcho (social media, emotion analysis, clustering)
8. NewsEventTracker (classification, public awareness, topical clustering)
9. HealthVitalSigns (anomaly detection, biometrics, prediction)
10. GameStockPredict (classification, finance, sports contingency)
"""
default_output = landing_page_datasets_generated_text.strip().split("\n")
assert len(default_output) == MAX_NB_ITEMS_PER_GENERATION_CALL
# --- Dataset Card Template ---
DATASET_CARD_CONTENT = """
---
license: mit
tags:
- infinite-dataset-hub
- synthetic
---

{title}

_Note: This is an AI-generated dataset so its content may be inaccurate or false_

{content}

**Source of the data:**

The dataset was generated using the [Infinite Dataset Hub]({url}) and {model_id} using the query '{search_query}':

- **Dataset Generation Page**: {dataset_url}
- **Model**: https://huggingface.co/{model_id}
- **More Datasets**: https://huggingface.co/datasets?other=infinite-dataset-hub
"""
# --- Gradio HTML ---
# Raw string: the embedded JavaScript contains regex literals with `\n` and
# escaped backticks that must not be processed as Python escape sequences.
html = r"""
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Infinite Dataset Hub</title>
  <script src="https://cdn.tailwindcss.com"></script>
  <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/papaparse@5.3.0/papaparse.min.js"></script>
  <script>
    tailwind.config = {
      darkMode: 'class',
      theme: {
        extend: {
          colors: {
            primary: '#5D5CDE',
          },
        }
      }
    }
  </script>
  <style>
    .shimmer {
      background: linear-gradient(90deg, #f0f0f0 25%, #e0e0e0 50%, #f0f0f0 75%);
      background-size: 200% 100%;
      animation: shimmer 1.5s infinite;
      border-radius: 4px;
    }
    @keyframes shimmer {
      0% { background-position: -200% 0; }
      100% { background-position: 200% 0; }
    }
    /* Dark mode overrides */
    .dark .shimmer {
      background: linear-gradient(90deg, #2a2a2a 25%, #3a3a3a 50%, #2a2a2a 75%);
      background-size: 200% 100%;
    }
    .dataset-card {
      transition: transform 0.2s, box-shadow 0.2s;
    }
    .dataset-card:hover {
      transform: translateY(-2px);
      box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);
    }
    .dark .dataset-card:hover {
      box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.3), 0 4px 6px -2px rgba(0, 0, 0, 0.2);
    }
    /* Table styling */
    table {
      width: 100%;
      border-collapse: collapse;
      margin: 1rem 0;
    }
    table thead th {
      background-color: #f3f4f6;
      padding: 0.75rem;
      text-align: left;
      font-weight: 600;
    }
    .dark table thead th {
      background-color: #374151;
    }
    table tbody td {
      padding: 0.75rem;
      border-top: 1px solid #e5e7eb;
    }
    .dark table tbody td {
      border-top: 1px solid #4b5563;
    }
    table tbody tr:nth-child(even) {
      background-color: #f9fafb;
    }
    .dark table tbody tr:nth-child(even) {
      background-color: #1f2937;
    }
    /* Search engine badge */
    .engine-badge {
      position: absolute;
      top: -8px;
      right: -8px;
      font-size: 0.7rem;
      padding: 2px 6px;
      border-radius: 9999px;
      background-color: #5D5CDE;
      color: white;
      box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }
    .dark .engine-badge {
      box-shadow: 0 2px 4px rgba(0,0,0,0.3);
    }
    /* Toggle switch */
    .toggle-switch {
      position: relative;
      display: inline-block;
      width: 50px;
      height: 24px;
    }
    .toggle-switch input {
      opacity: 0;
      width: 0;
      height: 0;
    }
    .toggle-slider {
      position: absolute;
      cursor: pointer;
      top: 0;
      left: 0;
      right: 0;
      bottom: 0;
      background-color: #ccc;
      transition: .4s;
      border-radius: 24px;
    }
    .toggle-slider:before {
      position: absolute;
      content: "";
      height: 16px;
      width: 16px;
      left: 4px;
      bottom: 4px;
      background-color: white;
      transition: .4s;
      border-radius: 50%;
    }
    input:checked + .toggle-slider {
      background-color: #5D5CDE;
    }
    input:checked + .toggle-slider:before {
      transform: translateX(26px);
    }
  </style>
</head>
<body class="bg-white dark:bg-gray-900 text-gray-800 dark:text-gray-200 min-h-screen">
  <!-- Dark mode detection -->
  <script>
    if (window.matchMedia && window.matchMedia('(prefers-color-scheme: dark)').matches) {
      document.documentElement.classList.add('dark');
    }
    window.matchMedia('(prefers-color-scheme: dark)').addEventListener('change', event => {
      if (event.matches) {
        document.documentElement.classList.add('dark');
      } else {
        document.documentElement.classList.remove('dark');
      }
    });
  </script>

  <div class="container mx-auto px-4 py-8">
    <!-- Header -->
    <header class="text-center mb-8">
      <h1 class="text-3xl font-bold mb-2">🤗 Infinite Dataset Hub ♾️</h1>
      <p class="text-lg text-gray-600 dark:text-gray-400">Generate datasets from AI and real-world data sources</p>
    </header>
    <!-- Main Content -->
    <main>
      <!-- Search Section -->
      <div id="search-page" class="mb-8">
        <div class="max-w-3xl mx-auto">
          <div class="mb-4">
            <div class="flex mb-2">
              <input id="search-input" type="text" placeholder="Search datasets, get infinite results"
                class="flex-grow px-4 py-3 text-base rounded-l-lg border border-gray-300 dark:border-gray-700 focus:outline-none focus:ring-2 focus:ring-primary dark:bg-gray-800">
              <button id="search-button" class="bg-primary text-white px-6 py-3 rounded-r-lg hover:bg-opacity-90 transition">
                🔍
              </button>
            </div>
            <div class="flex items-center justify-between p-3 bg-gray-100 dark:bg-gray-800 rounded-lg">
              <div class="flex items-center">
                <label class="toggle-switch mr-3">
                  <input type="checkbox" id="data-source-toggle" checked>
                  <span class="toggle-slider"></span>
                </label>
                <div>
                  <span id="data-source-text" class="font-medium">Using: Real + AI Data</span>
                  <p class="text-xs text-gray-500 dark:text-gray-400">Toggle to switch between data sources</p>
                </div>
              </div>
              <button id="engine-settings-button" class="text-primary hover:underline flex items-center">
                <svg xmlns="http://www.w3.org/2000/svg" class="h-5 w-5 mr-1" viewBox="0 0 20 20" fill="currentColor">
                  <path fill-rule="evenodd" d="M11.49 3.17c-.38-1.56-2.6-1.56-2.98 0a1.532 1.532 0 01-2.286.948c-1.372-.836-2.942.734-2.106 2.106.54.886.061 2.042-.947 2.287-1.561.379-1.561 2.6 0 2.978a1.532 1.532 0 01.947 2.287c-.836 1.372.734 2.942 2.106 2.106a1.532 1.532 0 012.287.947c.379 1.561 2.6 1.561 2.978 0a1.533 1.533 0 012.287-.947c1.372.836 2.942-.734 2.106-2.106a1.533 1.533 0 01.947-2.287c1.561-.379 1.561-2.6 0-2.978a1.532 1.532 0 01-.947-2.287c.836-1.372-.734-2.942-2.106-2.106a1.532 1.532 0 01-2.287-.947zM10 13a3 3 0 100-6 3 3 0 000 6z" clip-rule="evenodd" />
                </svg>
                Search Engines
              </button>
            </div>
          </div>

          <!-- Search Engine Selection Modal -->
          <div id="engine-modal" class="fixed inset-0 bg-black bg-opacity-50 flex items-center justify-center z-50 hidden">
            <div class="bg-white dark:bg-gray-800 rounded-lg p-6 max-w-lg w-full max-h-[80vh] overflow-y-auto">
              <div class="flex justify-between items-center mb-4">
                <h3 class="text-xl font-bold">Search Engine Settings</h3>
                <button id="close-modal-button" class="text-gray-500 hover:text-gray-700 dark:text-gray-400 dark:hover:text-gray-200">
                  <svg xmlns="http://www.w3.org/2000/svg" class="h-6 w-6" fill="none" viewBox="0 0 24 24" stroke="currentColor">
                    <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M6 18L18 6M6 6l12 12" />
                  </svg>
                </button>
              </div>
              <p class="mb-4 text-sm text-gray-600 dark:text-gray-400">
                Select which search engines to use for real data retrieval. A diverse selection improves results.
              </p>
              <div id="engine-options" class="space-y-2 mb-6">
                <!-- Engine options will be dynamically inserted here -->
              </div>
              <div class="flex justify-between">
                <button id="select-all-engines" class="text-primary hover:underline">Select All</button>
                <button id="deselect-all-engines" class="text-primary hover:underline">Deselect All</button>
              </div>
              <div class="mt-6 flex justify-end">
                <button id="save-engines-button" class="bg-primary text-white px-4 py-2 rounded hover:bg-opacity-90 transition">
                  Save Settings
                </button>
              </div>
            </div>
          </div>

          <div id="dataset-results" class="grid grid-cols-1 md:grid-cols-2 gap-4 mt-6">
            <!-- Dataset cards will be dynamically inserted here -->
          </div>
          <div id="load-more-container" class="text-center mt-6 hidden">
            <button id="load-more-button" class="bg-gray-200 dark:bg-gray-700 px-6 py-3 rounded-lg hover:bg-gray-300 dark:hover:bg-gray-600 transition">
              Load more datasets
            </button>
          </div>
        </div>
      </div>
      <!-- Dataset Detail Page -->
      <div id="dataset-page" class="hidden max-w-4xl mx-auto">
        <button id="back-button" class="flex items-center text-primary mb-4 hover:underline">
          <svg xmlns="http://www.w3.org/2000/svg" class="h-5 w-5 mr-1" viewBox="0 0 20 20" fill="currentColor">
            <path fill-rule="evenodd" d="M9.707 14.707a1 1 0 01-1.414 0l-4-4a1 1 0 010-1.414l4-4a1 1 0 011.414 1.414L7.414 9H15a1 1 0 110 2H7.414l2.293 2.293a1 1 0 010 1.414z" clip-rule="evenodd" />
          </svg>
          Back to Search
        </button>
        <div id="dataset-header" class="mb-4">
          <div class="flex items-center justify-between">
            <h2 id="dataset-title" class="text-2xl font-bold"></h2>
            <span id="data-source-badge" class="px-3 py-1 rounded-full text-xs font-medium bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-200">
              Real Data
            </span>
          </div>
          <div id="dataset-tags" class="text-sm text-gray-600 dark:text-gray-400 mt-1"></div>
        </div>
        <div id="data-source-info" class="bg-blue-50 dark:bg-blue-900 p-4 rounded-lg mb-6 text-blue-800 dark:text-blue-200">
          <h3 class="font-semibold mb-1 flex items-center">
            <svg xmlns="http://www.w3.org/2000/svg" class="h-5 w-5 mr-1" viewBox="0 0 20 20" fill="currentColor">
              <path fill-rule="evenodd" d="M18 10a8 8 0 11-16 0 8 8 0 0116 0zm-7-4a1 1 0 11-2 0 1 1 0 012 0zM9 9a1 1 0 000 2v3a1 1 0 001 1h1a1 1 0 100-2v-3a1 1 0 00-1-1H9z" clip-rule="evenodd" />
            </svg>
            Data Source Information
          </h3>
          <p id="source-details" class="text-sm"></p>
        </div>
        <div id="dataset-description" class="prose dark:prose-invert prose-sm sm:prose max-w-none mb-6"></div>
        <div id="dataset-preview" class="mb-6 overflow-x-auto">
          <h3 class="text-xl font-semibold mb-3">Dataset Preview</h3>
          <div id="preview-table" class="border dark:border-gray-700 rounded-lg overflow-hidden"></div>
        </div>
        <div id="generate-actions" class="mb-8">
          <button id="generate-full-button" class="bg-primary text-white px-6 py-3 rounded-lg hover:bg-opacity-90 transition mr-3">
            Generate Full Dataset
          </button>
          <div id="generate-status" class="hidden mt-4">
            <div class="flex items-center">
              <div class="animate-spin rounded-full h-5 w-5 border-b-2 border-primary mr-3"></div>
              <span>Generating dataset... <span id="rows-count">0</span> rows created</span>
            </div>
            <div class="w-full bg-gray-200 dark:bg-gray-700 rounded-full h-2.5 mt-2">
              <div id="progress-bar" class="bg-primary h-2.5 rounded-full" style="width: 0%"></div>
            </div>
          </div>
        </div>
        <div id="full-dataset" class="hidden mb-6">
          <h3 class="text-xl font-semibold mb-3">Full Dataset</h3>
          <div id="full-table" class="border dark:border-gray-700 rounded-lg overflow-hidden"></div>
          <div class="mt-4 flex flex-wrap gap-3">
            <button id="download-csv-button" class="bg-green-600 hover:bg-green-700 text-white px-4 py-2 rounded-lg transition flex items-center">
              <svg xmlns="http://www.w3.org/2000/svg" class="h-5 w-5 mr-2" viewBox="0 0 20 20" fill="currentColor">
                <path fill-rule="evenodd" d="M3 17a1 1 0 011-1h12a1 1 0 110 2H4a1 1 0 01-1-1zm3.293-7.707a1 1 0 011.414 0L9 10.586V3a1 1 0 112 0v7.586l1.293-1.293a1 1 0 111.414 1.414l-3 3a1 1 0 01-1.414 0l-3-3a1 1 0 010-1.414z" clip-rule="evenodd" />
              </svg>
              Download CSV
            </button>
            <button id="download-json-button" class="bg-yellow-600 hover:bg-yellow-700 text-white px-4 py-2 rounded-lg transition flex items-center">
              <svg xmlns="http://www.w3.org/2000/svg" class="h-5 w-5 mr-2" viewBox="0 0 20 20" fill="currentColor">
                <path fill-rule="evenodd" d="M3 17a1 1 0 011-1h12a1 1 0 110 2H4a1 1 0 01-1-1zm3.293-7.707a1 1 0 011.414 0L9 10.586V3a1 1 0 112 0v7.586l1.293-1.293a1 1 0 111.414 1.414l-3 3a1 1 0 01-1.414 0l-3-3a1 1 0 010-1.414z" clip-rule="evenodd" />
              </svg>
              Download JSON
            </button>
            <button id="download-parquet-button" class="bg-blue-600 hover:bg-blue-700 text-white px-4 py-2 rounded-lg transition flex items-center">
              <svg xmlns="http://www.w3.org/2000/svg" class="h-5 w-5 mr-2" viewBox="0 0 20 20" fill="currentColor">
                <path fill-rule="evenodd" d="M3 17a1 1 0 011-1h12a1 1 0 110 2H4a1 1 0 01-1-1zm3.293-7.707a1 1 0 011.414 0L9 10.586V3a1 1 0 112 0v7.586l1.293-1.293a1 1 0 111.414 1.414l-3 3a1 1 0 01-1.414 0l-3-3a1 1 0 010-1.414z" clip-rule="evenodd" />
              </svg>
              Download Parquet
            </button>
          </div>
        </div>
      </div>
    </main>

    <!-- Footer -->
    <footer class="mt-12 text-center text-sm text-gray-600 dark:text-gray-400">
      <p>Powered by Claude-3.7-Sonnet • Datasets generated from real sources and AI</p>
    </footer>
  </div>
  <script>
    // Constants and global state
    const MAX_DATASETS_PER_PAGE = 10;
    const MAX_FULL_DATASET_ROWS = 100;

    // List of search engines
    const searchEngines = [
      "AlltheInternet.com", "DuckDuckGo.com", "Google.com", "Bing.com", "Search.Yahoo.com",
      "Startpage.com", "Qwant.com", "Ecosia.org", "WolframAlpha.com", "Mojeek.co.uk",
      "Search.Brave.com", "Yandex.com", "Baidu.com", "Gibiru.com", "MetaGer.org",
      "Swisscows.com", "Presearch.com", "Ekoru.org", "Search.Lilo.org"
    ];

    let currentDatasets = [];
    let currentPage = 1;
    let currentSearchQuery = '';
    let currentDataset = null;
    let fullDatasetRows = [];
    let useRealData = true;
    let selectedEngines = ["DuckDuckGo.com", "Bing.com", "Search.Yahoo.com", "Search.Brave.com", "Ecosia.org"];
    let currentEngine = ""; // Store the engine currently being used

    // DOM Elements
    const searchInput = document.getElementById('search-input');
    const searchButton = document.getElementById('search-button');
    const resultsContainer = document.getElementById('dataset-results');
    const loadMoreContainer = document.getElementById('load-more-container');
    const loadMoreButton = document.getElementById('load-more-button');
    const searchPage = document.getElementById('search-page');
    const datasetPage = document.getElementById('dataset-page');
    const backButton = document.getElementById('back-button');
    const datasetTitle = document.getElementById('dataset-title');
    const datasetTags = document.getElementById('dataset-tags');
    const datasetDescription = document.getElementById('dataset-description');
    const previewTable = document.getElementById('preview-table');
    const generateFullButton = document.getElementById('generate-full-button');
    const generateStatus = document.getElementById('generate-status');
    const rowsCount = document.getElementById('rows-count');
    const progressBar = document.getElementById('progress-bar');
    const fullDatasetSection = document.getElementById('full-dataset');
    const fullTable = document.getElementById('full-table');
    const downloadCsvButton = document.getElementById('download-csv-button');
    const downloadJsonButton = document.getElementById('download-json-button');
    const downloadParquetButton = document.getElementById('download-parquet-button');
    const dataSourceToggle = document.getElementById('data-source-toggle');
    const dataSourceText = document.getElementById('data-source-text');
    const dataSourceBadge = document.getElementById('data-source-badge');
    const sourceDetails = document.getElementById('source-details');
    const engineSettingsButton = document.getElementById('engine-settings-button');
    const engineModal = document.getElementById('engine-modal');
    const engineOptions = document.getElementById('engine-options');
    const closeModalButton = document.getElementById('close-modal-button');
    const saveEnginesButton = document.getElementById('save-engines-button');
    const selectAllEngines = document.getElementById('select-all-engines');
    const deselectAllEngines = document.getElementById('deselect-all-engines');
    // Event Listeners
    document.addEventListener('DOMContentLoaded', () => {
      searchButton.addEventListener('click', performSearch);
      searchInput.addEventListener('keypress', (e) => {
        if (e.key === 'Enter') performSearch();
      });
      loadMoreButton.addEventListener('click', loadMoreDatasets);
      backButton.addEventListener('click', showSearchPage);
      generateFullButton.addEventListener('click', generateFullDataset);
      downloadCsvButton.addEventListener('click', () => downloadData('csv'));
      downloadJsonButton.addEventListener('click', () => downloadData('json'));
      downloadParquetButton.addEventListener('click', () => downloadData('parquet'));
      dataSourceToggle.addEventListener('change', toggleDataSource);
      engineSettingsButton.addEventListener('click', showEngineModal);
      closeModalButton.addEventListener('click', hideEngineModal);
      saveEnginesButton.addEventListener('click', saveEngineSettings);
      selectAllEngines.addEventListener('click', () => toggleAllEngines(true));
      deselectAllEngines.addEventListener('click', () => toggleAllEngines(false));
      // Initialize engine options
      populateEngineOptions();
      // Show initial placeholder datasets
      showPlaceholderDatasets();
    });
    // Search Engine Settings
    function populateEngineOptions() {
      engineOptions.innerHTML = '';
      searchEngines.forEach(engine => {
        const isChecked = selectedEngines.includes(engine);
        const optionDiv = document.createElement('div');
        optionDiv.className = 'flex items-center';
        optionDiv.innerHTML = `
          <input type="checkbox" id="engine-${engine}" class="engine-checkbox mr-2 h-4 w-4"
            value="${engine}" ${isChecked ? 'checked' : ''}>
          <label for="engine-${engine}" class="cursor-pointer">${engine}</label>
        `;
        engineOptions.appendChild(optionDiv);
      });
    }

    function showEngineModal() {
      engineModal.classList.remove('hidden');
    }

    function hideEngineModal() {
      engineModal.classList.add('hidden');
    }

    function saveEngineSettings() {
      const checkboxes = document.querySelectorAll('.engine-checkbox:checked');
      selectedEngines = Array.from(checkboxes).map(cb => cb.value);
      if (selectedEngines.length === 0) {
        // Ensure at least one engine is selected
        selectedEngines = ["DuckDuckGo.com"];
        document.getElementById(`engine-DuckDuckGo.com`).checked = true;
        showNotification("At least one search engine must be selected. Using DuckDuckGo as default.");
      }
      hideEngineModal();
      showNotification(`Updated search engine settings. Using ${selectedEngines.length} engines.`);
    }

    function toggleAllEngines(select) {
      const checkboxes = document.querySelectorAll('.engine-checkbox');
      checkboxes.forEach(cb => {
        cb.checked = select;
      });
    }
    // Toggle data source between real and AI
    function toggleDataSource() {
      useRealData = dataSourceToggle.checked;
      dataSourceText.textContent = useRealData ? "Using: Real + AI Data" : "Using: AI Data Only";
      // Show or hide engine settings button
      engineSettingsButton.style.display = useRealData ? "flex" : "none";
      showNotification(`Switched to ${useRealData ? "combined real and synthetic" : "synthetic-only"} data mode`);
    }

    // Search functionality
    function performSearch() {
      const query = searchInput.value.trim();
      if (!query) return;
      currentSearchQuery = query;
      currentPage = 1;
      currentDatasets = [];
      resultsContainer.innerHTML = '';
      showLoadingSkeletons();
      if (useRealData) {
        // Use real data from search engines + AI
        searchWithRealData(query);
      } else {
        // Use only AI-generated data
        searchWithAIData(query);
      }
    }
    function searchWithRealData(query) {
      // Randomly select a search engine from the user's selected engines
      currentEngine = selectedEngines[Math.floor(Math.random() * selectedEngines.length)];
      // Register handler for dataset names based on real search results
      window.Poe.registerHandler("real-search-handler", (result) => {
        if (result.status === "error") {
          showError("Error querying search engines");
          return;
        }
        const message = result.responses[0];
        if (message.status === "complete") {
          // Parse the dataset names and tags from the response
          const datasets = parseDatasetResults(message.content);
          datasets.forEach(dataset => {
            dataset.isReal = true;
            dataset.engine = currentEngine;
          });
          currentDatasets = datasets;
          // Display the datasets
          resultsContainer.innerHTML = '';
          displayDatasets(datasets);
          // Show load more button if we have results
          if (datasets.length > 0) {
            loadMoreContainer.classList.remove('hidden');
          }
        }
      });
      try {
        window.Poe.sendUserMessage(
          `@Claude-3.7-Sonnet You are a data specialist who can transform real search results into structured datasets.
A user is searching for data about: "${query}"
Imagine you've queried ${currentEngine} and received real search results. Create a list of 10 specific datasets that could be created from these search results.
For each dataset:
1. Give it a clear, specific name related to the search topic
2. Include 3-5 relevant tags in parentheses, with one tag specifying the ML task type (classification, regression, clustering, etc.)
Format each dataset as:
1. DatasetName (tag1, tag2, ml_task_tag)
Make these datasets sound like real collections that could be created from ${currentEngine} search results on "${query}".`,
          {
            handler: "real-search-handler",
            stream: false,
            openChat: false
          }
        );
      } catch (err) {
        showError("Error sending message: " + err);
        // Fall back to AI data
        searchWithAIData(query);
      }
    }
    function searchWithAIData(query) {
      // Register handler for AI-generated dataset names
      window.Poe.registerHandler("dataset-search-handler", (result) => {
        if (result.status === "error") {
          showError("Error generating datasets");
          return;
        }
        const message = result.responses[0];
        if (message.status === "complete") {
          // Parse the dataset names and tags from the response
          const datasets = parseDatasetResults(message.content);
          datasets.forEach(dataset => {
            dataset.isReal = false;
          });
          currentDatasets = datasets;
          // Display the datasets
          resultsContainer.innerHTML = '';
          displayDatasets(datasets);
          // Show load more button if we have results
          if (datasets.length > 0) {
            loadMoreContainer.classList.remove('hidden');
          }
        }
      });
      try {
        window.Poe.sendUserMessage(
          `@Claude-3.7-Sonnet A Machine Learning Practitioner is looking for a dataset that matches '${query}'.
Generate a list of ${MAX_DATASETS_PER_PAGE} names of quality datasets that don't exist but sound plausible and would
be helpful. Feel free to reuse words from the query '${query}' to name the datasets.
Every dataset should be about '${query}' and have descriptive tags/keywords including the ML task name associated with the dataset (classification, regression, anomaly detection, etc.). Use the following format:
1. DatasetName1 (tag1, tag2, tag3)
2. DatasetName2 (tag1, tag2, tag3)`,
          {
            handler: "dataset-search-handler",
            stream: false,
            openChat: false
          }
        );
      } catch (err) {
        showError("Error sending message: " + err);
      }
    }
    function parseDatasetResults(content) {
      const lines = content.split('\n');
      const datasets = [];
      lines.forEach(line => {
        // Match lines that start with a number followed by a period
        const match = line.match(/^\s*\d+\.\s+(.+?)\s+\((.+?)\)/);
        if (match) {
          const name = match[1].trim();
          const tags = match[2].split(',').map(tag => tag.trim());
          datasets.push({ name, tags });
        }
      });
      return datasets;
    }
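    // Example: a line like "3. OceanTempSeries (time series, regression, climate)"
    // parses to { name: "OceanTempSeries", tags: ["time series", "regression", "climate"] };
    // lines that don't fit the "N. Name (tags)" shape are silently skipped.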
    function displayDatasets(datasets) {
      datasets.forEach(dataset => {
        const card = document.createElement('div');
        card.className = 'dataset-card bg-white dark:bg-gray-800 rounded-lg p-4 border border-gray-200 dark:border-gray-700 cursor-pointer relative';
        const tagsHtml = dataset.tags.map(tag =>
          `<span class="inline-block bg-gray-100 dark:bg-gray-700 text-gray-800 dark:text-gray-300 text-xs px-2 py-1 rounded mr-1 mb-1">${tag}</span>`
        ).join('');
        // Add a badge for real data
        let badgeHtml = '';
        if (dataset.isReal) {
          badgeHtml = `<span class="engine-badge" title="Data from ${dataset.engine}">${dataset.engine.split('.')[0]}</span>`;
        }
        card.innerHTML = `
          ${badgeHtml}
          <h3 class="text-lg font-semibold mb-2">${dataset.name}</h3>
          <div class="flex flex-wrap mt-2">${tagsHtml}</div>
        `;
        card.addEventListener('click', () => showDatasetDetails(dataset));
        resultsContainer.appendChild(card);
      });
    }

    function showLoadingSkeletons() {
      for (let i = 0; i < 4; i++) {
        const skeleton = document.createElement('div');
        skeleton.className = 'bg-white dark:bg-gray-800 rounded-lg p-4 border border-gray-200 dark:border-gray-700';
        skeleton.innerHTML = `
          <div class="shimmer h-6 w-3/4 mb-2"></div>
          <div class="flex flex-wrap mt-2">
            <div class="shimmer h-6 w-16 rounded mr-1 mb-1"></div>
            <div class="shimmer h-6 w-20 rounded mr-1 mb-1"></div>
            <div class="shimmer h-6 w-24 rounded mr-1 mb-1"></div>
          </div>
        `;
        resultsContainer.appendChild(skeleton);
      }
    }
    function loadMoreDatasets() {
      currentPage++;
      // Use the same data source (real or AI) as the initial search
      if (useRealData) {
        loadMoreRealDatasets();
      } else {
        loadMoreAIDatasets();
      }
    }

    function loadMoreRealDatasets() {
      // Rotate to a different search engine for variety
      const previousEngine = currentEngine;
      while (currentEngine === previousEngine && selectedEngines.length > 1) {
        currentEngine = selectedEngines[Math.floor(Math.random() * selectedEngines.length)];
      }
      // Register handler for more datasets
      window.Poe.registerHandler("more-real-datasets-handler", (result) => {
        if (result.status === "error") {
          showError("Error generating more datasets");
          return;
        }
        const message = result.responses[0];
        if (message.status === "complete") {
          // Parse the dataset names and tags from the response
          const datasets = parseDatasetResults(message.content);
          datasets.forEach(dataset => {
            dataset.isReal = true;
            dataset.engine = currentEngine;
          });
          currentDatasets = [...currentDatasets, ...datasets];
          // Display the datasets
          displayDatasets(datasets);
        }
      });
      try {
        window.Poe.sendUserMessage(
          `@Claude-3.7-Sonnet You're a data specialist who can transform real search results into structured datasets.
Continue our previous search for data about: "${currentSearchQuery}"
Now let's use a different search engine: ${currentEngine}
Create 10 more specific datasets that could be created from these search results. Make sure these are different from the previous datasets.
Use the same format:
1. DatasetName (tag1, tag2, ml_task_tag)
Make these datasets sound like real collections that could be created from ${currentEngine} search results on "${currentSearchQuery}".`,
          {
            handler: "more-real-datasets-handler",
            stream: false,
            openChat: false
          }
        );
      } catch (err) {
        showError("Error sending message: " + err);
        // Fall back to AI data
        loadMoreAIDatasets();
      }
    }
    function loadMoreAIDatasets() {
      // Register handler for more AI datasets
      window.Poe.registerHandler("more-datasets-handler", (result) => {
        if (result.status === "error") {
          showError("Error generating more datasets");
          return;
        }
        const message = result.responses[0];
        if (message.status === "complete") {
          // Parse the dataset names and tags from the response
          const datasets = parseDatasetResults(message.content);
          datasets.forEach(dataset => {
            dataset.isReal = false;
          });
          currentDatasets = [...currentDatasets, ...datasets];
          // Display the datasets
          displayDatasets(datasets);
        }
      });
      try {
        window.Poe.sendUserMessage(
          `@Claude-3.7-Sonnet Please generate ${MAX_DATASETS_PER_PAGE} more dataset names about '${currentSearchQuery}'. Use the same format as before:
1. DatasetName1 (tag1, tag2, tag3)
Make sure these are completely different from previous suggestions.`,
          {
            handler: "more-datasets-handler",
            stream: false,
            openChat: false
          }
        );
      } catch (err) {
        showError("Error sending message: " + err);
      }
    }
    function showDatasetDetails(dataset) {
      currentDataset = dataset;
      searchPage.classList.add('hidden');
      datasetPage.classList.remove('hidden');
      // Update UI with dataset info
      datasetTitle.textContent = dataset.name;
      datasetTags.innerHTML = dataset.tags.map(tag =>
        `<span class="inline-block bg-gray-100 dark:bg-gray-700 text-gray-800 dark:text-gray-300 text-xs px-2 py-1 rounded mr-1 mb-1">${tag}</span>`
      ).join('');
      // Update source badge
      if (dataset.isReal) {
        dataSourceBadge.textContent = "Real Data";
        dataSourceBadge.className = "px-3 py-1 rounded-full text-xs font-medium bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-200";
        sourceDetails.innerHTML = `This dataset is based on real information queried from <strong>${dataset.engine}</strong> for the search term "<strong>${currentSearchQuery}</strong>". The data has been structured for machine learning use.`;
      } else {
        dataSourceBadge.textContent = "AI-Generated";
        dataSourceBadge.className = "px-3 py-1 rounded-full text-xs font-medium bg-purple-100 text-purple-800 dark:bg-purple-900 dark:text-purple-200";
        sourceDetails.innerHTML = `This is an AI-generated dataset created using Claude-3.7-Sonnet. The content is synthetic and designed to represent plausible data related to "${currentSearchQuery}".`;
      }
      // Clear previous content
      datasetDescription.innerHTML = '<div class="shimmer h-4 w-full mb-2"></div>'.repeat(3);
      previewTable.innerHTML = '';
      fullDatasetSection.classList.add('hidden');
      generateStatus.classList.add('hidden');
      generateFullButton.disabled = false;
      // Reset full dataset
      fullDatasetRows = [];
      // Generate dataset preview - different approach for real vs AI data
      if (dataset.isReal) {
        generateRealDatasetPreview(dataset);
      } else {
        generateAIDatasetPreview(dataset);
      }
      // Scroll to top
      window.scrollTo(0, 0);
    }
    function generateRealDatasetPreview(dataset) {
      window.Poe.registerHandler("real-preview-handler", (result) => {
        if (result.status === "error") {
          datasetDescription.innerHTML = '<p class="text-red-500">Error generating dataset preview</p>';
          return;
        }
        const message = result.responses[0];
        if (message.status === "complete") {
          const content = message.content;
          // Extract description and CSV
          const parts = content.split('**CSV Content Preview:**');
          let description = "";
          let csvContent = "";
          if (parts.length > 1) {
            description = parts[0].replace('**Dataset Description:**', '').trim();
            csvContent = parts[1].trim();
            // Clean up CSV content (remove markdown code block markers)
            csvContent = csvContent.replace(/```csv\n|```\n|```/g, '').trim();
          } else {
            description = "No description available";
            csvContent = content;
          }
          // Display description
          datasetDescription.innerHTML = marked.parse(description);
          // Parse and display CSV preview
          try {
            const results = Papa.parse(csvContent, {
              header: true,
              skipEmptyLines: true
            });
            if (results.data && results.data.length > 0) {
              // Create table from CSV data
              createTable(previewTable, results.data, results.meta.fields);
            } else {
              previewTable.innerHTML = '<p class="p-4 text-red-500">No preview data available</p>';
            }
          } catch (err) {
            previewTable.innerHTML = `<p class="p-4 text-red-500">Error parsing CSV: ${err.message}</p>`;
          }
        }
      });
      try {
        const tagsStr = dataset.tags.join(', ');
        window.Poe.sendUserMessage(
          `@Claude-3.7-Sonnet You're a specialist in converting web search results into structured data.
Based on search results from ${dataset.engine} about "${currentSearchQuery}",
create a preview of the dataset "${dataset.name}" with tags "${tagsStr}".
First, write a detailed description of what this dataset contains, its structure, and how it was constructed from web search results.
Then, generate a realistic 5-row CSV preview that resembles data you might get if you scraped and structured real results from ${dataset.engine}.
Format your response with:
**Dataset Description:** [detailed description]
**CSV Content Preview:**
\`\`\`csv
[CSV header and 5 rows of realistic data]
\`\`\`
Include relevant columns for the dataset type, with proper labels/categories where appropriate. The data should look like it came from real sources.`,
          {
            handler: "real-preview-handler",
            stream: false,
            openChat: false
          }
        );
      } catch (err) {
        datasetDescription.innerHTML = `<p class="text-red-500">Error: ${err.message}</p>`;
      }
    }
    function generateAIDatasetPreview(dataset) {
      window.Poe.registerHandler("dataset-preview-handler", (result) => {
        if (result.status === "error") {
          datasetDescription.innerHTML = '<p class="text-red-500">Error generating dataset preview</p>';
          return;
        }
        const message = result.responses[0];
        if (message.status === "complete") {
          const content = message.content;
          // Extract description and CSV
          const parts = content.split('**CSV Content Preview:**');
          let description = "";
          let csvContent = "";
          if (parts.length > 1) {
            description = parts[0].replace('**Dataset Description:**', '').trim();
            csvContent = parts[1].trim();
            // Clean up CSV content (remove markdown code block markers)
            csvContent = csvContent.replace(/```csv\n|```\n|```/g, '').trim();
          } else {
            description = "No description available";
            csvContent = content;
          }
          // Display description
          datasetDescription.innerHTML = marked.parse(description);
          // Parse and display CSV preview
          try {
            const results = Papa.parse(csvContent, {
              header: true,
              skipEmptyLines: true
            });
            if (results.data && results.data.length > 0) {
              // Create table from CSV data
              createTable(previewTable, results.data, results.meta.fields);
            } else {
              previewTable.innerHTML = '<p class="p-4 text-red-500">No preview data available</p>';
            }
          } catch (err) {
            previewTable.innerHTML = `<p class="p-4 text-red-500">Error parsing CSV: ${err.message}</p>`;
          }
        }
      });
      try {
        const tagsStr = dataset.tags.join(', ');
        window.Poe.sendUserMessage(
          `@Claude-3.7-Sonnet An ML practitioner is looking for a dataset CSV after the query '${currentSearchQuery}'.
Generate the first 5 rows of a plausible and quality CSV for the dataset '${dataset.name}'.
You can get inspiration from related keywords '${tagsStr}' but most importantly the dataset should correspond to the query '${currentSearchQuery}'.
Focus on quality text content and use a 'label' or 'labels' column if it makes sense (invent labels, avoid reusing the keywords, be accurate while labelling texts).
Reply using a short description of the dataset with title **Dataset Description:** followed by the CSV content in a code block and with title **CSV Content Preview:**`,
          {
            handler: "dataset-preview-handler",
            stream: false,
            openChat: false
          }
        );
      } catch (err) {
        datasetDescription.innerHTML = `<p class="text-red-500">Error: ${err.message}</p>`;
      }
    }
    function createTable(container, data, headers) {
      container.innerHTML = '';
      const table = document.createElement('table');
      table.className = 'w-full';
      // Create header
      const thead = document.createElement('thead');
      const headerRow = document.createElement('tr');
      headers.forEach(header => {
        const th = document.createElement('th');
        th.textContent = header;
        headerRow.appendChild(th);
      });
      thead.appendChild(headerRow);
      table.appendChild(thead);
      // Create body
      const tbody = document.createElement('tbody');
      data.forEach(row => {
        const tr = document.createElement('tr');
        headers.forEach(header => {
          const td = document.createElement('td');
          td.textContent = row[header] || '';
          tr.appendChild(td);
        });
        tbody.appendChild(tr);
      });
      table.appendChild(tbody);
      container.appendChild(table);
    }
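    // Usage sketch: createTable(previewTable, [{ id: "1", text: "..." }], ["id", "text"])
    // renders a <table> with one header row and one body row. Cell values are set
    // via textContent, so model-generated content is never interpreted as HTML.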
    function generateFullDataset() {
      // Disable button and show status
      generateFullButton.disabled = true;
      generateStatus.classList.remove('hidden');
      rowsCount.textContent = '0';
      progressBar.style.width = '0%';
      // Set up variables for tracking generation
      let csvHeader = '';
      const targetRows = MAX_FULL_DATASET_ROWS;
      let currentRows = 0;
      fullDatasetRows = [];
      // Get the CSV header from the preview table
      const previewHeaders = Array.from(previewTable.querySelectorAll('thead th')).map(th => th.textContent);
      csvHeader = previewHeaders.join(',');
      // Add initial rows from preview
      const previewRows = Array.from(previewTable.querySelectorAll('tbody tr')).map(tr => {
        const row = {};
        Array.from(tr.querySelectorAll('td')).forEach((td, index) => {
          row[previewHeaders[index]] = td.textContent;
        });
        return row;
      });
      fullDatasetRows = [...previewRows];
      currentRows = previewRows.length;
      updateGenerationProgress(currentRows, targetRows);
      // Choose generation method based on dataset type
      if (currentDataset.isReal) {
        generateFullRealDataset(previewHeaders, csvHeader, currentRows, targetRows);
      } else {
        generateFullAIDataset(previewHeaders, csvHeader, currentRows, targetRows);
      }
    }
    function generateFullRealDataset(previewHeaders, csvHeader, currentRows, targetRows) {
      // Function to generate more rows in batches from "real" search results
      const generateBatch = (batchIndex) => {
        const batchSize = 15; // Larger batches for efficiency
        const startRow = currentRows + batchIndex * batchSize;
        if (startRow >= targetRows) {
          // We've reached the target, show the full dataset
          showFullDataset();
          return;
        }
        window.Poe.registerHandler(`real-batch-${batchIndex}-handler`, (result) => {
          if (result.status === "error") {
            showError("Error generating dataset rows");
            return;
          }
          const message = result.responses[0];
          if (message.status === "complete") {
            const content = message.content;
            // Extract CSV content (remove markdown code block markers)
            let csvContent = content.replace(/```csv\n|```\n|```/g, '').trim();
            // If there are multiple code blocks, try to find one with CSV data
            if (csvContent.includes('```')) {
              const codeBlocks = content.match(/```(?:csv)?\n([\s\S]*?)```/g) || [];
              if (codeBlocks.length > 0) {
                csvContent = codeBlocks[0].replace(/```(?:csv)?\n|```/g, '').trim();
              }
            }
            try {
              // Parse the CSV
              const results = Papa.parse(csvContent, {
                header: true,
                skipEmptyLines: true
              });
              if (results.data && results.data.length > 0) {
                // Add the new rows
                fullDatasetRows = [...fullDatasetRows, ...results.data];
                currentRows += results.data.length;
                // Update progress
                updateGenerationProgress(currentRows, targetRows);
                // Generate next batch
                generateBatch(batchIndex + 1);
              } else {
                // Try again with a different prompt
                generateBatch(batchIndex);
              }
            } catch (err) {
              console.error("Error parsing CSV:", err);
              // Try again
              generateBatch(batchIndex);
            }
          }
        });
        try {
          // For variation, rotate through engines for each batch
          const engineForBatch = selectedEngines[batchIndex % selectedEngines.length] || currentDataset.engine;
          window.Poe.sendUserMessage(
            `@Claude-3.7-Sonnet You're expanding a dataset based on search results from ${engineForBatch}.
For the dataset "${currentDataset.name}" about "${currentSearchQuery}", please generate ${batchSize} more rows of data.
Use this exact CSV header: ${csvHeader}
The data should look realistic, as if it came from actual ${engineForBatch} search results for "${currentSearchQuery}".
Include appropriate values for each field, maintaining the same patterns and types as seen in the existing data.
Only include the CSV data in your response (header + ${batchSize} rows), no explanations or additional text.`,
            {
              handler: `real-batch-${batchIndex}-handler`,
              stream: false,
              openChat: false
            }
          );
        } catch (err) {
          showError("Error sending message: " + err);
        }
      };
      // Start generating batches
      generateBatch(0);
    }
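    // Note: each batch registers its own handler name (`real-batch-${batchIndex}-handler`)
    // and recurses until targetRows is reached; a failed parse re-registers the same
    // name and retries, which assumes window.Poe.registerHandler tolerates
    // re-registration under an existing name.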
    function generateFullAIDataset(previewHeaders, csvHeader, currentRows, targetRows) {
      // Function to generate more rows in batches from AI
      const generateBatch = (batchIndex) => {
        const batchSize = 10;
        const startRow = currentRows + batchIndex * batchSize;
        if (startRow >= targetRows) {
          // We've reached the target, show the full dataset
          showFullDataset();
          return;
        }
        window.Poe.registerHandler(`batch-${batchIndex}-handler`, (result) => {
          if (result.status === "error") {
            showError("Error generating dataset rows");
            return;
          }
          const message = result.responses[0];
          if (message.status === "complete") {
            const content = message.content;
            // Extract CSV content (remove markdown code block markers)
            let csvContent = content.replace(/```csv\n|```\n|```/g, '').trim();
            // If there are multiple code blocks, try to find one with CSV data
            if (csvContent.includes('```')) {
              const codeBlocks = content.match(/```(?:csv)?\n([\s\S]*?)```/g) || [];
              if (codeBlocks.length > 0) {
                csvContent = codeBlocks[0].replace(/```(?:csv)?\n|```/g, '').trim();
              }
            }
            try {
              // Parse the CSV
              const results = Papa.parse(csvContent, {
                header: true,
                skipEmptyLines: true
              });
              if (results.data && results.data.length > 0) {
                // Add the new rows
                fullDatasetRows = [...fullDatasetRows, ...results.data];
                currentRows += results.data.length;
                // Update progress
                updateGenerationProgress(currentRows, targetRows);
                // Generate next batch
                generateBatch(batchIndex + 1);
              } else {
                // Try again with a different prompt
                generateBatch(batchIndex);
              }
            } catch (err) {
              console.error("Error parsing CSV:", err);
              // Try again
              generateBatch(batchIndex);
            }
          }
        });
        try {
          const tagsStr = currentDataset.tags.join(', ');
          window.Poe.sendUserMessage(
            `@Claude-3.7-Sonnet For the dataset '${currentDataset.name}' about '${currentSearchQuery}' with tags '${tagsStr}',
please generate ${batchSize} more sample rows in CSV format. Use the same CSV header: ${csvHeader}
Only include the CSV data in your response, no explanations or additional text.`,
            {
              handler: `batch-${batchIndex}-handler`,
              stream: false,
              openChat: false
            }
          );
        } catch (err) {
          showError("Error sending message: " + err);
        }
      };
      // Start generating batches
      generateBatch(0);
    }
    function updateGenerationProgress(current, total) {
      rowsCount.textContent = current;
      const percentage = Math.min(100, Math.floor((current / total) * 100));
      progressBar.style.width = `${percentage}%`;
    }

    function showFullDataset() {
      // Hide generation status
      generateStatus.classList.add('hidden');
      // Show full dataset section
      fullDatasetSection.classList.remove('hidden');
      // Get headers from the data
      const headers = Object.keys(fullDatasetRows[0] || {});
      // Create and display the table
      createTable(fullTable, fullDatasetRows.slice(0, 10), headers);
      // Add a note about showing limited rows
      const note = document.createElement('p');
      note.className = 'text-sm text-gray-600 dark:text-gray-400 mt-2';
      note.textContent = `Showing 10 of ${fullDatasetRows.length} rows. Use the download buttons to get the complete dataset.`;
      fullTable.appendChild(note);
    }
    function downloadData(format) {
      if (fullDatasetRows.length === 0) return;
      const filename = `${currentDataset.name.replace(/\s+/g, '_')}_dataset`;
      switch (format) {
        case 'csv':
          downloadCsv(filename);
          break;
        case 'json':
          downloadJson(filename);
          break;
        case 'parquet':
          // Show a notification that this format is simulated
          showNotification("Parquet format download simulated - actual conversion would require a server component");
          downloadJson(filename + "_parquet_simulated");
          break;
      }
    }

    function downloadCsv(filename) {
      // Convert data to CSV
      const csv = Papa.unparse(fullDatasetRows);
      // Create a blob and download link
      const blob = new Blob([csv], { type: 'text/csv' });
      const url = URL.createObjectURL(blob);
      const a = document.createElement('a');
      a.href = url;
      a.download = `${filename}.csv`;
      document.body.appendChild(a);
      a.click();
      // Clean up
      setTimeout(() => {
        document.body.removeChild(a);
        URL.revokeObjectURL(url);
      }, 100);
    }
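    // The same Blob + temporary <a download> pattern is reused for JSON below;
    // the object URL is revoked shortly after click() so the blob can be freed.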
    function showSearchPage() {
      searchPage.classList.remove('hidden');
      datasetPage.classList.add('hidden');
    }

    function showError(message) {
      console.error(message);
      showNotification(message, true);
    }

    function showNotification(message, isError = false) {
      const notification = document.createElement('div');
      notification.className = `fixed bottom-4 right-4 px-6 py-3 rounded-lg shadow-lg ${
        isError
          ? 'bg-red-500 text-white'
          : 'bg-green-500 text-white'
      } z-50 transition-opacity duration-300`;
      notification.textContent = message;
      document.body.appendChild(notification);
      setTimeout(() => {
        notification.style.opacity = '0';
        setTimeout(() => {
          document.body.removeChild(notification);
        }, 300);
      }, 3000);
    }

    function showPlaceholderDatasets() {
      const placeholders = [
        {
          name: "NewsEventsPredict",
          tags: ["classification", "media", "trend"],
          isReal: true,
          engine: "AlltheInternet.com"
        },
        {
          name: "FinancialForecast",
          tags: ["economy", "stocks", "regression"],
          isReal: false
        },
        {
          name: "HealthMonitor",
          tags: ["science", "real-time", "anomaly detection"],
          isReal: true,
          engine: "DuckDuckGo.com"
        },
        {
          name: "SportsAnalysis",
          tags: ["classification", "performance", "player tracking"],
          isReal: false
        },
        {
          name: "RetailSalesAnalyzer",
          tags: ["consumer behavior", "sales trend", "segmentation"],
          isReal: true,
          engine: "Bing.com"
        },
        {
          name: "SocialMediaSentiment",
          tags: ["text classification", "opinion mining", "NLP"],
          isReal: false
        }
      ];
      currentDatasets = placeholders;
      displayDatasets(placeholders);
      loadMoreContainer.classList.remove('hidden');
    }
  </script>
</body>
</html>
"""
# --- Gradio CSS ---
css = """
a { color: var(--body-text-color); }
.datasetButton { justify-content: start; justify-content: left; }
.tags { font-size: var(--button-small-text-size); color: var(--body-text-color-subdued); }
.topButton {
    justify-content: start; justify-content: left; text-align: left; background: transparent;
    box-shadow: none; padding-bottom: 0;
}
.topButton::before {
    content: url("data:image/svg+xml,%3Csvg style='color: rgb(209 213 219)' xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink' aria-hidden='true' focusable='false' role='img' width='1em' height='1em' preserveAspectRatio='xMidYMid meet' viewBox='0 0 25 25'%3E%3Cellipse cx='12.5' cy='5' fill='currentColor' fill-opacity='0.25' rx='7.5' ry='2'%3E%3C/ellipse%3E%3Cpath d='M12.5 15C16.6421 15 20 14.1046 20 13V20C20 21.1046 16.6421 22 12.5 22C8.35786 22 5 21.1046 5 20V13C5 14.1046 8.35786 15 12.5 15Z' fill='currentColor' opacity='0.5'%3E%3C/path%3E%3Cpath d='M12.5 7C16.6421 7 20 6.10457 20 5V11.5C20 12.6046 16.6421 13.5 12.5 13.5C8.35786 13.5 5 12.6046 5 11.5V5C5 6.10457 8.35786 7 12.5 7Z' fill='currentColor' opacity='0.5'%3E%3C/path%3E%3Cpath d='M5.23628 12C5.08204 12.1598 5 12.8273 5 13C5 14.1046 8.35786 15 12.5 15C16.6421 15 20 14.1046 20 13C20 12.8273 19.918 12.1598 19.7637 12C18.9311 12.8626 15.9947 13.5 12.5 13.5C9.0053 13.5 6.06886 12.8626 5.23628 12Z' fill='currentColor'%3E%3C/path%3E%3C/svg%3E");
    margin-right: .25rem; margin-left: -.125rem; margin-top: .25rem;
}
.bottomButton {
    justify-content: start; justify-content: left; text-align: left; background: transparent;
    box-shadow: none; font-size: var(--button-small-text-size); color: var(--body-text-color-subdued);
    padding-top: 0; align-items: baseline;
}
.bottomButton::before { content: 'tags:'; margin-right: .25rem; }
.buttonsGroup { background: transparent; }
.buttonsGroup:hover { background: var(--input-background-fill); }
.buttonsGroup div { background: transparent; }
.insivibleButtonGroup { display: none; }
@keyframes placeHolderShimmer { 0% { background-position: -468px 0 } 100% { background-position: 468px 0 } }
.linear-background {
    animation-duration: 1s; animation-fill-mode: forwards; animation-iteration-count: infinite;
    animation-name: placeHolderShimmer; animation-timing-function: linear;
    background-image: linear-gradient(to right, var(--body-text-color-subdued) 8%, #dddddd11 18%, var(--body-text-color-subdued) 33%);
    background-size: 1000px 104px; color: transparent; background-clip: text;
}
.settings { background: transparent; }
.settings button span { color: var(--body-text-color-subdued); }
"""
# --- Knowledge Base ---
class KnowledgeBase:
    """Manages known entities (materials, colors) and patterns for data refinement."""
    def __init__(self):
        self.materials: Set[str] = {'Metal', 'Wood', 'Plastic', 'Aluminum', 'Bronze', 'Steel', 'Glass', 'Leather', 'Fabric'}
        self.colors: Set[str] = {'Red', 'Black', 'White', 'Silver', 'Bronze', 'Yellow', 'Blue', 'Green', 'Gray', 'Brown'}
        self.patterns: Dict[str, List[str]] = {}
        self.source_data: Dict[str, Any] = {}

    def load_source(self, source_type: str, source_path: str) -> None:
        """Loads data from various sources and extracts knowledge."""
        try:
            if source_type == 'csv_url':
                response = requests.get(source_path, timeout=10)
                response.raise_for_status()
                df = pd.read_csv(io.StringIO(response.text))
            elif source_type == 'xlsx_url':
                response = requests.get(source_path, timeout=10)
                response.raise_for_status()
                df = pd.read_excel(io.BytesIO(response.content))
            elif source_type == 'local_csv':
                df = pd.read_csv(source_path)
            elif source_type == 'local_xlsx':
                df = pd.read_excel(source_path)
            else:
                raise ValueError(f"Unsupported source type: {source_type}")
            self._extract_knowledge(df)
            self.source_data[source_path] = df.to_dict('records')
        except requests.exceptions.RequestException as e:
            raise ConnectionError(f"Failed to fetch data from URL: {e}")
        except ValueError:
            raise
        except Exception as e:
            raise RuntimeError(f"Error loading source {source_path}: {str(e)}")

    def _extract_knowledge(self, df: pd.DataFrame) -> None:
        """Extracts known materials, colors, and column patterns."""
        for column in df.columns:
            if 'material' in column.lower():
                values = df[column].dropna().unique()
                self.materials.update(v.title() for v in values if isinstance(v, str))
            elif 'color' in column.lower():
                values = df[column].dropna().unique()
                self.colors.update(v.title() for v in values if isinstance(v, str))
            if df[column].dtype == 'object':  # Store string patterns for fuzzy matching
                self.patterns[column] = df[column].dropna().astype(str).tolist()

    def get_closest_match(self, value: str, field_type: str) -> Optional[str]:
        """Finds the closest known value (material or color) for fuzzy matching."""
        known_values = getattr(self, field_type + 's', set())
        if not known_values:
            return None
        matches = get_close_matches(value.title(), list(known_values), n=1, cutoff=0.8)
        return matches[0] if matches else None

knowledge_base = KnowledgeBase()  # Global instance for refinement
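# Illustrative usage (the URL here is hypothetical, not part of this project):
#   kb = KnowledgeBase()
#   kb.load_source('csv_url', 'https://example.com/products.csv')
#   kb.get_closest_match('Aluminium', 'material')  # -> 'Aluminum' via difflib at the 0.8 cutoff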
# --- Data Refinement Utilities ---
def split_compound_field(field: str) -> List[str]:
    """Splits strings like 'Red, Blue' into ['Red', 'Blue']."""
    parts = re.split(r'[,;\n]+', field)
    return list(set(p.strip().title() for p in parts if p.strip()))
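# Example: split_compound_field("red, blue; GREEN") -> ['Red', 'Blue', 'Green']
# (title-cased and de-duplicated; ordering is not guaranteed because a set is used).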
def normalize_value(value: Any, field_name: str, mode: str = 'sourceless', kb: Optional[KnowledgeBase] = None) -> Any:
    """Normalizes a single data value based on field name and refinement mode."""
    if not isinstance(value, str):
        return value
    value = re.sub(r'\s+', ' ', value.strip())  # Normalize whitespace
    value = value.replace('_', ' ')  # Replace underscores
    # Field-specific normalization logic
    if 'material' in field_name.lower():
        parts = split_compound_field(value)
        if mode == 'sourced' and kb:
            known = [kb.get_closest_match(p, 'material') or p.title() for p in parts]
        else:
            known = [m for m in parts if m in kb.materials] if kb else parts
        known = known or parts  # Fall back to the split parts when nothing matches the knowledge base
        return known[0] if len(known) == 1 else known
    elif 'color' in field_name.lower():
        parts = split_compound_field(value)
        if mode == 'sourced' and kb:
            known = [kb.get_closest_match(p, 'color') or p.title() for p in parts]
        else:
            known = [c for c in parts if c in kb.colors] if kb else parts
        known = known or parts  # Fall back to the split parts when nothing matches the knowledge base
        return known[0] if len(known) == 1 else known
    elif any(term in field_name.lower() for term in ['date', 'time']):
        return value  # Placeholder for date/time normalization
    elif any(term in field_name.lower() for term in ['type', 'status', 'category', 'description']):
        return value.title()  # Title case for descriptive fields
    return value
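# Example (sourceless mode, no knowledge base): whitespace and underscores are normalized,
# and descriptive fields are title-cased:
#   normalize_value("  in   good_shape ", "description")  # -> 'In Good Shape'
# Material/color fields are instead split into parts and checked against the knowledge base.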
def clean_record(record: Dict[str, Any], mode: str = 'sourceless', kb: Optional[KnowledgeBase] = None) -> Dict[str, Any]:
    """Cleans and normalizes a single record, handling nesting and compound fields."""
    cleaned = {}
    compound_fields_to_split = {}
    # Pass 1: Normalize values and identify compound fields
    for key, value in record.items():
        clean_key = key.strip().lower().replace(" ", "_")
        if isinstance(value, str):  # Detect potential compound fields (any known material mentioned in the value)
            for material in knowledge_base.materials:
                if material.lower() in value.lower():
                    compound_fields_to_split[clean_key] = value
                    break
        # Recursively clean nested structures
        if isinstance(value, list):
            cleaned[clean_key] = [normalize_value(v, clean_key, mode, kb) for v in value]
        elif isinstance(value, dict):
            cleaned[clean_key] = clean_record(value, mode, kb)
        else:
            cleaned[clean_key] = normalize_value(value, clean_key, mode, kb)
    # Pass 2: Split identified compound fields
    for key, value in compound_fields_to_split.items():
        parts = split_compound_field(value)
        materials = [p for p in parts if p in knowledge_base.materials]
        if materials:
            cleaned['material'] = materials[0] if len(materials) == 1 else materials
            remaining = [p for p in parts if p not in materials]
            if remaining:
                cleaned['condition'] = ' '.join(remaining)
        elif key not in cleaned:  # If not processed and no known materials found
            cleaned[key] = value
    return cleaned
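# Example: clean_record({"Item Material": "Steel, scratched"}) lower-snake-cases the key and,
# because 'Steel' is a known material, also emits split fields, giving roughly
# {'item_material': [...], 'material': 'Steel', 'condition': 'Scratched'}.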
def refine_data_generic(dataset: List[Dict[str, Any]], mode: str = 'sourceless', kb: Optional[KnowledgeBase] = None) -> List[Dict[str, Any]]:
    """Applies generic data refinement to a list of records, with optional knowledge base guidance."""
    if mode == 'sourced' and kb and kb.patterns:  # Apply fuzzy matching against source patterns if sourced
        for record in dataset:
            for field, patterns in kb.patterns.items():
                if field in record and isinstance(record[field], str):
                    matches = get_close_matches(str(record[field]), patterns, n=1, cutoff=0.8)
                    if matches:
                        record[field] = matches[0]
    return [clean_record(entry, mode, kb) for entry in dataset]
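# Example: refine_data_generic([{"Color": "red, black"}]) -> [{'color': ['Red', 'Black']}] (order may vary);
# with mode='sourced' and a loaded KnowledgeBase, string fields are first snapped to known source patterns.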
def refine_preview_data(df: pd.DataFrame, mode: str = 'sourceless') -> pd.DataFrame:
    """Refines the preview DataFrame based on the selected mode."""
    # Remove common auto-generated index columns (0-based or 1-based integer sequences)
    cols_to_drop = []
    for col_name, values in df.to_dict(orient="series").items():
        try:
            if all(isinstance(v, int) and v == i for i, v in enumerate(values)):
                cols_to_drop.append(col_name)
            elif all(isinstance(v, int) and v == i + 1 for i, v in enumerate(values)):
                cols_to_drop.append(col_name)
        except Exception:
            pass  # Ignore non-sequential columns
    if cols_to_drop:
        df = df.drop(columns=cols_to_drop)
    records = df.to_dict('records')
    refined_records = refine_data_generic(records, mode=mode, kb=knowledge_base)
    return pd.DataFrame(refined_records)
def detect_anomalies(record: Dict[str, Any]) -> List[str]:
    """Detects potential data quality issues (e.g., verbosity, missing values)."""
    flags = []
    for k, v in record.items():
        if isinstance(v, str):
            if len(v) > 300:
                flags.append(f"{k}: Too verbose.")
            if v.lower() in ['n/a', 'none', 'undefined', 'null', '']:
                flags.append(f"{k}: Missing value.")
    return flags
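# Example: detect_anomalies({"summary": "n/a", "body": "x" * 400})
#   -> ['summary: Missing value.', 'body: Too verbose.']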
def parse_preview_df(content: str) -> tuple[str, pd.DataFrame]:
    """Extracts CSV from response, parses, refines, and adds quality flags."""
    csv_lines = []
    in_csv_block = False
    for line in content.split("\n"):  # Extract lines within CSV code blocks
        if line.strip().startswith("```"):  # A fence toggles the block: ```csv opens it, the bare ``` closes it
            in_csv_block = not in_csv_block
            continue
        if in_csv_block:
            csv_lines.append(line)
    csv_content = "\n".join(csv_lines)
    if not csv_content:
        raise ValueError("No CSV content found.")
    csv_header = csv_content.split("\n")[0]
    df = parse_csv_df(csv_content)
    refined_df = refine_preview_data(df, mode='sourceless')  # Initial refinement
    # Add quality flags
    refined_records = refined_df.to_dict('records')
    for record in refined_records:
        flags = detect_anomalies(record)
        if flags:
            record['_quality_flags'] = flags
    return csv_header, pd.DataFrame(refined_records)
def parse_csv_df(csv: str, csv_header: Optional[str] = None) -> pd.DataFrame:
    """Safely parses CSV data using pandas with error handling and common fixes."""
    # Quote unquoted bracketed lists like ['a', 'b'] so pandas doesn't split them across columns
    csv = re.sub(r'''(?!")\[(["'][\w\s]+["'][, ]*)+\](?!")''', lambda m: '"' + m.group(0).replace('"', "'") + '"', csv)
    if csv_header and csv.strip() and not csv.strip().startswith(csv_header.split(',')[0]):
        csv = csv_header + "\n" + csv  # Prepend the expected header if the chunk is missing it
    try:
        return pd.read_csv(io.StringIO(csv), skipinitialspace=True)
    except Exception as e:
        raise ValueError(f"Pandas CSV parsing error: {e}")
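# Example: parse_csv_df("a,b\n1,2") parses normally, while parse_csv_df("1,2", csv_header="a,b")
# prepends the header because the chunk does not start with the first column name 'a'.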
# --- LLM Interaction Utilities ---
T = TypeVar("T")

def batched(it: Iterable[T], n: int) -> Iterator[list[T]]:
    """Yields chunks of size n from an iterable."""
    it = iter(it)
    while batch := list(islice(it, n)):
        yield batch
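# Example: list(batched(range(5), 2)) -> [[0, 1], [2, 3], [4]]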
def stream_response(msg: str, history: Optional[list[Dict[str, str]]] = None, max_tokens: int = 500) -> Iterator[str]:
    """Streams responses from the LLM client with retry logic."""
    messages = [{"role": m["role"], "content": m["content"]} for m in (history or [])]
    messages.append({"role": "user", "content": msg})
    for attempt in range(3):  # Retry mechanism with exponential backoff
        try:
            for chunk in client.chat_completion(messages=messages, max_tokens=max_tokens, stream=True, top_p=0.8, seed=42):
                content = chunk.choices[0].delta.content
                if content:
                    yield content
            break  # Success
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
            print(f"LLM connection error (attempt {attempt+1}): {e}. Retrying in {2**attempt}s...")
            time.sleep(2**attempt)
        except Exception as e:
            print(f"Unexpected LLM error (attempt {attempt+1}): {e}. Retrying...")
            time.sleep(2**attempt)
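# Illustrative call (hits the live Inference API, so output varies and requires network access):
#   for token in stream_response("Suggest three dataset names about birds."):
#       print(token, end="")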
def generate_dataset_names(search_query: str, history: list[Dict[str, str]], is_real_data: bool = False, engine: Optional[str] = None) -> Iterator[str]:
    """Generates dataset names based on a search query using the LLM."""
    query = search_query[:1000] if search_query else ""
    if is_real_data and engine:
        prompt = (
            f"@Claude-3.7-Sonnet You are a data specialist who can transform real search results into structured datasets. "
            f"A user is searching for data about: \"{query}\" "
            f"Imagine you've queried {engine} and received real search results. Create a list of {MAX_NB_ITEMS_PER_GENERATION_CALL} specific datasets that could be created from these search results. "
            f"For each dataset: 1. Give it a clear, specific name related to the search topic. 2. Include 3-5 relevant tags in parentheses, with one tag specifying the ML task type (classification, regression, clustering, etc.). "
            f"Format each dataset as: 1. DatasetName (tag1, tag2, ml_task_tag). Make these datasets sound like real collections that could be created from {engine} search results on \"{query}\"."
        )
    else:
        prompt = GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY.format(search_query=query)
    full_response = ""
    for token in stream_response(prompt, history):
        full_response += token
        yield token  # Yield tokens for real-time display
    print(f"Generated dataset names for query '{search_query}'.")
    history.append({"role": "assistant", "content": full_response})  # History is updated in place, so no return value is needed
def generate_dataset_content(search_query: str, dataset_name: str, tags: str, history: list[Dict[str, str]], is_real_data: bool = False, engine: Optional[str] = None) -> Iterator[str]:
    """Generates the description and CSV preview for a dataset."""
    query = search_query[:1000] if search_query else ""
    if is_real_data and engine:
        prompt = (
            f"@Claude-3.7-Sonnet You're a specialist in converting web search results into structured data. "
            f"Based on search results from {engine} about \"{query}\", create a preview of the dataset \"{dataset_name}\" with tags \"{tags}\". "
            f"First, write a detailed description of what this dataset contains, its structure, and how it was constructed from web search results. "
            f"Then, generate a realistic 5-row CSV preview that resembles data you might get if you scraped and structured real results from {engine}. "
            f"Format your response with: **Dataset Description:** [detailed description] **CSV Content Preview:** ```csv [CSV header and 5 rows of realistic data] ``` "
            f"Include relevant columns for the dataset type, with proper labels/categories where appropriate. The data should look like it came from real sources."
        )
    else:
        prompt = GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(
            search_query=query, dataset_name=dataset_name, tags=tags
        )
    full_response = ""
    for token in stream_response(prompt, history):
        full_response += token
        yield token
    print(f"Generated content for dataset '{dataset_name}'.")
    history.append({"role": "assistant", "content": full_response})  # Update history in place
def _write_generator_to_queue(queue: Queue, func: Callable, kwargs: dict) -> None:
    """Helper to run a generator and put results (or errors) into a queue."""
    try:
        for i, result in enumerate(func(**kwargs)):
            queue.put((i, result))
    except Exception as e:
        queue.put((-1, str(e)))  # Signal error with index -1
    finally:
        queue.put(None)  # Signal completion

def iflatmap_unordered(func: Callable, kwargs_iterable: Iterable[dict]) -> Iterable[Any]:
    """Runs generator functions concurrently and yields results as they complete."""
    kwargs_list = list(kwargs_iterable)  # Materialize so len() works on any iterable
    if not kwargs_list:
        return
    queue = Queue()
    pool_size = max(1, min(len(kwargs_list), os.cpu_count() or 4))
    with ThreadPool(pool_size) as pool:
        async_results = [pool.apply_async(_write_generator_to_queue, (queue, func, kwargs)) for kwargs in kwargs_list]
        completed_generators = 0
        while completed_generators < len(async_results):
            try:
                result = queue.get(timeout=0.1)
                if result is None:  # Generator finished
                    completed_generators += 1
                    continue
                index, data = result
                if index == -1:  # Error occurred
                    print(f"Generator error: {data}")
                    continue  # Skip this result
                yield data  # Yield successful result
            except Empty:  # Timeout occurred, check if all threads are done
                if all(res.ready() for res in async_results) and queue.empty():
                    break
        for res in async_results:
            res.get(timeout=0.1)  # Ensure threads finish and surface exceptions
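# Minimal sketch of the concurrency pattern with a toy generator (not part of the app):
#   def count(n):
#       yield from range(n)
#   list(iflatmap_unordered(count, [{"n": 2}, {"n": 3}]))  # -> the values 0,1,0,1,2 in arbitrary order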
def generate_partial_dataset(
    title: str, content: str, search_query: str, variant: str, csv_header: str,
    output: list[Optional[dict]], indices_to_generate: list[int], history: list[Dict[str, str]],
    is_real_data: bool = False, engine: Optional[str] = None
) -> Iterator[int]:
    """Generates a batch of dataset rows for a specific variant."""
    dataset_name, tags = title.strip("# ").split("tags:", 1)  # Title format is '# Name\n\n tags: ...'
    dataset_name, tags = dataset_name.strip(), tags.strip()
    prompt = GENERATE_MORE_ROWS.format(csv_header=csv_header) + " " + variant
    # Construct initial messages for context
    if is_real_data and engine:
        initial_prompt = (
            f"@Claude-3.7-Sonnet You're a specialist in converting web search results into structured data. "
            f"Based on search results from {engine} about \"{search_query}\", create a preview of the dataset \"{dataset_name}\" with tags \"{tags}\". "
            f"First, write a detailed description of what this dataset contains, its structure, and how it was constructed from web search results. "
            f"Then, generate a realistic 5-row CSV preview that resembles data you might get if you scraped and structured real results from {engine}. "
            f"Format your response with: **Dataset Description:** [detailed description] **CSV Content Preview:** ```csv [CSV header and 5 rows of realistic data] ``` "
            f"Include relevant columns for the dataset type, with proper labels/categories where appropriate. The data should look like it came from real sources."
        )
    else:
        initial_prompt = GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(
            search_query=search_query, dataset_name=dataset_name, tags=tags
        )
    messages = [
        {"role": "user", "content": initial_prompt},
        {"role": "assistant", "content": title + "\n\n" + content},
        {"role": "user", "content": prompt},
    ]
    generated_samples = 0
    current_csv_chunk = ""
    in_csv_block = False

    def _flush_rows(chunk: str) -> Iterator[int]:
        """Parses the accumulated CSV chunk and writes any new rows into the shared output list."""
        nonlocal generated_samples
        temp_df = parse_csv_df(chunk.strip(), csv_header=csv_header)
        for record in temp_df.iloc[generated_samples:].to_dict('records'):
            if generated_samples >= len(indices_to_generate):
                break
            refined_record = refine_data_generic([record])[0]
            flags = detect_anomalies(refined_record)
            if flags:
                refined_record['_quality_flags'] = flags
            output_index = indices_to_generate[generated_samples]
            if output_index < len(output):
                output[output_index] = refined_record
                generated_samples += 1
                yield 1  # Signal progress

    for attempt in range(3):  # Retry logic
        try:
            for chunk in client.chat_completion(messages=messages, max_tokens=1500, stream=True, top_p=0.8, seed=42):
                token = chunk.choices[0].delta.content
                if not token:
                    continue
                if token.strip().startswith("```"):  # Fence token: ```csv opens the block, the bare ``` closes it
                    if in_csv_block and current_csv_chunk.strip():  # Block just ended: process the accumulated chunk
                        try:
                            yield from _flush_rows(current_csv_chunk)
                        except ValueError as e:
                            print(f"CSV parsing error: {e}")
                        except Exception as e:
                            print(f"CSV chunk processing error: {e}")
                        finally:
                            current_csv_chunk = ""  # Reset chunk
                    in_csv_block = not in_csv_block
                    continue
                if in_csv_block:  # Accumulate and parse incrementally while inside the CSV block
                    current_csv_chunk += token
                    try:
                        yield from _flush_rows(current_csv_chunk)
                    except ValueError:
                        pass  # CSV not complete yet
                    except Exception as e:
                        print(f"Incremental CSV processing error: {e}")
            if generated_samples >= len(indices_to_generate):
                break  # Target reached
            print(f"Retrying generation for variant '{variant}' (attempt {attempt+1})...")
            time.sleep(2**attempt)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
            print(f"Connection error (attempt {attempt+1}): {e}. Retrying...")
            time.sleep(2**attempt)
        except Exception as e:
            print(f"Unexpected error (attempt {attempt+1}): {e}. Retrying...")
            time.sleep(2**attempt)
def generate_variants(preview_df: pd.DataFrame) -> Iterator[str]:
    """Generates diverse prompts for creating dataset variants."""
    label_cols = [col for col in preview_df.columns if "label" in col.lower()]
    labels = preview_df[label_cols[0]].unique() if label_cols and len(preview_df[label_cols[0]].unique()) > 1 else []
    if len(labels):  # Prioritize label-based generation (len() avoids ambiguous truth value of a numpy array)
        rarities = ["pretty obvious", "common/regular", "unexpected but useful", "uncommon but still plausible", "rare/niche but still plausible"]
        for rarity in rarities:
            for label in labels:
                yield GENERATE_VARIANTS_WITH_RARITY_AND_LABEL.format(rarity=rarity, label=label)
    else:  # Fallback to general rarity prompts
        rarities = ["obvious", "expected", "common", "regular", "unexpected but useful", "original but useful", "specific but not far-fetched", "uncommon but still plausible", "rare but still plausible", "very niche but still plausible"]
        for rarity in rarities:
            yield GENERATE_VARIANTS_WITH_RARITY.format(rarity=rarity)
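# Example: a preview whose 'label' column holds {'spam', 'ham'} yields prompts such as
# "Focus on generating samples for the label 'spam' and ideally generate pretty obvious samples."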
# --- Gradio Interface ---
def whoami(token: str) -> Dict[str, Any]:
    """Fetches user information from the Hugging Face Hub API."""
    try:
        # whoami-v2 is the Hub's user-info endpoint
        response = requests.get("https://huggingface.co/api/whoami-v2", headers={"Authorization": f"Bearer {token}"}, timeout=5)
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Error fetching user info: {e}")
        return {"name": "User", "orgs": []}

def get_repo_visibility(repo_id: str, token: str) -> str:
    """Determines if a Hugging Face dataset repository is public or private."""
    try:
        # Dataset repo info lives under /api/datasets/{repo_id}
        response = requests.get(f"https://huggingface.co/api/datasets/{repo_id}", headers={"Authorization": f"Bearer {token}"}, timeout=5)
        if response.status_code == 404:
            return "public"  # Assume public if the repo doesn't exist yet
        response.raise_for_status()
        return "private" if response.json().get("private", False) else "public"
    except Exception as e:
        print(f"Unexpected error checking repo visibility for {repo_id}: {e}")
        return "public"
with gr.Blocks(css=css) as demo:
    generated_texts_state = gr.State((landing_page_datasets_generated_text,))  # State for generated dataset names
    current_dataset_state = gr.State(None)  # State to hold current dataset details for generation
    is_real_data_state = gr.State(True)  # State to track if real data is being used
    current_engine_state = gr.State(None)  # State to track the current search engine
    selected_engines_state = gr.State(["DuckDuckGo.com", "Bing.com", "Search.Yahoo.com", "Search.Brave.com", "Ecosia.org"])  # Default selected engines
    searchEngines = ["AlltheInternet.com", "DuckDuckGo.com", "Google.com", "Bing.com", "Search.Yahoo.com", "Startpage.com", "Qwant.com", "Ecosia.org", "WolframAlpha.com", "Mojeek.co.uk", "Search.Brave.com", "Yandex.com", "Baidu.com", "Gibiru.com", "MetaGer.org", "Swisscows.com", "Presearch.com", "Ekoru.org", "Search.Lilo.org"]
    # --- Search Page UI ---
    with gr.Column(visible=True, elem_id="search-page") as search_page:
        gr.Markdown("# 🤗 Infinite Dataset Hub ♾️\n\nAn endless catalog of datasets, created just for you by an AI model.")
        with gr.Row():
            search_bar = gr.Textbox(max_lines=1, placeholder="Search datasets, get infinite results", show_label=False, container=False, scale=9)
            search_button = gr.Button("🔍", variant="primary", scale=1)
        button_groups: list[gr.Group] = []  # Holds the groups for dataset buttons
        buttons: list[gr.Button] = []  # Holds the actual dataset name and tag buttons
        for i in range(MAX_TOTAL_NB_ITEMS):
            if i < len(default_output):  # Use default datasets initially
                line = default_output[i]
                try:
                    dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" (", 1)
                except ValueError:
                    dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" ", 1)[0], ""
                group_classes, name_classes, tag_classes = "buttonsGroup", "topButton", "bottomButton"
            else:  # Placeholders for future datasets
                dataset_name, tags = "⬜⬜⬜⬜⬜⬜", "░░░░, ░░░░, ░░░░"
                group_classes, name_classes, tag_classes = "buttonsGroup invisibleButtonGroup", "topButton linear-background", "bottomButton linear-background"
            with gr.Group(elem_classes=group_classes) as button_group:
                button_groups.append(button_group)
                dataset_btn = gr.Button(dataset_name, elem_classes=name_classes)
                tags_btn = gr.Button(tags, elem_classes=tag_classes)
                buttons.append(dataset_btn)
                buttons.append(tags_btn)
        load_more_datasets = gr.Button("Load more datasets")
        gr.Markdown(f"_powered by [{model_id}](https://huggingface.co/{model_id})_")
    # --- Settings Panel ---
    with gr.Column(scale=4, min_width=200):
        with gr.Accordion("Settings", open=False, elem_classes="settings"):
            gr.Markdown("Manage your Hugging Face account and dataset saving options.")
            gr.LoginButton()
            select_namespace_dropdown = gr.Dropdown(choices=[NAMESPACE], value=NAMESPACE, label="Hugging Face Namespace", visible=False)
            gr.Markdown("Dataset Generation Mode")
            refinement_mode = gr.Radio(
                ["sourceless", "sourced"], value="sourceless", label="Refinement Mode",
                info="Sourceless: AI generates data freely. Sourced: AI uses loaded data for context and refinement."
            )
            with gr.Group(visible=False) as source_group:  # Dynamic section for source loading
                source_type = gr.Dropdown(
                    choices=["csv_url", "xlsx_url", "local_csv", "local_xlsx"], value="csv_url",
                    label="Source Type", info="Select the format of your data source."
                )
                source_path = gr.Textbox(
                    label="Source Path/URL", placeholder="Enter URL or local file path",
                    info="Provide the location of your dataset file."
                )
                load_source_button = gr.Button("Load Source Data", icon="https://huggingface.co/datasets/huggingface/badges/resolve/main/badge-files/data.svg")
                source_status = gr.Markdown("", visible=False)
            visibility_radio = gr.Radio(
                ["public", "private"], value="public", container=False, interactive=False,
                label="Dataset Visibility", info="Set visibility for datasets saved to Hugging Face Hub."
            )
            # Search Engine Settings
            gr.Markdown("Search Engine Configuration")
            data_source_toggle = gr.Checkbox(label="Use Real Search Data", value=True, info="Toggle to include results from real search engines.")
            engine_settings_button = gr.Button("Configure Search Engines", icon="https://img.icons8.com/ios-filled/50/000000/settings--v1.png", size="sm")
    # Engine Selection Panel (gr.Modal is not part of core Gradio; a hidden Column serves as a simple stand-in)
    with gr.Column(visible=False, elem_id="engine-modal") as engine_modal:
        gr.Markdown("### Search Engine Settings\n\nSelect which search engines to use for real data retrieval. A diverse selection improves results.")
        # A CheckboxGroup replaces the original raw-HTML checkbox list so Gradio can read the selection directly
        engine_options_checkboxes = gr.CheckboxGroup(choices=searchEngines, value=selected_engines_state.value, show_label=False, elem_id="engine-options")
        with gr.Row():
            select_all_engines_btn = gr.Button("Select All")
            deselect_all_engines_btn = gr.Button("Deselect All")
            save_engines_btn = gr.Button("Save Settings", variant="primary")
    # --- Dataset Detail Page UI ---
    with gr.Column(visible=False, elem_id="dataset-page") as dataset_page:
        gr.Markdown("# 🤗 Infinite Dataset Hub ♾️\n\nAn endless catalog of datasets, created just for you.")
        dataset_title_md = gr.Markdown()  # Dataset name and tags
        dataset_source_badge = gr.Markdown()  # Badge indicating real/AI data
        dataset_source_info = gr.Markdown()  # Details about the data source
        dataset_description_md = gr.Markdown()  # Dataset description
        preview_table_comp = gr.DataFrame(visible=False, interactive=False, wrap=True)  # Holds the preview CSV
        with gr.Row():
            generate_full_dataset_button = gr.Button("Generate Full Dataset", variant="primary")
            save_dataset_button = gr.Button("💾 Save Dataset", variant="primary", visible=False)
        open_dataset_message = gr.Markdown("", visible=False)  # Confirmation message
        dataset_share_button = gr.Button("Share Dataset URL")
        dataset_share_textbox = gr.Textbox(visible=False, show_copy_button=True, label="Copy this URL:", interactive=False, show_label=True)
        full_dataset_section = gr.Column(visible=False)  # Container for full dataset and downloads
        full_table_comp = gr.DataFrame(visible=False, interactive=False, wrap=True)
        with gr.Row():
            download_csv_button = gr.Button("Download CSV")
            download_json_button = gr.Button("Download JSON")
            download_parquet_button = gr.Button("Download Parquet")
        back_button = gr.Button("< Back", size="sm")
    # --- Event Handlers ---
    # Search Logic
    def _update_search_results(search_query: str, current_generated_texts: tuple[str], is_real_data: bool, engine: Optional[str]):
        """Handles dataset search and UI updates."""
        # Reset UI to loading state
        yield {btn: gr.Button("⬜⬜⬜⬜⬜⬜", elem_classes="topButton linear-background") for btn in buttons[::2]}
        yield {btn: gr.Button("░░░░, ░░░░, ░░░░", elem_classes="bottomButton linear-background") for btn in buttons[1::2]}
        yield {group: gr.Group(elem_classes="buttonsGroup invisibleButtonGroup") for group in button_groups}
        generated_count = 0
        new_texts = ""
        current_line = ""
        try:
            # The LLM streams tokens, so accumulate them into complete lines before parsing
            for token in generate_dataset_names(search_query, [], is_real_data=is_real_data, engine=engine):
                current_line += token
                while "\n" in current_line and generated_count < MAX_NB_ITEMS_PER_GENERATION_CALL:
                    line, current_line = current_line.split("\n", 1)
                    if "I'm sorry" in line or "policy" in line:
                        raise gr.Error("Inappropriate content detected.")
                    match = re.match(r"^\s*\d+\.\s+(.+?)\s+\((.+?)\)", line)  # Lines look like '1. DatasetName (tag1, tag2)'
                    if match:
                        dataset_name, tags = (g.strip() for g in match.groups())
                        new_texts += line + "\n"
                        # Update buttons with generated data
                        yield {
                            buttons[2 * generated_count]: gr.Button(dataset_name, elem_classes="topButton"),
                            buttons[2 * generated_count + 1]: gr.Button(tags, elem_classes="bottomButton"),
                        }
                        generated_count += 1
                if generated_count >= MAX_NB_ITEMS_PER_GENERATION_CALL:
                    break
            # Update state and make new buttons visible
            new_history = (current_generated_texts + (new_texts,)) if current_generated_texts else (landing_page_datasets_generated_text + "\n" + new_texts,)
            yield {generated_texts_state: new_history}
            yield {group: gr.Group(elem_classes="buttonsGroup") for group in button_groups[:generated_count]}
        except gr.Error as e:
            raise e  # Propagate Gradio errors
        except Exception as e:
            raise gr.Error(f"Failed to generate datasets: {str(e)}")
    # Attach search handlers
    search_button.click(
        _update_search_results,
        inputs=[search_bar, generated_texts_state, is_real_data_state, current_engine_state],
        outputs=buttons + [generated_texts_state] + button_groups,
    )
    search_bar.submit(
        _update_search_results,
        inputs=[search_bar, generated_texts_state, is_real_data_state, current_engine_state],
        outputs=buttons + [generated_texts_state] + button_groups,
    )
    # Load More Datasets
    load_more_datasets.click(
        _update_search_results,
        inputs=[search_bar, generated_texts_state, is_real_data_state, current_engine_state],
        outputs=buttons + [generated_texts_state] + button_groups,
    )
    # Display Single Dataset Details
    def _show_dataset_details(search_query, dataset_name, tags, is_real_data, engine):
        """Switches to detail view and loads dataset content."""
        yield {
            search_page: gr.Column(visible=False), dataset_page: gr.Column(visible=True),
            dataset_title_md: f"# {dataset_name}\n\n tags: {tags}",
            dataset_share_textbox: gr.Textbox(visible=False),
            full_dataset_section: gr.Column(visible=False),
            save_dataset_button: gr.Button(visible=False),
            open_dataset_message: gr.Markdown("", visible=False),
        }
        # Update source badge and info
        if is_real_data:
            badge_html = gr.Markdown('<span class="px-3 py-1 rounded-full text-xs font-medium bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-200">Real Data</span>', visible=True)
            info_html = gr.Markdown(f'This dataset is based on real information queried from <strong>{engine}</strong> for the search term "<strong>{search_query}</strong>". The data has been structured for machine learning use.', visible=True)
        else:
            badge_html = gr.Markdown('<span class="px-3 py-1 rounded-full text-xs font-medium bg-purple-100 text-purple-800 dark:bg-purple-900 dark:text-purple-200">AI-Generated</span>', visible=True)
            info_html = gr.Markdown(f'This is an AI-generated dataset created using {model_id}. The content is synthetic and designed to represent plausible data related to "{search_query}".', visible=True)
        yield {dataset_source_badge: badge_html, dataset_source_info: info_html}
        # Stream content generation, accumulating tokens so the description grows instead of being overwritten
        full_content = ""
        for token in generate_dataset_content(search_query, dataset_name, tags, [], is_real_data=is_real_data, engine=engine):
            full_content += token
            yield {dataset_description_md: full_content}
    # Link buttons to the detail view; each handler is bound to its dataset index via functools.partial,
    # since Gradio does not tell a shared handler which button fired
    def _show_dataset_from_button_wrapper(dataset_index, search_query, is_real_data, engine, *buttons_values):
        dataset_name, tags = buttons_values[2 * dataset_index], buttons_values[2 * dataset_index + 1]
        yield from _show_dataset_details(search_query, dataset_name, tags, is_real_data, engine)
    # Wire up click events for all dataset name and tag buttons
    _detail_outputs = [search_page, dataset_page, dataset_title_md, dataset_description_md, dataset_source_badge, dataset_source_info, dataset_share_textbox, full_dataset_section, save_dataset_button, open_dataset_message]
    for i, (name_btn, tag_btn) in enumerate(batched(buttons, 2)):
        handler = partial(_show_dataset_from_button_wrapper, i)
        name_btn.click(handler, inputs=[search_bar, is_real_data_state, current_engine_state, *buttons], outputs=_detail_outputs)
        tag_btn.click(handler, inputs=[search_bar, is_real_data_state, current_engine_state, *buttons], outputs=_detail_outputs)
    # Back Button Navigation
    back_button.click(lambda: (gr.Column(visible=True), gr.Column(visible=False)), outputs=[search_page, dataset_page], js="""
        function() {
            if ('parentIFrame' in window) { window.parentIFrame.scrollTo({top: 0, behavior: 'smooth'}); }
            else { window.scrollTo({ top: 0, behavior: 'smooth' }); }
            return Array.from(arguments);
        }
    """)
    # Full Dataset Generation
    def _generate_full_dataset(title_md, content_md, search_query, namespace, visibility, mode, is_real_data, engine):
        # Extract dataset name and tags from the markdown title
        try:
            dataset_name = title_md.split('\n')[0].strip('# ')
            tags = title_md.split('tags:', 1)[1].strip()
        except IndexError:
            raise gr.Error("Could not parse dataset title.")
        try:
            csv_header, preview_df = parse_preview_df(content_md)
        except ValueError as e:
            raise gr.Error(f"Failed to parse preview: {e}")
        refined_preview_df = refine_preview_data(preview_df, mode)
        output_data: list[Optional[dict]] = [None] * NUM_ROWS  # Initialize output structure
        initial_rows = refined_preview_df.to_dict('records')
        for i, record in enumerate(initial_rows):
            if i < NUM_ROWS:
                output_data[i] = {"idx": i, **record}
        # Update UI: show preview, disable generate, show save button
        yield {
            full_table_comp: gr.DataFrame(pd.DataFrame([r for r in output_data if r]), visible=True),
            generate_full_dataset_button: gr.Button(interactive=False),
            save_dataset_button: gr.Button(f"💾 Save {namespace}/{dataset_name}" + (" (private)" if visibility != "public" else ""), visible=True, interactive=False),
            full_dataset_section: gr.Column(visible=True),
        }
        # Prepare generation tasks for variants
        generation_tasks = []
        variants = islice(generate_variants(refined_preview_df), NUM_VARIANTS)
        for i, variant in enumerate(variants):
            indices = list(range(len(initial_rows) + i, NUM_ROWS, NUM_VARIANTS))
            if indices:  # Only create a task if there are rows left to generate
                generation_tasks.append({
                    "func": generate_partial_dataset,
                    "kwargs": {
                        "title": title_md, "content": content_md, "search_query": search_query, "variant": variant,
                        "csv_header": csv_header, "output": output_data, "indices_to_generate": indices,
                        "history": [],  # Use fresh history for each variant task
                        "is_real_data": is_real_data, "engine": engine,
                    },
                })
        # Execute tasks in parallel and update the UI progressively
        for _ in iflatmap_unordered(lambda func, kwargs: func(**kwargs), generation_tasks):
            yield {full_table_comp: pd.DataFrame([r for r in output_data if r])}  # Update DataFrame display
        yield {save_dataset_button: gr.Button(interactive=True)}  # Enable save button
        print(f"Full dataset generation complete for {dataset_name}.")
    # Wiring assumes the Markdown title/description can be read back as plain strings
    generate_full_dataset_button.click(
        _generate_full_dataset,
        inputs=[dataset_title_md, dataset_description_md, search_bar, select_namespace_dropdown, visibility_radio, refinement_mode, is_real_data_state, current_engine_state],
        outputs=[full_table_comp, generate_full_dataset_button, save_dataset_button, full_dataset_section],
    )
    # Save Dataset to Hugging Face Hub
    def _save_dataset(title_md, content_md, search_query, df, namespace, visibility, oauth_token: Optional[gr.OAuthToken]):
        # Extract dataset name and tags from the markdown title
        try:
            dataset_name = title_md.split('\n')[0].strip('# ')
            tags = title_md.split('tags:', 1)[1].strip()
        except IndexError:
            raise gr.Error("Could not parse dataset title.")
        token = oauth_token.token if oauth_token else save_dataset_hf_token
        if not token:
            raise gr.Error("Login required or set SAVE_DATASET_HF_TOKEN.")
        repo_id = f"{namespace}/{dataset_name}"
        dataset_url_params = f"q={search_query.replace(' ', '+')}&dataset={dataset_name.replace(' ', '+')}&tags={tags.replace(' ', '+')}"
        dataset_url = f"{URL}?{dataset_url_params}"
        gr.Info("Saving dataset...")
        yield {save_dataset_button: gr.Button(interactive=False)}  # Disable button during save
        try:
            create_repo(repo_id=repo_id, repo_type="dataset", private=visibility != "public", exist_ok=True, token=token)
            df.to_csv(f"hf://datasets/{repo_id}/data.csv", storage_options={"token": token}, index=False)
            card_content = DATASET_CARD_CONTENT.format(title=title_md, content=content_md, url=URL, dataset_url=dataset_url, model_id=model_id, search_query=search_query)
            DatasetCard(card_content).push_to_hub(repo_id=repo_id, repo_type="dataset", token=token)
            success_msg = f"# 🎉 Yay! Dataset saved to [{repo_id}](https://huggingface.co/datasets/{repo_id})!\n\n_PS: Check Settings to manage your saved datasets._"
            gr.Info("Dataset saved successfully.")
            yield {open_dataset_message: gr.Markdown(success_msg, visible=True)}
        except HfHubHTTPError as e:
            raise gr.Error(f"HF Hub error: {e}")
        except Exception as e:
            raise gr.Error(f"Save failed: {str(e)}")
        finally:
            yield {save_dataset_button: gr.Button(interactive=True)}  # Re-enable button
    save_dataset_button.click(
        _save_dataset,
        inputs=[dataset_title_md, dataset_description_md, search_bar, full_table_comp, select_namespace_dropdown, visibility_radio],
        outputs=[save_dataset_button, open_dataset_message],
    )
    # Shareable URL Generation
    def _show_share_url(title_md, search_query):
        try:
            dataset_name = title_md.split('\n')[0].strip('# ')
            tags = title_md.split('tags:', 1)[1].strip()
        except IndexError:
            raise gr.Error("Could not parse dataset title.")
        share_url = f"{URL}?q={search_query.replace(' ', '+')}&dataset={dataset_name.replace(' ', '+')}&tags={tags.replace(' ', '+')}"
        return gr.Textbox(share_url, visible=True)
    dataset_share_button.click(_show_share_url, inputs=[dataset_title_md, search_bar], outputs=[dataset_share_textbox])
    # Settings Toggles (State outputs take raw values, not gr.State wrappers)
    refinement_mode.change(lambda mode: gr.Group(visible=(mode == "sourced")), inputs=[refinement_mode], outputs=[source_group])
    data_source_toggle.change(
        lambda value, engines: (value, (engines[0] if engines else "DuckDuckGo.com") if value else None),
        inputs=[data_source_toggle, selected_engines_state],
        outputs=[is_real_data_state, current_engine_state],
    )
    def _load_source_data(source_type, source_path):
        if not source_path:
            raise gr.Error("Source path/URL is required.")
        try:
            knowledge_base.load_source(source_type, source_path)
            gr.Info("Source data loaded.")
            return gr.Markdown("✅ Source loaded successfully", visible=True)
        except (ConnectionError, ValueError, RuntimeError) as e:
            raise gr.Error(f"Failed to load source: {str(e)}")
    load_source_button.click(_load_source_data, inputs=[source_type, source_path], outputs=[source_status])
    # Engine Settings Logic (the CheckboxGroup defined above replaces the original raw-HTML checkbox
    # list, so the selection is read directly instead of round-tripping JSON through the page)
    def _save_engine_settings(selected_engines):
        if not selected_engines:
            gr.Warning("At least one search engine must be selected. Using DuckDuckGo as default.")
            selected_engines = ["DuckDuckGo.com"]
        current_engine = selected_engines[0]
        gr.Info(f"Updated search engines. Using {len(selected_engines)} engine(s).")
        return selected_engines, current_engine, gr.Column(visible=False)  # Also hides the settings panel
    engine_settings_button.click(lambda: gr.Column(visible=True), outputs=[engine_modal])  # Open the panel
    select_all_engines_btn.click(lambda: gr.CheckboxGroup(value=searchEngines), outputs=[engine_options_checkboxes])
    deselect_all_engines_btn.click(lambda: gr.CheckboxGroup(value=[]), outputs=[engine_options_checkboxes])
    save_engines_btn.click(
        _save_engine_settings,
        inputs=[engine_options_checkboxes],
        outputs=[selected_engines_state, current_engine_state, engine_modal],
    )
    # --- Initial App Load Logic ---
    def _load_app(request: gr.Request, oauth_token: Optional[gr.OAuthToken]):
        # Handle user login and namespace selection
        if oauth_token:
            try:
                user_info = whoami(oauth_token.token)
                namespaces = [user_info["name"]] + [org["name"] for org in user_info.get("orgs", [])]
                yield {
                    select_namespace_dropdown: gr.Dropdown(choices=namespaces, value=user_info["name"], visible=True),
                    visibility_radio: gr.Radio(interactive=True),
                }
            except Exception:  # Fallback if user info fails
                yield {
                    select_namespace_dropdown: gr.Dropdown(choices=[NAMESPACE], value=NAMESPACE, visible=True),
                    visibility_radio: gr.Radio(interactive=True),
                }
        else:  # Default settings if not logged in
            yield {
                select_namespace_dropdown: gr.Dropdown(choices=[NAMESPACE], value=NAMESPACE, visible=True),
                visibility_radio: gr.Radio(interactive=False),
            }
        # Handle URL parameters for direct search or dataset loading
        query_params = dict(request.query_params)
        if "dataset" in query_params:
            is_real = query_params.get("engine") is not None
            engine = query_params.get("engine")
            yield from _show_dataset_details(query_params.get("q", query_params["dataset"]), query_params["dataset"], query_params.get("tags", ""), is_real, engine)
            yield {is_real_data_state: is_real, current_engine_state: engine}
        elif "q" in query_params:
            search_query = query_params["q"]
            is_real = query_params.get("engine") is not None
            engine = query_params.get("engine")
            yield {search_bar: search_query}
            yield {is_real_data_state: is_real, current_engine_state: engine}
            yield from _update_search_results(search_query, (), is_real, engine)
        else:
            yield {search_page: gr.Column(visible=True)}  # Show search page by default
            # Initialize with default datasets
            initial_outputs = {}
            for i, line in enumerate(default_output):
                try:
                    dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" (", 1)
                except ValueError:
                    dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" ", 1)[0], ""
                initial_outputs[buttons[2 * i]] = gr.Button(dataset_name, elem_classes="topButton")
                initial_outputs[buttons[2 * i + 1]] = gr.Button(tags, elem_classes="bottomButton")
                initial_outputs[button_groups[i]] = gr.Group(elem_classes="buttonsGroup")
            yield initial_outputs
            yield {generated_texts_state: (landing_page_datasets_generated_text,)}
        # Initialize engine settings UI
        yield {
            data_source_toggle: gr.Checkbox(value=is_real_data_state.value),
            engine_options_checkboxes: gr.CheckboxGroup(value=selected_engines_state.value),
        }
    # Kick off the load logic at startup; the outputs list must cover every component the generator can yield
    demo.load(
        _load_app,
        outputs=[select_namespace_dropdown, visibility_radio, search_bar, search_page, dataset_page,
                 dataset_title_md, dataset_description_md, dataset_source_badge, dataset_source_info,
                 dataset_share_textbox, full_dataset_section, save_dataset_button, open_dataset_message,
                 is_real_data_state, current_engine_state, generated_texts_state, data_source_toggle,
                 engine_options_checkboxes] + buttons + button_groups,
    )
if __name__ == "__main__": | |
demo.launch(share=False, server_name="0.0.0.0") |