import io
import os
import re
import time
import requests
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Set, TypeVar, Union
from difflib import get_close_matches
from pathlib import Path
from itertools import islice
from functools import partial
from multiprocessing.pool import ThreadPool
from queue import Queue, Empty
import gradio as gr
import pandas as pd
import requests.exceptions
from huggingface_hub import InferenceClient, create_repo, DatasetCard
from huggingface_hub.utils import HfHubHTTPError
import json
# --- Configuration ---
model_id = "microsoft/Phi-3-mini-4k-instruct"
client = InferenceClient(model_id)
save_dataset_hf_token = os.environ.get("SAVE_DATASET_HF_TOKEN")
MAX_TOTAL_NB_ITEMS = 100
MAX_NB_ITEMS_PER_GENERATION_CALL = 10
NUM_ROWS = 100
NUM_VARIANTS = 10
NAMESPACE = "infinite-dataset-hub"
URL = "https://huggingface.co/spaces/infinite-dataset-hub/infinite-dataset-hub"
# --- Prompt Templates ---
GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY = (
"A Machine Learning Practioner is looking for a dataset that matches '{search_query}'. "
f"Generate a list of {MAX_NB_ITEMS_PER_GENERATION_CALL} names of quality datasets that don't exist but sound plausible and would "
"be helpful. Feel free to reuse words from the query '{search_query}' to name the datasets. "
"Every dataset should be about '{search_query}' and have descriptive tags/keywords including the ML task name associated with the dataset (classification, regression, anomaly detection, etc.). Use the following format:\n1. DatasetName1 (tag1, tag2, tag3)\n2. DatasetName2 (tag1, tag2, tag3)"
)
GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS = (
"An ML practitioner is looking for a dataset CSV after the query '{search_query}'. "
"Generate the first 5 rows of a plausible and quality CSV for the dataset '{dataset_name}'. "
"You can get inspiration from related keywords '{tags}' but most importantly the dataset should correspond to the query '{search_query}'. "
"Focus on quality text content and use a 'label' or 'labels' column if it makes sense (invent labels, avoid reusing the keywords, be accurate while labelling texts). "
"Reply using a short description of the dataset with title **Dataset Description:** followed by the CSV content in a code block and with title **CSV Content Preview:**."
)
GENERATE_MORE_ROWS = "Can you give me 10 additional samples in CSV format as well? Use the same CSV header '{csv_header}'."
GENERATE_VARIANTS_WITH_RARITY_AND_LABEL = "Focus on generating samples for the label '{label}' and ideally generate {rarity} samples."
GENERATE_VARIANTS_WITH_RARITY = "Focus on generating {rarity} samples."
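# Illustrative usage (not executed): the prompt templates above are filled in with str.format, e.g.
# GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY.format(search_query="urban air quality")
# GENERATE_MORE_ROWS.format(csv_header="text,label")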
# --- Default Datasets for Landing Page ---
landing_page_datasets_generated_text = """
1. NewsEventsPredict (classification, media, trend)
2. FinancialForecast (economy, stocks, regression)
3. HealthMonitor (science, real-time, anomaly detection)
4. SportsAnalysis (classification, performance, player tracking)
5. SciLiteracyTools (language modeling, science literacy, text classification)
6. RetailSalesAnalyzer (consumer behavior, sales trend, segmentation)
7. SocialSentimentEcho (social media, emotion analysis, clustering)
8. NewsEventTracker (classification, public awareness, topical clustering)
9. HealthVitalSigns (anomaly detection, biometrics, prediction)
10. GameStockPredict (classification, finance, sports contingency)
"""
default_output = landing_page_datasets_generated_text.strip().split("\n")
assert len(default_output) == MAX_NB_ITEMS_PER_GENERATION_CALL
# --- Dataset Card Template ---
DATASET_CARD_CONTENT = """
---
license: mit
tags:
- infinite-dataset-hub
- synthetic
---
{title}
_Note: This is an AI-generated dataset so its content may be inaccurate or false_
{content}
**Source of the data:**
The dataset was generated using the [Infinite Dataset Hub]({url}) and {model_id} using the query '{search_query}':
- **Dataset Generation Page**: {dataset_url}
- **Model**: https://huggingface.co/{model_id}
- **More Datasets**: https://huggingface.co/datasets?other=infinite-dataset-hub
"""
# --- Gradio HTML ---
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Infinite Dataset Hub</title>
<script src="https://cdn.tailwindcss.com"></script>
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/papaparse@5.3.0/papaparse.min.js"></script>
<script>
tailwind.config = {
darkMode: 'class',
theme: {
extend: {
colors: {
primary: '#5D5CDE',
},
}
}
}
</script>
<style>
.shimmer {
background: linear-gradient(90deg, #f0f0f0 25%, #e0e0e0 50%, #f0f0f0 75%);
background-size: 200% 100%;
animation: shimmer 1.5s infinite;
border-radius: 4px;
}
@keyframes shimmer {
0% {
background-position: -200% 0;
}
100% {
background-position: 200% 0;
}
}
/* Dark mode overrides */
.dark .shimmer {
background: linear-gradient(90deg, #2a2a2a 25%, #3a3a3a 50%, #2a2a2a 75%);
background-size: 200% 100%;
}
.dataset-card {
transition: transform 0.2s, box-shadow 0.2s;
}
.dataset-card:hover {
transform: translateY(-2px);
box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);
}
.dark .dataset-card:hover {
box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.3), 0 4px 6px -2px rgba(0, 0, 0, 0.2);
}
/* Table styling */
table {
width: 100%;
border-collapse: collapse;
margin: 1rem 0;
}
table thead th {
background-color: #f3f4f6;
padding: 0.75rem;
text-align: left;
font-weight: 600;
}
.dark table thead th {
background-color: #374151;
}
table tbody td {
padding: 0.75rem;
border-top: 1px solid #e5e7eb;
}
.dark table tbody td {
border-top: 1px solid #4b5563;
}
table tbody tr:nth-child(even) {
background-color: #f9fafb;
}
.dark table tbody tr:nth-child(even) {
background-color: #1f2937;
}
/* Search engine badge */
.engine-badge {
position: absolute;
top: -8px;
right: -8px;
font-size: 0.7rem;
padding: 2px 6px;
border-radius: 9999px;
background-color: #5D5CDE;
color: white;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.dark .engine-badge {
box-shadow: 0 2px 4px rgba(0,0,0,0.3);
}
/* Toggle switch */
.toggle-switch {
position: relative;
display: inline-block;
width: 50px;
height: 24px;
}
.toggle-switch input {
opacity: 0;
width: 0;
height: 0;
}
.toggle-slider {
position: absolute;
cursor: pointer;
top: 0;
left: 0;
right: 0;
bottom: 0;
background-color: #ccc;
transition: .4s;
border-radius: 24px;
}
.toggle-slider:before {
position: absolute;
content: "";
height: 16px;
width: 16px;
left: 4px;
bottom: 4px;
background-color: white;
transition: .4s;
border-radius: 50%;
}
input:checked + .toggle-slider {
background-color: #5D5CDE;
}
input:checked + .toggle-slider:before {
transform: translateX(26px);
}
</style>
</head>
<body class="bg-white dark:bg-gray-900 text-gray-800 dark:text-gray-200 min-h-screen">
<!-- Dark mode detection -->
<script>
if (window.matchMedia && window.matchMedia('(prefers-color-scheme: dark)').matches) {
document.documentElement.classList.add('dark');
}
window.matchMedia('(prefers-color-scheme: dark)').addEventListener('change', event => {
if (event.matches) {
document.documentElement.classList.add('dark');
} else {
document.documentElement.classList.remove('dark');
}
});
</script>
<div class="container mx-auto px-4 py-8">
<!-- Header -->
<header class="text-center mb-8">
<h1 class="text-3xl font-bold mb-2">🤗 Infinite Dataset Hub ♾️</h1>
<p class="text-lg text-gray-600 dark:text-gray-400">Generate datasets from AI and real-world data sources</p>
</header>
<!-- Main Content -->
<main>
<!-- Search Section -->
<div id="search-page" class="mb-8">
<div class="max-w-3xl mx-auto">
<div class="mb-4">
<div class="flex mb-2">
<input id="search-input" type="text" placeholder="Search datasets, get infinite results"
class="flex-grow px-4 py-3 text-base rounded-l-lg border border-gray-300 dark:border-gray-700 focus:outline-none focus:ring-2 focus:ring-primary dark:bg-gray-800">
<button id="search-button" class="bg-primary text-white px-6 py-3 rounded-r-lg hover:bg-opacity-90 transition">
🔍
</button>
</div>
<div class="flex items-center justify-between p-3 bg-gray-100 dark:bg-gray-800 rounded-lg">
<div class="flex items-center">
<label class="toggle-switch mr-3">
<input type="checkbox" id="data-source-toggle" checked>
<span class="toggle-slider"></span>
</label>
<div>
<span id="data-source-text" class="font-medium">Using: Real + AI Data</span>
<p class="text-xs text-gray-500 dark:text-gray-400">Toggle to switch between data sources</p>
</div>
</div>
<button id="engine-settings-button" class="text-primary hover:underline flex items-center">
<svg xmlns="http://www.w3.org/2000/svg" class="h-5 w-5 mr-1" viewBox="0 0 20 20" fill="currentColor">
<path fill-rule="evenodd" d="M11.49 3.17c-.38-1.56-2.6-1.56-2.98 0a1.532 1.532 0 01-2.286.948c-1.372-.836-2.942.734-2.106 2.106.54.886.061 2.042-.947 2.287-1.561.379-1.561 2.6 0 2.978a1.532 1.532 0 01.947 2.287c-.836 1.372.734 2.942 2.106 2.106a1.532 1.532 0 012.287.947c.379 1.561 2.6 1.561 2.978 0a1.533 1.533 0 012.287-.947c1.372.836 2.942-.734 2.106-2.106a1.533 1.533 0 01.947-2.287c1.561-.379 1.561-2.6 0-2.978a1.532 1.532 0 01-.947-2.287c.836-1.372-.734-2.942-2.106-2.106a1.532 1.532 0 01-2.287-.947zM10 13a3 3 0 100-6 3 3 0 000 6z" clip-rule="evenodd" />
</svg>
Search Engines
</button>
</div>
</div>
<!-- Search Engine Selection Modal -->
<div id="engine-modal" class="fixed inset-0 bg-black bg-opacity-50 flex items-center justify-center z-50 hidden">
<div class="bg-white dark:bg-gray-800 rounded-lg p-6 max-w-lg w-full max-h-[80vh] overflow-y-auto">
<div class="flex justify-between items-center mb-4">
<h3 class="text-xl font-bold">Search Engine Settings</h3>
<button id="close-modal-button" class="text-gray-500 hover:text-gray-700 dark:text-gray-400 dark:hover:text-gray-200">
<svg xmlns="http://www.w3.org/2000/svg" class="h-6 w-6" fill="none" viewBox="0 0 24 24" stroke="currentColor">
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M6 18L18 6M6 6l12 12" />
</svg>
</button>
</div>
<p class="mb-4 text-sm text-gray-600 dark:text-gray-400">
Select which search engines to use for real data retrieval. A diverse selection improves results.
</p>
<div id="engine-options" class="space-y-2 mb-6">
<!-- Engine options will be dynamically inserted here -->
</div>
<div class="flex justify-between">
<button id="select-all-engines" class="text-primary hover:underline">Select All</button>
<button id="deselect-all-engines" class="text-primary hover:underline">Deselect All</button>
</div>
<div class="mt-6 flex justify-end">
<button id="save-engines-button" class="bg-primary text-white px-4 py-2 rounded hover:bg-opacity-90 transition">
Save Settings
</button>
</div>
</div>
</div>
<div id="dataset-results" class="grid grid-cols-1 md:grid-cols-2 gap-4 mt-6">
<!-- Dataset cards will be dynamically inserted here -->
</div>
<div id="load-more-container" class="text-center mt-6 hidden">
<button id="load-more-button" class="bg-gray-200 dark:bg-gray-700 px-6 py-3 rounded-lg hover:bg-gray-300 dark:hover:bg-gray-600 transition">
Load more datasets
</button>
</div>
</div>
</div>
<!-- Dataset Detail Page -->
<div id="dataset-page" class="hidden max-w-4xl mx-auto">
<button id="back-button" class="flex items-center text-primary mb-4 hover:underline">
<svg xmlns="http://www.w3.org/2000/svg" class="h-5 w-5 mr-1" viewBox="0 0 20 20" fill="currentColor">
<path fill-rule="evenodd" d="M9.707 14.707a1 1 0 01-1.414 0l-4-4a1 1 0 010-1.414l4-4a1 1 0 011.414 1.414L7.414 9H15a1 1 0 110 2H7.414l2.293 2.293a1 1 0 010 1.414z" clip-rule="evenodd" />
</svg>
Back to Search
</button>
<div id="dataset-header" class="mb-4">
<div class="flex items-center justify-between">
<h2 id="dataset-title" class="text-2xl font-bold"></h2>
<span id="data-source-badge" class="px-3 py-1 rounded-full text-xs font-medium bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-200">
Real Data
</span>
</div>
<div id="dataset-tags" class="text-sm text-gray-600 dark:text-gray-400 mt-1"></div>
</div>
<div id="data-source-info" class="bg-blue-50 dark:bg-blue-900 p-4 rounded-lg mb-6 text-blue-800 dark:text-blue-200">
<h3 class="font-semibold mb-1 flex items-center">
<svg xmlns="http://www.w3.org/2000/svg" class="h-5 w-5 mr-1" viewBox="0 0 20 20" fill="currentColor">
<path fill-rule="evenodd" d="M18 10a8 8 0 11-16 0 8 8 0 0116 0zm-7-4a1 1 0 11-2 0 1 1 0 012 0zM9 9a1 1 0 000 2v3a1 1 0 001 1h1a1 1 0 100-2v-3a1 1 0 00-1-1H9z" clip-rule="evenodd" />
</svg>
Data Source Information
</h3>
<p id="source-details" class="text-sm"></p>
</div>
<div id="dataset-description" class="prose dark:prose-invert prose-sm sm:prose max-w-none mb-6"></div>
<div id="dataset-preview" class="mb-6 overflow-x-auto">
<h3 class="text-xl font-semibold mb-3">Dataset Preview</h3>
<div id="preview-table" class="border dark:border-gray-700 rounded-lg overflow-hidden"></div>
</div>
<div id="generate-actions" class="mb-8">
<button id="generate-full-button" class="bg-primary text-white px-6 py-3 rounded-lg hover:bg-opacity-90 transition mr-3">
Generate Full Dataset
</button>
<div id="generate-status" class="hidden mt-4">
<div class="flex items-center">
<div class="animate-spin rounded-full h-5 w-5 border-b-2 border-primary mr-3"></div>
<span>Generating dataset... <span id="rows-count">0</span> rows created</span>
</div>
<div class="w-full bg-gray-200 dark:bg-gray-700 rounded-full h-2.5 mt-2">
<div id="progress-bar" class="bg-primary h-2.5 rounded-full" style="width: 0%"></div>
</div>
</div>
</div>
<div id="full-dataset" class="hidden mb-6">
<h3 class="text-xl font-semibold mb-3">Full Dataset</h3>
<div id="full-table" class="border dark:border-gray-700 rounded-lg overflow-hidden"></div>
<div class="mt-4 flex flex-wrap gap-3">
<button id="download-csv-button" class="bg-green-600 hover:bg-green-700 text-white px-4 py-2 rounded-lg transition flex items-center">
<svg xmlns="http://www.w3.org/2000/svg" class="h-5 w-5 mr-2" viewBox="0 0 20 20" fill="currentColor">
<path fill-rule="evenodd" d="M3 17a1 1 0 011-1h12a1 1 0 110 2H4a1 1 0 01-1-1zm3.293-7.707a1 1 0 011.414 0L9 10.586V3a1 1 0 112 0v7.586l1.293-1.293a1 1 0 111.414 1.414l-3 3a1 1 0 01-1.414 0l-3-3a1 1 0 010-1.414z" clip-rule="evenodd" />
</svg>
Download CSV
</button>
<button id="download-json-button" class="bg-yellow-600 hover:bg-yellow-700 text-white px-4 py-2 rounded-lg transition flex items-center">
<svg xmlns="http://www.w3.org/2000/svg" class="h-5 w-5 mr-2" viewBox="0 0 20 20" fill="currentColor">
<path fill-rule="evenodd" d="M3 17a1 1 0 011-1h12a1 1 0 110 2H4a1 1 0 01-1-1zm3.293-7.707a1 1 0 011.414 0L9 10.586V3a1 1 0 112 0v7.586l1.293-1.293a1 1 0 111.414 1.414l-3 3a1 1 0 01-1.414 0l-3-3a1 1 0 010-1.414z" clip-rule="evenodd" />
</svg>
Download JSON
</button>
<button id="download-parquet-button" class="bg-blue-600 hover:bg-blue-700 text-white px-4 py-2 rounded-lg transition flex items-center">
<svg xmlns="http://www.w3.org/2000/svg" class="h-5 w-5 mr-2" viewBox="0 0 20 20" fill="currentColor">
<path fill-rule="evenodd" d="M3 17a1 1 0 011-1h12a1 1 0 110 2H4a1 1 0 01-1-1zm3.293-7.707a1 1 0 011.414 0L9 10.586V3a1 1 0 112 0v7.586l1.293-1.293a1 1 0 111.414 1.414l-3 3a1 1 0 01-1.414 0l-3-3a1 1 0 010-1.414z" clip-rule="evenodd" />
</svg>
Download Parquet
</button>
</div>
</div>
</div>
</main>
<!-- Footer -->
<footer class="mt-12 text-center text-sm text-gray-600 dark:text-gray-400">
<p>Powered by Claude-3.7-Sonnet • Datasets generated from real sources and AI</p>
</footer>
</div>
<script>
// Constants and global state
const MAX_DATASETS_PER_PAGE = 10;
const MAX_FULL_DATASET_ROWS = 100;
// List of search engines
const searchEngines = [
"AlltheInternet.com", "DuckDuckGo.com", "Google.com", "Bing.com", "Search.Yahoo.com",
"Startpage.com", "Qwant.com", "Ecosia.org", "WolframAlpha.com", "Mojeek.co.uk",
"Search.Brave.com", "Yandex.com", "Baidu.com", "Gibiru.com", "MetaGer.org",
"Swisscows.com", "Presearch.com", "Ekoru.org", "Search.Lilo.org"
];
let currentDatasets = [];
let currentPage = 1;
let currentSearchQuery = '';
let currentDataset = null;
let fullDatasetRows = [];
let useRealData = true;
let selectedEngines = ["DuckDuckGo.com", "Bing.com", "Search.Yahoo.com", "Search.Brave.com", "Ecosia.org"];
let currentEngine = ""; // Store the engine currently being used
// DOM Elements
const searchInput = document.getElementById('search-input');
const searchButton = document.getElementById('search-button');
const resultsContainer = document.getElementById('dataset-results');
const loadMoreContainer = document.getElementById('load-more-container');
const loadMoreButton = document.getElementById('load-more-button');
const searchPage = document.getElementById('search-page');
const datasetPage = document.getElementById('dataset-page');
const backButton = document.getElementById('back-button');
const datasetTitle = document.getElementById('dataset-title');
const datasetTags = document.getElementById('dataset-tags');
const datasetDescription = document.getElementById('dataset-description');
const previewTable = document.getElementById('preview-table');
const generateFullButton = document.getElementById('generate-full-button');
const generateStatus = document.getElementById('generate-status');
const rowsCount = document.getElementById('rows-count');
const progressBar = document.getElementById('progress-bar');
const fullDatasetSection = document.getElementById('full-dataset');
const fullTable = document.getElementById('full-table');
const downloadCsvButton = document.getElementById('download-csv-button');
const downloadJsonButton = document.getElementById('download-json-button');
const downloadParquetButton = document.getElementById('download-parquet-button');
const dataSourceToggle = document.getElementById('data-source-toggle');
const dataSourceText = document.getElementById('data-source-text');
const dataSourceBadge = document.getElementById('data-source-badge');
const sourceDetails = document.getElementById('source-details');
const engineSettingsButton = document.getElementById('engine-settings-button');
const engineModal = document.getElementById('engine-modal');
const engineOptions = document.getElementById('engine-options');
const closeModalButton = document.getElementById('close-modal-button');
const saveEnginesButton = document.getElementById('save-engines-button');
const selectAllEngines = document.getElementById('select-all-engines');
const deselectAllEngines = document.getElementById('deselect-all-engines');
// Event Listeners
document.addEventListener('DOMContentLoaded', () => {
searchButton.addEventListener('click', performSearch);
searchInput.addEventListener('keypress', (e) => {
if (e.key === 'Enter') performSearch();
});
loadMoreButton.addEventListener('click', loadMoreDatasets);
backButton.addEventListener('click', showSearchPage);
generateFullButton.addEventListener('click', generateFullDataset);
downloadCsvButton.addEventListener('click', () => downloadData('csv'));
downloadJsonButton.addEventListener('click', () => downloadData('json'));
downloadParquetButton.addEventListener('click', () => downloadData('parquet'));
dataSourceToggle.addEventListener('change', toggleDataSource);
engineSettingsButton.addEventListener('click', showEngineModal);
closeModalButton.addEventListener('click', hideEngineModal);
saveEnginesButton.addEventListener('click', saveEngineSettings);
selectAllEngines.addEventListener('click', () => toggleAllEngines(true));
deselectAllEngines.addEventListener('click', () => toggleAllEngines(false));
// Initialize engine options
populateEngineOptions();
// Show initial placeholder datasets
showPlaceholderDatasets();
});
// Search Engine Settings
function populateEngineOptions() {
engineOptions.innerHTML = '';
searchEngines.forEach(engine => {
const isChecked = selectedEngines.includes(engine);
const optionDiv = document.createElement('div');
optionDiv.className = 'flex items-center';
optionDiv.innerHTML = `
<input type="checkbox" id="engine-${engine}" class="engine-checkbox mr-2 h-4 w-4"
value="${engine}" ${isChecked ? 'checked' : ''}>
<label for="engine-${engine}" class="cursor-pointer">${engine}</label>
`;
engineOptions.appendChild(optionDiv);
});
}
function showEngineModal() {
engineModal.classList.remove('hidden');
}
function hideEngineModal() {
engineModal.classList.add('hidden');
}
function saveEngineSettings() {
const checkboxes = document.querySelectorAll('.engine-checkbox:checked');
selectedEngines = Array.from(checkboxes).map(cb => cb.value);
if (selectedEngines.length === 0) {
// Ensure at least one engine is selected
selectedEngines = ["DuckDuckGo.com"];
document.getElementById(`engine-DuckDuckGo.com`).checked = true;
showNotification("At least one search engine must be selected. Using DuckDuckGo as default.");
}
hideEngineModal();
showNotification(`Updated search engine settings. Using ${selectedEngines.length} engines.`);
}
function toggleAllEngines(select) {
const checkboxes = document.querySelectorAll('.engine-checkbox');
checkboxes.forEach(cb => {
cb.checked = select;
});
}
// Toggle data source between real and AI
function toggleDataSource() {
useRealData = dataSourceToggle.checked;
dataSourceText.textContent = useRealData ? "Using: Real + AI Data" : "Using: AI Data Only";
// Show or hide engine settings button
engineSettingsButton.style.display = useRealData ? "flex" : "none";
showNotification(`Switched to ${useRealData ? "combined real and synthetic" : "synthetic-only"} data mode`);
}
// Search functionality
function performSearch() {
const query = searchInput.value.trim();
if (!query) return;
currentSearchQuery = query;
currentPage = 1;
currentDatasets = [];
resultsContainer.innerHTML = '';
showLoadingSkeletons();
if (useRealData) {
// Use real data from search engines + AI
searchWithRealData(query);
} else {
// Use only AI-generated data
searchWithAIData(query);
}
}
function searchWithRealData(query) {
// Randomly select a search engine from the user's selected engines
currentEngine = selectedEngines[Math.floor(Math.random() * selectedEngines.length)];
// Register handler for dataset names based on real search results
window.Poe.registerHandler("real-search-handler", (result) => {
if (result.status === "error") {
showError("Error querying search engines");
return;
}
const message = result.responses[0];
if (message.status === "complete") {
// Parse the dataset names and tags from the response
const datasets = parseDatasetResults(message.content);
datasets.forEach(dataset => {
dataset.isReal = true;
dataset.engine = currentEngine;
});
currentDatasets = datasets;
// Display the datasets
resultsContainer.innerHTML = '';
displayDatasets(datasets);
// Show load more button if we have results
if (datasets.length > 0) {
loadMoreContainer.classList.remove('hidden');
}
}
});
try {
window.Poe.sendUserMessage(
`@Claude-3.7-Sonnet You are a data specialist who can transform real search results into structured datasets.
A user is searching for data about: "${query}"
Imagine you've queried ${currentEngine} and received real search results. Create a list of 10 specific datasets that could be created from these search results.
For each dataset:
1. Give it a clear, specific name related to the search topic
2. Include 3-5 relevant tags in parentheses, with one tag specifying the ML task type (classification, regression, clustering, etc.)
Format each dataset as:
1. DatasetName (tag1, tag2, ml_task_tag)
Make these datasets sound like real collections that could be created from ${currentEngine} search results on "${query}".`,
{
handler: "real-search-handler",
stream: false,
openChat: false
}
);
} catch (err) {
showError("Error sending message: " + err);
// Fall back to AI data
searchWithAIData(query);
}
}
function searchWithAIData(query) {
// Register handler for AI-generated dataset names
window.Poe.registerHandler("dataset-search-handler", (result) => {
if (result.status === "error") {
showError("Error generating datasets");
return;
}
const message = result.responses[0];
if (message.status === "complete") {
// Parse the dataset names and tags from the response
const datasets = parseDatasetResults(message.content);
datasets.forEach(dataset => {
dataset.isReal = false;
});
currentDatasets = datasets;
// Display the datasets
resultsContainer.innerHTML = '';
displayDatasets(datasets);
// Show load more button if we have results
if (datasets.length > 0) {
loadMoreContainer.classList.remove('hidden');
}
}
});
try {
window.Poe.sendUserMessage(
`@Claude-3.7-Sonnet A Machine Learning Practitioner is looking for a dataset that matches '${query}'.
Generate a list of ${MAX_DATASETS_PER_PAGE} names of quality datasets that don't exist but sound plausible and would
be helpful. Feel free to reuse words from the query '${query}' to name the datasets.
Every dataset should be about '${query}' and have descriptive tags/keywords including the ML task name associated with the dataset (classification, regression, anomaly detection, etc.). Use the following format:
1. DatasetName1 (tag1, tag2, tag3)
2. DatasetName2 (tag1, tag2, tag3)`,
{
handler: "dataset-search-handler",
stream: false,
openChat: false
}
);
} catch (err) {
showError("Error sending message: " + err);
}
}
function parseDatasetResults(content) {
const lines = content.split('\n');
const datasets = [];
lines.forEach(line => {
// Match lines that start with a number followed by a period
const match = line.match(/^\s*\d+\.\s+(.+?)\s+\((.+?)\)/);
if (match) {
const name = match[1].trim();
const tags = match[2].split(',').map(tag => tag.trim());
datasets.push({ name, tags });
}
});
return datasets;
}
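// Illustrative example (not executed): given the line
//   "3. HealthMonitor (science, real-time, anomaly detection)"
// parseDatasetResults returns [{ name: "HealthMonitor", tags: ["science", "real-time", "anomaly detection"] }]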
function displayDatasets(datasets) {
datasets.forEach(dataset => {
const card = document.createElement('div');
card.className = 'dataset-card bg-white dark:bg-gray-800 rounded-lg p-4 border border-gray-200 dark:border-gray-700 cursor-pointer relative';
const tagsHtml = dataset.tags.map(tag =>
`<span class="inline-block bg-gray-100 dark:bg-gray-700 text-gray-800 dark:text-gray-300 text-xs px-2 py-1 rounded mr-1 mb-1">${tag}</span>`
).join('');
// Add a badge for real data
let badgeHtml = '';
if (dataset.isReal) {
badgeHtml = `<span class="engine-badge" title="Data from ${dataset.engine}">${dataset.engine.split('.')[0]}</span>`;
}
card.innerHTML = `
${badgeHtml}
<h3 class="text-lg font-semibold mb-2">${dataset.name}</h3>
<div class="flex flex-wrap mt-2">${tagsHtml}</div>
`;
card.addEventListener('click', () => showDatasetDetails(dataset));
resultsContainer.appendChild(card);
});
}
function showLoadingSkeletons() {
for (let i = 0; i < 4; i++) {
const skeleton = document.createElement('div');
skeleton.className = 'bg-white dark:bg-gray-800 rounded-lg p-4 border border-gray-200 dark:border-gray-700';
skeleton.innerHTML = `
<div class="shimmer h-6 w-3/4 mb-2"></div>
<div class="flex flex-wrap mt-2">
<div class="shimmer h-6 w-16 rounded mr-1 mb-1"></div>
<div class="shimmer h-6 w-20 rounded mr-1 mb-1"></div>
<div class="shimmer h-6 w-24 rounded mr-1 mb-1"></div>
</div>
`;
resultsContainer.appendChild(skeleton);
}
}
function loadMoreDatasets() {
currentPage++;
// Use the same data source (real or AI) as the initial search
if (useRealData) {
loadMoreRealDatasets();
} else {
loadMoreAIDatasets();
}
}
function loadMoreRealDatasets() {
// Rotate to a different search engine for variety
const previousEngine = currentEngine;
while (currentEngine === previousEngine && selectedEngines.length > 1) {
currentEngine = selectedEngines[Math.floor(Math.random() * selectedEngines.length)];
}
// Register handler for more datasets
window.Poe.registerHandler("more-real-datasets-handler", (result) => {
if (result.status === "error") {
showError("Error generating more datasets");
return;
}
const message = result.responses[0];
if (message.status === "complete") {
// Parse the dataset names and tags from the response
const datasets = parseDatasetResults(message.content);
datasets.forEach(dataset => {
dataset.isReal = true;
dataset.engine = currentEngine;
});
currentDatasets = [...currentDatasets, ...datasets];
// Display the datasets
displayDatasets(datasets);
}
});
try {
window.Poe.sendUserMessage(
`@Claude-3.7-Sonnet You're a data specialist who can transform real search results into structured datasets.
Continue our previous search for data about: "${currentSearchQuery}"
Now let's use a different search engine: ${currentEngine}
Create 10 more specific datasets that could be created from these search results. Make sure these are different from the previous datasets.
Use the same format:
1. DatasetName (tag1, tag2, ml_task_tag)
Make these datasets sound like real collections that could be created from ${currentEngine} search results on "${currentSearchQuery}".`,
{
handler: "more-real-datasets-handler",
stream: false,
openChat: false
}
);
} catch (err) {
showError("Error sending message: " + err);
// Fall back to AI data
loadMoreAIDatasets();
}
}
function loadMoreAIDatasets() {
// Register handler for more AI datasets
window.Poe.registerHandler("more-datasets-handler", (result) => {
if (result.status === "error") {
showError("Error generating more datasets");
return;
}
const message = result.responses[0];
if (message.status === "complete") {
// Parse the dataset names and tags from the response
const datasets = parseDatasetResults(message.content);
datasets.forEach(dataset => {
dataset.isReal = false;
});
currentDatasets = [...currentDatasets, ...datasets];
// Display the datasets
displayDatasets(datasets);
}
});
try {
window.Poe.sendUserMessage(
`@Claude-3.7-Sonnet Please generate ${MAX_DATASETS_PER_PAGE} more dataset names about '${currentSearchQuery}'. Use the same format as before:
1. DatasetName1 (tag1, tag2, tag3)
Make sure these are completely different from previous suggestions.`,
{
handler: "more-datasets-handler",
stream: false,
openChat: false
}
);
} catch (err) {
showError("Error sending message: " + err);
}
}
function showDatasetDetails(dataset) {
currentDataset = dataset;
searchPage.classList.add('hidden');
datasetPage.classList.remove('hidden');
// Update UI with dataset info
datasetTitle.textContent = dataset.name;
datasetTags.innerHTML = dataset.tags.map(tag =>
`<span class="inline-block bg-gray-100 dark:bg-gray-700 text-gray-800 dark:text-gray-300 text-xs px-2 py-1 rounded mr-1 mb-1">${tag}</span>`
).join('');
// Update source badge
if (dataset.isReal) {
dataSourceBadge.textContent = "Real Data";
dataSourceBadge.className = "px-3 py-1 rounded-full text-xs font-medium bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-200";
sourceDetails.innerHTML = `This dataset is based on real information queried from <strong>${dataset.engine}</strong> for the search term "<strong>${currentSearchQuery}</strong>". The data has been structured for machine learning use.`;
} else {
dataSourceBadge.textContent = "AI-Generated";
dataSourceBadge.className = "px-3 py-1 rounded-full text-xs font-medium bg-purple-100 text-purple-800 dark:bg-purple-900 dark:text-purple-200";
sourceDetails.innerHTML = `This is an AI-generated dataset created using Claude-3.7-Sonnet. The content is synthetic and designed to represent plausible data related to "${currentSearchQuery}".`;
}
// Clear previous content
datasetDescription.innerHTML = '<div class="shimmer h-4 w-full mb-2"></div>'.repeat(3);
previewTable.innerHTML = '';
fullDatasetSection.classList.add('hidden');
generateStatus.classList.add('hidden');
generateFullButton.disabled = false;
// Reset full dataset
fullDatasetRows = [];
// Generate dataset preview - different approach for real vs AI data
if (dataset.isReal) {
generateRealDatasetPreview(dataset);
} else {
generateAIDatasetPreview(dataset);
}
// Scroll to top
window.scrollTo(0, 0);
}
function generateRealDatasetPreview(dataset) {
window.Poe.registerHandler("real-preview-handler", (result) => {
if (result.status === "error") {
datasetDescription.innerHTML = '<p class="text-red-500">Error generating dataset preview</p>';
return;
}
const message = result.responses[0];
if (message.status === "complete") {
const content = message.content;
// Extract description and CSV
const parts = content.split('**CSV Content Preview:**');
let description = "";
let csvContent = "";
if (parts.length > 1) {
description = parts[0].replace('**Dataset Description:**', '').trim();
csvContent = parts[1].trim();
// Clean up CSV content (remove markdown code block markers)
csvContent = csvContent.replace(/```csv\n|```\n|```/g, '').trim();
} else {
description = "No description available";
csvContent = content;
}
// Display description
datasetDescription.innerHTML = marked.parse(description);
// Parse and display CSV preview
try {
const results = Papa.parse(csvContent, {
header: true,
skipEmptyLines: true
});
if (results.data && results.data.length > 0) {
// Create table from CSV data
createTable(previewTable, results.data, results.meta.fields);
} else {
previewTable.innerHTML = '<p class="p-4 text-red-500">No preview data available</p>';
}
} catch (err) {
previewTable.innerHTML = `<p class="p-4 text-red-500">Error parsing CSV: ${err.message}</p>`;
}
}
});
try {
const tagsStr = dataset.tags.join(', ');
window.Poe.sendUserMessage(
`@Claude-3.7-Sonnet You're a specialist in converting web search results into structured data.
Based on search results from ${dataset.engine} about "${currentSearchQuery}",
create a preview of the dataset "${dataset.name}" with tags "${tagsStr}".
First, write a detailed description of what this dataset contains, its structure, and how it was constructed from web search results.
Then, generate a realistic 5-row CSV preview that resembles data you might get if you scraped and structured real results from ${dataset.engine}.
Format your response with:
**Dataset Description:** [detailed description]
**CSV Content Preview:**
\`\`\`csv
[CSV header and 5 rows of realistic data]
\`\`\`
Include relevant columns for the dataset type, with proper labels/categories where appropriate. The data should look like it came from real sources.`,
{
handler: "real-preview-handler",
stream: false,
openChat: false
}
);
} catch (err) {
datasetDescription.innerHTML = `<p class="text-red-500">Error: ${err.message}</p>`;
}
}
function generateAIDatasetPreview(dataset) {
window.Poe.registerHandler("dataset-preview-handler", (result) => {
if (result.status === "error") {
datasetDescription.innerHTML = '<p class="text-red-500">Error generating dataset preview</p>';
return;
}
const message = result.responses[0];
if (message.status === "complete") {
const content = message.content;
// Extract description and CSV
const parts = content.split('**CSV Content Preview:**');
let description = "";
let csvContent = "";
if (parts.length > 1) {
description = parts[0].replace('**Dataset Description:**', '').trim();
csvContent = parts[1].trim();
// Clean up CSV content (remove markdown code block markers)
csvContent = csvContent.replace(/```csv\n|```\n|```/g, '').trim();
} else {
description = "No description available";
csvContent = content;
}
// Display description
datasetDescription.innerHTML = marked.parse(description);
// Parse and display CSV preview
try {
const results = Papa.parse(csvContent, {
header: true,
skipEmptyLines: true
});
if (results.data && results.data.length > 0) {
// Create table from CSV data
createTable(previewTable, results.data, results.meta.fields);
} else {
previewTable.innerHTML = '<p class="p-4 text-red-500">No preview data available</p>';
}
} catch (err) {
previewTable.innerHTML = `<p class="p-4 text-red-500">Error parsing CSV: ${err.message}</p>`;
}
}
});
try {
const tagsStr = dataset.tags.join(', ');
window.Poe.sendUserMessage(
`@Claude-3.7-Sonnet An ML practitioner is looking for a dataset CSV after the query '${currentSearchQuery}'.
Generate the first 5 rows of a plausible and quality CSV for the dataset '${dataset.name}'.
You can get inspiration from related keywords '${tagsStr}' but most importantly the dataset should correspond to the query '${currentSearchQuery}'.
Focus on quality text content and use a 'label' or 'labels' column if it makes sense (invent labels, avoid reusing the keywords, be accurate while labelling texts).
Reply using a short description of the dataset with title **Dataset Description:** followed by the CSV content in a code block and with title **CSV Content Preview:**`,
{
handler: "dataset-preview-handler",
stream: false,
openChat: false
}
);
} catch (err) {
datasetDescription.innerHTML = `<p class="text-red-500">Error: ${err.message}</p>`;
}
}
function createTable(container, data, headers) {
container.innerHTML = '';
const table = document.createElement('table');
table.className = 'w-full';
// Create header
const thead = document.createElement('thead');
const headerRow = document.createElement('tr');
headers.forEach(header => {
const th = document.createElement('th');
th.textContent = header;
headerRow.appendChild(th);
});
thead.appendChild(headerRow);
table.appendChild(thead);
// Create body
const tbody = document.createElement('tbody');
data.forEach(row => {
const tr = document.createElement('tr');
headers.forEach(header => {
const td = document.createElement('td');
td.textContent = row[header] || '';
tr.appendChild(td);
});
tbody.appendChild(tr);
});
table.appendChild(tbody);
container.appendChild(table);
}
function generateFullDataset() {
// Disable button and show status
generateFullButton.disabled = true;
generateStatus.classList.remove('hidden');
rowsCount.textContent = '0';
progressBar.style.width = '0%';
// Set up variables for tracking generation
let csvHeader = '';
const targetRows = MAX_FULL_DATASET_ROWS;
let currentRows = 0;
fullDatasetRows = [];
// Get the CSV header from the preview table
const previewHeaders = Array.from(previewTable.querySelectorAll('thead th')).map(th => th.textContent);
csvHeader = previewHeaders.join(',');
// Add initial rows from preview
const previewRows = Array.from(previewTable.querySelectorAll('tbody tr')).map(tr => {
const row = {};
Array.from(tr.querySelectorAll('td')).forEach((td, index) => {
row[previewHeaders[index]] = td.textContent;
});
return row;
});
fullDatasetRows = [...previewRows];
currentRows = previewRows.length;
updateGenerationProgress(currentRows, targetRows);
// Choose generation method based on dataset type
if (currentDataset.isReal) {
generateFullRealDataset(previewHeaders, csvHeader, currentRows, targetRows);
} else {
generateFullAIDataset(previewHeaders, csvHeader, currentRows, targetRows);
}
}
function generateFullRealDataset(previewHeaders, csvHeader, currentRows, targetRows) {
// Function to generate more rows in batches from "real" search results
const generateBatch = (batchIndex) => {
const batchSize = 15; // Larger batches for efficiency
const startRow = currentRows + batchIndex * batchSize;
if (startRow >= targetRows) {
// We've reached the target, show the full dataset
showFullDataset();
return;
}
window.Poe.registerHandler(`real-batch-${batchIndex}-handler`, (result) => {
if (result.status === "error") {
showError("Error generating dataset rows");
return;
}
const message = result.responses[0];
if (message.status === "complete") {
const content = message.content;
// Extract CSV content (remove markdown code block markers)
let csvContent = content.replace(/```csv\n|```\n|```/g, '').trim();
// If the response contains fenced code blocks, extract the first one as the CSV
if (content.includes('```')) {
const codeBlocks = content.match(/```(?:csv)?\n([\s\S]*?)```/g) || [];
if (codeBlocks.length > 0) {
csvContent = codeBlocks[0].replace(/```(?:csv)?\n|```/g, '').trim();
}
}
try {
// Parse the CSV
const results = Papa.parse(csvContent, {
header: true,
skipEmptyLines: true
});
if (results.data && results.data.length > 0) {
// Add the new rows
fullDatasetRows = [...fullDatasetRows, ...results.data];
currentRows += results.data.length;
// Update progress
updateGenerationProgress(currentRows, targetRows);
// Generate next batch
generateBatch(batchIndex + 1);
} else {
// Try again with a different prompt
generateBatch(batchIndex);
}
} catch (err) {
console.error("Error parsing CSV:", err);
// Try again
generateBatch(batchIndex);
}
}
});
try {
// For variation, rotate through engines for each batch
const engineForBatch = selectedEngines[batchIndex % selectedEngines.length] || currentDataset.engine;
window.Poe.sendUserMessage(
`@Claude-3.7-Sonnet You're expanding a dataset based on search results from ${engineForBatch}.
For the dataset "${currentDataset.name}" about "${currentSearchQuery}", please generate ${batchSize} more rows of data.
Use this exact CSV header: ${csvHeader}
The data should look realistic, as if it came from actual ${engineForBatch} search results for "${currentSearchQuery}".
Include appropriate values for each field, maintaining the same patterns and types as seen in the existing data.
Only include the CSV data in your response (header + ${batchSize} rows), no explanations or additional text.`,
{
handler: `real-batch-${batchIndex}-handler`,
stream: false,
openChat: false
}
);
} catch (err) {
showError("Error sending message: " + err);
}
};
// Start generating batches
generateBatch(0);
}
function generateFullAIDataset(previewHeaders, csvHeader, currentRows, targetRows) {
// Function to generate more rows in batches from AI
const generateBatch = (batchIndex) => {
const batchSize = 10;
const startRow = currentRows + batchIndex * batchSize;
if (startRow >= targetRows) {
// We've reached the target, show the full dataset
showFullDataset();
return;
}
window.Poe.registerHandler(`batch-${batchIndex}-handler`, (result) => {
if (result.status === "error") {
showError("Error generating dataset rows");
return;
}
const message = result.responses[0];
if (message.status === "complete") {
const content = message.content;
// Extract CSV content (remove markdown code block markers)
let csvContent = content.replace(/```csv\n|```\n|```/g, '').trim();
// If the response contains fenced code blocks, extract the first one as the CSV
if (content.includes('```')) {
const codeBlocks = content.match(/```(?:csv)?\n([\s\S]*?)```/g) || [];
if (codeBlocks.length > 0) {
csvContent = codeBlocks[0].replace(/```(?:csv)?\n|```/g, '').trim();
}
}
try {
// Parse the CSV
const results = Papa.parse(csvContent, {
header: true,
skipEmptyLines: true
});
if (results.data && results.data.length > 0) {
// Add the new rows
fullDatasetRows = [...fullDatasetRows, ...results.data];
currentRows += results.data.length;
// Update progress
updateGenerationProgress(currentRows, targetRows);
// Generate next batch
generateBatch(batchIndex + 1);
} else {
// Try again with a different prompt
generateBatch(batchIndex);
}
} catch (err) {
console.error("Error parsing CSV:", err);
// Try again
generateBatch(batchIndex);
}
}
});
try {
const tagsStr = currentDataset.tags.join(', ');
window.Poe.sendUserMessage(
`@Claude-3.7-Sonnet For the dataset '${currentDataset.name}' about '${currentSearchQuery}' with tags '${tagsStr}',
please generate ${batchSize} more sample rows in CSV format. Use the same CSV header: ${csvHeader}
Only include the CSV data in your response, no explanations or additional text.`,
{
handler: `batch-${batchIndex}-handler`,
stream: false,
openChat: false
}
);
} catch (err) {
showError("Error sending message: " + err);
}
};
// Start generating batches
generateBatch(0);
}
function updateGenerationProgress(current, total) {
rowsCount.textContent = current;
const percentage = Math.min(100, Math.floor((current / total) * 100));
progressBar.style.width = `${percentage}%`;
}
function showFullDataset() {
// Hide generation status
generateStatus.classList.add('hidden');
// Show full dataset section
fullDatasetSection.classList.remove('hidden');
// Get headers from the data
const headers = Object.keys(fullDatasetRows[0] || {});
// Create and display the table
createTable(fullTable, fullDatasetRows.slice(0, 10), headers);
// Add a note about showing limited rows
const note = document.createElement('p');
note.className = 'text-sm text-gray-600 dark:text-gray-400 mt-2';
note.textContent = `Showing 10 of ${fullDatasetRows.length} rows. Use the download buttons to get the complete dataset.`;
fullTable.appendChild(note);
}
function downloadData(format) {
if (fullDatasetRows.length === 0) return;
const filename = `${currentDataset.name.replace(/\s+/g, '_')}_dataset`;
switch(format) {
case 'csv':
downloadCsv(filename);
break;
case 'json':
downloadJson(filename);
break;
case 'parquet':
// Show a notification that this format is simulated
showNotification("Parquet format download simulated - actual conversion would require a server component");
downloadJson(filename + "_parquet_simulated");
break;
}
}
function downloadCsv(filename) {
// Convert data to CSV
const csv = Papa.unparse(fullDatasetRows);
// Create a blob and download link
const blob = new Blob([csv], { type: 'text/csv' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `${filename}.csv`;
document.body.appendChild(a);
a.click();
// Clean up
setTimeout(() => {
document.body.removeChild(a);
URL.revokeObjectURL(url);
}, 100);
}
function downloadJson(filename) {
// Convert data to JSON
const json = JSON.stringify(fullDatasetRows, null, 2);
// Create a blob and download link
const blob = new Blob([json], { type: 'application/json' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `${filename}.json`;
document.body.appendChild(a);
a.click();
// Clean up
setTimeout(() => {
document.body.removeChild(a);
URL.revokeObjectURL(url);
}, 100);
}
function showSearchPage() {
searchPage.classList.remove('hidden');
datasetPage.classList.add('hidden');
}
function showError(message) {
console.error(message);
showNotification(message, true);
}
function showNotification(message, isError = false) {
const notification = document.createElement('div');
notification.className = `fixed bottom-4 right-4 px-6 py-3 rounded-lg shadow-lg ${
isError
? 'bg-red-500 text-white'
: 'bg-green-500 text-white'
} z-50 transition-opacity duration-300`;
notification.textContent = message;
document.body.appendChild(notification);
setTimeout(() => {
notification.style.opacity = '0';
setTimeout(() => {
document.body.removeChild(notification);
}, 300);
}, 3000);
}
function showPlaceholderDatasets() {
const placeholders = [
{
name: "NewsEventsPredict",
tags: ["classification", "media", "trend"],
isReal: true,
engine: "AlltheInternet.com"
},
{
name: "FinancialForecast",
tags: ["economy", "stocks", "regression"],
isReal: false
},
{
name: "HealthMonitor",
tags: ["science", "real-time", "anomaly detection"],
isReal: true,
engine: "DuckDuckGo.com"
},
{
name: "SportsAnalysis",
tags: ["classification", "performance", "player tracking"],
isReal: false
},
{
name: "RetailSalesAnalyzer",
tags: ["consumer behavior", "sales trend", "segmentation"],
isReal: true,
engine: "Bing.com"
},
{
name: "SocialMediaSentiment",
tags: ["text classification", "opinion mining", "NLP"],
isReal: false
}
];
currentDatasets = placeholders;
displayDatasets(placeholders);
loadMoreContainer.classList.remove('hidden');
}
</script>
</body>
</html>
"""
# --- Gradio CSS ---
css = """
a { color: var(--body-text-color); }
.datasetButton { justify-content: start; justify-content: left; }
.tags { font-size: var(--button-small-text-size); color: var(--body-text-color-subdued); }
.topButton {
justify-content: start; justify-content: left; text-align: left; background: transparent;
box-shadow: none; padding-bottom: 0;
}
.topButton::before {
content: url("data:image/svg+xml,%3Csvg style='color: rgb(209 213 219)' xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink' aria-hidden='true' focusable='false' role='img' width='1em' height='1em' preserveAspectRatio='xMidYMid meet' viewBox='0 0 25 25'%3E%3Cellipse cx='12.5' cy='5' fill='currentColor' fill-opacity='0.25' rx='7.5' ry='2'%3E%3C/ellipse%3E%3Cpath d='M12.5 15C16.6421 15 20 14.1046 20 13V20C20 21.1046 16.6421 22 12.5 22C8.35786 22 5 21.1046 5 20V13C5 14.1046 8.35786 15 12.5 15Z' fill='currentColor' opacity='0.5'%3E%3C/path%3E%3Cpath d='M12.5 7C16.6421 7 20 6.10457 20 5V11.5C20 12.6046 16.6421 13.5 12.5 13.5C8.35786 13.5 5 12.6046 5 11.5V5C5 6.10457 8.35786 7 12.5 7Z' fill='currentColor' opacity='0.5'%3E%3C/path%3E%3Cpath d='M5.23628 12C5.08204 12.1598 5 12.8273 5 13C5 14.1046 8.35786 15 12.5 15C16.6421 15 20 14.1046 20 13C20 12.8273 19.918 12.1598 19.7637 12C18.9311 12.8626 15.9947 13.5 12.5 13.5C9.0053 13.5 6.06886 12.8626 5.23628 12Z' fill='currentColor'%3E%3C/path%3E%3C/svg%3E");
margin-right: .25rem; margin-left: -.125rem; margin-top: .25rem;
}
.bottomButton {
justify-content: start; justify-content: left; text-align: left; background: transparent;
box-shadow: none; font-size: var(--button-small-text-size); color: var(--body-text-color-subdued);
padding-top: 0; align-items: baseline;
}
.bottomButton::before { content: 'tags:'; margin-right: .25rem; }
.buttonsGroup { background: transparent; }
.buttonsGroup:hover { background: var(--input-background-fill); }
.buttonsGroup div { background: transparent; }
.insivibleButtonGroup { display: none; }
@keyframes placeHolderShimmer { 0%{ background-position: -468px 0 } 100%{ background-position: 468px 0 } }
.linear-background {
animation-duration: 1s; animation-fill-mode: forwards; animation-iteration-count: infinite;
animation-name: placeHolderShimmer; animation-timing-function: linear;
background-image: linear-gradient(to right, var(--body-text-color-subdued) 8%, #dddddd11 18%, var(--body-text-color-subdued) 33%);
background-size: 1000px 104px; color: transparent; background-clip: text;
}
.settings { background: transparent; }
.settings button span { color: var(--body-text-color-subdued); }
"""
# --- Knowledge Base ---
class KnowledgeBase:
"""Manages known entities (materials, colors) and patterns for data refinement."""
def __init__(self):
self.materials: Set[str] = {'Metal', 'Wood', 'Plastic', 'Aluminum', 'Bronze', 'Steel', 'Glass', 'Leather', 'Fabric'}
self.colors: Set[str] = {'Red', 'Black', 'White', 'Silver', 'Bronze', 'Yellow', 'Blue', 'Green', 'Gray', 'Brown'}
self.patterns: Dict[str, List[str]] = {}
self.source_data: Dict[str, Any] = {}
def load_source(self, source_type: str, source_path: str) -> None:
"""Loads data from various sources and extracts knowledge."""
try:
if source_type == 'csv_url':
response = requests.get(source_path, timeout=10)
response.raise_for_status()
df = pd.read_csv(io.StringIO(response.text))
elif source_type == 'xlsx_url':
response = requests.get(source_path, timeout=10)
response.raise_for_status()
df = pd.read_excel(io.BytesIO(response.content))
elif source_type == 'local_csv':
df = pd.read_csv(source_path)
elif source_type == 'local_xlsx':
df = pd.read_excel(source_path)
else:
raise ValueError(f"Unsupported source type: {source_type}")
self._extract_knowledge(df)
self.source_data[source_path] = df.to_dict('records')
except requests.exceptions.RequestException as e:
raise ConnectionError(f"Failed to fetch data from URL: {e}")
except ValueError as e: raise e
except Exception as e:
raise RuntimeError(f"Error loading source {source_path}: {str(e)}")
def _extract_knowledge(self, df: pd.DataFrame) -> None:
"""Extracts known materials, colors, and column patterns."""
for column in df.columns:
if 'material' in column.lower():
values = df[column].dropna().unique()
self.materials.update(v.title() for v in values if isinstance(v, str))
elif 'color' in column.lower():
values = df[column].dropna().unique()
self.colors.update(v.title() for v in values if isinstance(v, str))
if df[column].dtype == 'object': # Store string patterns for fuzzy matching
patterns = df[column].dropna().astype(str).tolist()
self.patterns[column] = patterns
def get_closest_match(self, value: str, field_type: str) -> Optional[str]:
"""Finds the closest known value (material or color) for fuzzy matching."""
known_values = getattr(self, field_type + 's', set())
if not known_values: return None
matches = get_close_matches(value.title(), list(known_values), n=1, cutoff=0.8)
return matches[0] if matches else None
knowledge_base = KnowledgeBase() # Global instance for refinement
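# Illustrative usage (not executed): fuzzy-match noisy values against the known vocabularies.
# knowledge_base.get_closest_match("alumnium", "material")  # -> "Aluminum" (difflib, cutoff 0.8)
# knowledge_base.get_closest_match("turquoise", "color")    # -> None (no sufficiently close known color)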
# --- Data Refinement Utilities ---
def split_compound_field(field: str) -> List[str]:
"""Splits strings like 'Red, Blue' into ['Red', 'Blue']."""
parts = re.split(r'[,;\n]+', field)
return list(set(p.strip().title() for p in parts if p.strip()))
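# Illustrative example (not executed):
# split_compound_field("red, black; metal")  # -> ["Red", "Black", "Metal"] (deduplicated; order may vary
# because the parts are round-tripped through a set)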
def normalize_value(value: Any, field_name: str, mode: str = 'sourceless', kb: Optional[KnowledgeBase] = None) -> Any:
"""Normalizes a single data value based on field name and refinement mode."""
if not isinstance(value, str): return value
value = re.sub(r'\s+', ' ', value.strip()) # Normalize whitespace
value = value.replace('_', ' ') # Replace underscores
# Field-specific normalization logic
if any(term in field_name.lower() for term in ['material']):
parts = split_compound_field(value)
if mode == 'sourced' and kb:
known = [kb.get_closest_match(p, 'material') or p.title() for p in parts]
else:
known = [m for m in parts if m in kb.materials] if kb else parts
return known[0] if len(known) == 1 else known
elif any(term in field_name.lower() for term in ['color']):
parts = split_compound_field(value)
if mode == 'sourced' and kb:
known = [kb.get_closest_match(p, 'color') or p.title() for p in parts]
else:
known = [c for c in parts if c in kb.colors] if kb else parts
return known[0] if len(known) == 1 else known
elif any(term in field_name.lower() for term in ['date', 'time']): return value # Placeholder
elif any(term in field_name.lower() for term in ['type', 'status', 'category', 'description']):
return value.title() # Title case for descriptive fields
return value
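# Illustrative examples (not executed), assuming the default knowledge_base defined above:
# normalize_value("steel, glass", "primary_material", kb=knowledge_base)  # -> ["Steel", "Glass"] (order may vary)
# normalize_value("pending approval", "status")                           # -> "Pending Approval" (title-cased)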
def clean_record(record: Dict[str, Any], mode: str = 'sourceless', kb: Optional[KnowledgeBase] = None) -> Dict[str, Any]:
"""Cleans and normalizes a single record, handling nesting and compound fields."""
cleaned = {}
compound_fields_to_split = {}
# Pass 1: Normalize values and identify compound fields
for key, value in record.items():
clean_key = key.strip().lower().replace(" ", "_")
if isinstance(value, str): # Detect potential compound fields
for material in knowledge_base.materials:
if material.lower() in value.lower():
compound_fields_to_split[clean_key] = value
break
# Recursively clean nested structures
if isinstance(value, list):
cleaned[clean_key] = [normalize_value(v, clean_key, mode, kb) for v in value]
elif isinstance(value, dict):
cleaned[clean_key] = clean_record(value, mode, kb)
else:
cleaned[clean_key] = normalize_value(value, clean_key, mode, kb)
# Pass 2: Split identified compound fields
for key, value in compound_fields_to_split.items():
parts = split_compound_field(value)
materials = [p for p in parts if p in knowledge_base.materials]
if materials:
cleaned['material'] = materials[0] if len(materials) == 1 else materials
remaining = [p for p in parts if p not in materials]
if remaining: cleaned['condition'] = ' '.join(remaining)
elif key not in cleaned: # If not processed and no known materials found
cleaned[key] = value
return cleaned
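# Illustrative example (not executed), using the module-level knowledge_base:
# clean_record({"Item Material": "Steel, scratched"}, kb=knowledge_base)
#   # roughly -> {"item_material": "Steel", "material": "Steel", "condition": "Scratched"}
# (keys are lowercased and underscored; compound values are split into known materials plus a 'condition' remainder)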
def refine_data_generic(dataset: List[Dict[str, Any]], mode: str = 'sourceless', kb: Optional[KnowledgeBase] = None) -> List[Dict[str, Any]]:
"""Applies generic data refinement to a list of records, with optional knowledge base guidance."""
if mode == 'sourced' and kb and kb.patterns: # Apply fuzzy matching if sourced
for record in dataset:
for field, patterns in kb.patterns.items():
if field in record and isinstance(record[field], str):
value = str(record[field])
matches = get_close_matches(value, patterns, n=1, cutoff=0.8)
if matches: record[field] = matches[0]
return [clean_record(entry, mode, kb) for entry in dataset]
def refine_preview_data(df: pd.DataFrame, mode: str = 'sourceless') -> pd.DataFrame:
"""Refines the preview DataFrame based on the selected mode."""
# Remove common auto-generated index columns
cols_to_drop = []
for col_name, values in df.to_dict(orient="series").items():
try:
            if all(isinstance(v, int) and v == i for i, v in enumerate(values)): cols_to_drop.append(col_name)
            elif all(isinstance(v, int) and v == i + 1 for i, v in enumerate(values)): cols_to_drop.append(col_name)
except Exception: pass # Ignore non-sequential columns
if cols_to_drop: df = df.drop(columns=cols_to_drop)
records = df.to_dict('records')
refined_records = refine_data_generic(records, mode=mode, kb=knowledge_base)
return pd.DataFrame(refined_records)
def detect_anomalies(record: Dict[str, Any]) -> List[str]:
"""Detects potential data quality issues (e.g., verbosity, missing values)."""
flags = []
for k, v in record.items():
if isinstance(v, str):
if len(v) > 300: flags.append(f"{k}: Too verbose.")
if v.lower() in ['n/a', 'none', 'undefined', 'null', '']: flags.append(f"{k}: Missing value.")
return flags
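# Illustrative example (not executed):
# detect_anomalies({"text": "A short review.", "label": "n/a"})  # -> ["label: Missing value."]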
def parse_preview_df(content: str) -> tuple[str, pd.DataFrame]:
"""Extracts CSV from response, parses, refines, and adds quality flags."""
csv_lines = []
in_csv_block = False
for line in content.split("\n"): # Extract lines within CSV code blocks
if line.strip().startswith("```") and not in_csv_block: in_csv_block = True; continue # Opening fence
if line.strip().startswith("```") and in_csv_block: in_csv_block = False; continue # Closing fence
if in_csv_block: csv_lines.append(line)
csv_content = "\n".join(csv_lines)
if not csv_content: raise ValueError("No CSV content found.")
csv_header = csv_content.split("\n")[0] if csv_content else ""
df = parse_csv_df(csv_content)
refined_df = refine_preview_data(df, mode='sourceless') # Initial refinement
# Add quality flags
refined_records = refined_df.to_dict('records')
for record in refined_records:
flags = detect_anomalies(record)
if flags: record['_quality_flags'] = flags
return csv_header, pd.DataFrame(refined_records)
def parse_csv_df(csv: str, csv_header: Optional[str] = None) -> pd.DataFrame:
"""Safely parses CSV data using pandas with error handling and common fixes."""
csv = re.sub(r'''(?!")$$(["'][\w\s]+["'][, ]*)+$$(?!")''', lambda m: '"' + m.group(0).replace('"', "'") + '"', csv) # Fix unquoted lists
if csv_header and csv.strip() and not csv.strip().startswith(csv_header.split(',')[0]): csv = csv_header + "\n" + csv # Prepend header if missing
try: return pd.read_csv(io.StringIO(csv), skipinitialspace=True)
except Exception as e: raise ValueError(f"Pandas CSV parsing error: {e}")
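# Illustrative example for parse_csv_df: a payload missing its header is repaired before parsing.
#   parse_csv_df("1,positive\n2,negative", csv_header="id,label")
#   -> DataFrame with columns ['id', 'label'] and two rows.
# The regex also wraps unquoted bracketed lists such as ['a', 'b'] in double quotes
# so pandas treats each list as a single field.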
# --- LLM Interaction Utilities ---
T = TypeVar("T")
def batched(it: Iterable[T], n: int) -> Iterator[list[T]]:
"""Yields chunks of size n from an iterable."""
it = iter(it)
while batch := list(islice(it, n)): yield batch
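# Illustrative example: batched(range(5), 2) yields [0, 1], [2, 3], then [4].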
def stream_response(msg: str, history: list[Dict[str, str]] = [], max_tokens=500) -> Iterator[str]:
"""Streams responses from the LLM client with retry logic."""
messages = [{"role": m["role"], "content": m["content"]} for m in history]
messages.append({"role": "user", "content": msg})
for attempt in range(3): # Retry mechanism
try:
for chunk in client.chat_completion(messages=messages, max_tokens=max_tokens, stream=True, top_p=0.8, seed=42):
content = chunk.choices[0].delta.content
if content: yield content
break # Success
except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
print(f"LLM connection error (attempt {attempt+1}): {e}. Retrying in {2**attempt}s...")
time.sleep(2**attempt)
except Exception as e:
print(f"Unexpected LLM error (attempt {attempt+1}): {e}. Retrying...")
time.sleep(2**attempt)
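# Illustrative usage (assumes the inference endpoint is reachable):
#   reply = "".join(stream_response("Suggest a dataset name about solar power"))
# Tokens are yielded as they arrive; connection errors trigger up to three retries
# with exponential backoff, after which the generator simply ends.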
def generate_dataset_names(search_query: str, history: list[Dict[str, str]], is_real_data: bool = False, engine: Optional[str] = None) -> Iterator[str]:
"""Generates dataset names based on a search query using the LLM."""
query = search_query[:1000] if search_query else ""
if is_real_data and engine:
prompt = (
f"@Claude-3.7-Sonnet You are a data specialist who can transform real search results into structured datasets. "
f"A user is searching for data about: \"{query}\" "
f"Imagine you've queried {engine} and received real search results. Create a list of {MAX_NB_ITEMS_PER_GENERATION_CALL} specific datasets that could be created from these search results. "
f"For each dataset: 1. Give it a clear, specific name related to the search topic. 2. Include 3-5 relevant tags in parentheses, with one tag specifying the ML task type (classification, regression, clustering, etc.). "
f"Format each dataset as: 1. DatasetName (tag1, tag2, ml_task_tag). Make these datasets sound like real collections that could be created from {engine} search results on \"{query}\"."
)
else:
prompt = GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY.format(search_query=query)
full_response = ""
for token in stream_response(prompt, history):
full_response += token
yield token # Yield tokens for real-time display
print(f"Generated dataset names for query '{search_query}'.")
history.append({"role": "assistant", "content": full_response}) # Update history
# No return needed as history is modified in place
def generate_dataset_content(search_query: str, dataset_name: str, tags: str, history: list[Dict[str, str]], is_real_data: bool = False, engine: Optional[str] = None) -> Iterator[str]:
"""Generates the description and CSV preview for a dataset."""
query = search_query[:1000] if search_query else ""
if is_real_data and engine:
prompt = (
f"@Claude-3.7-Sonnet You're a specialist in converting web search results into structured data. "
f"Based on search results from {engine} about \"{query}\", create a preview of the dataset \"{dataset_name}\" with tags \"{tags}\". "
f"First, write a detailed description of what this dataset contains, its structure, and how it was constructed from web search results. "
f"Then, generate a realistic 5-row CSV preview that resembles data you might get if you scraped and structured real results from {engine}. "
f"Format your response with: **Dataset Description:** [detailed description] **CSV Content Preview:** ```csv [CSV header and 5 rows of realistic data] ``` "
f"Include relevant columns for the dataset type, with proper labels/categories where appropriate. The data should look like it came from real sources."
)
else:
prompt = GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(
search_query=query, dataset_name=dataset_name, tags=tags
)
full_response = ""
for token in stream_response(prompt, history):
full_response += token
yield token
print(f"Generated content for dataset '{dataset_name}'.")
history.append({"role": "assistant", "content": full_response}) # Update history
def _write_generator_to_queue(queue: Queue, func: Callable, kwargs: dict) -> None:
"""Helper to run a generator and put results (or errors) into a queue."""
try:
for i, result in enumerate(func(**kwargs)): queue.put((i, result))
except Exception as e: queue.put((-1, str(e))) # Signal error with index -1
finally: queue.put(None) # Signal completion
def iflatmap_unordered(func: Callable, kwargs_iterable: Iterable[dict]) -> Iterable[Any]:
"""Runs generator functions concurrently and yields results as they complete."""
queue = Queue()
kwargs_list = list(kwargs_iterable) # Materialize so the iterable can be sized and iterated safely
pool_size = max(1, min(len(kwargs_list), os.cpu_count() or 4)) # Avoid ThreadPool(0) when there are no tasks
with ThreadPool(pool_size) as pool:
async_results = [pool.apply_async(_write_generator_to_queue, (queue, func, kwargs)) for kwargs in kwargs_list]
completed_generators = 0
while completed_generators < len(async_results):
try:
result = queue.get(timeout=0.1)
if result is None: # Generator finished
completed_generators += 1
continue
index, data = result
if index == -1: # Error occurred
print(f"Generator error: {data}")
continue # Skip this result
yield data # Yield successful result
except Empty: # Timeout occurred, check if all threads are done
if all(res.ready() for res in async_results) and queue.empty(): break
for res in async_results: res.get(timeout=0.1) # Ensure threads finish and raise exceptions
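# Illustrative usage: run several generators concurrently and consume their items
# as they complete (results from different generators interleave in arbitrary order).
#   def count(n):
#       yield from range(n)
#   list(iflatmap_unordered(count, [{"n": 2}, {"n": 3}]))  # e.g. [0, 0, 1, 1, 2]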
def generate_partial_dataset(
title: str, content: str, search_query: str, variant: str, csv_header: str,
output: list[Optional[dict]], indices_to_generate: list[int], history: list[Dict[str, str]],
is_real_data: bool = False, engine: Optional[str] = None
) -> Iterator[int]:
"""Generates a batch of dataset rows for a specific variant."""
dataset_name = title.split("\n")[0].strip("# ")
tags = title.split("tags:", 1)[1].strip() if "tags:" in title else ""
prompt = GENERATE_MORE_ROWS.format(csv_header=csv_header) + " " + variant
# Construct initial messages for context
initial_prompt = ""
if is_real_data and engine:
initial_prompt = (
f"@Claude-3.7-Sonnet You're a specialist in converting web search results into structured data. "
f"Based on search results from {engine} about \"{search_query}\", create a preview of the dataset \"{dataset_name}\" with tags \"{tags}\". "
f"First, write a detailed description of what this dataset contains, its structure, and how it was constructed from web search results. "
f"Then, generate a realistic 5-row CSV preview that resembles data you might get if you scraped and structured real results from {engine}. "
f"Format your response with: **Dataset Description:** [detailed description] **CSV Content Preview:** ```csv [CSV header and 5 rows of realistic data] ``` "
f"Include relevant columns for the dataset type, with proper labels/categories where appropriate. The data should look like it came from real sources."
)
else:
initial_prompt = GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(
search_query=search_query, dataset_name=dataset_name, tags=tags
)
messages = [
{"role": "user", "content": initial_prompt},
{"role": "assistant", "content": title + "\n\n" + content},
{"role": "user", "content": prompt},
]
generated_samples = 0
current_csv_chunk = ""
in_csv_block = False
for attempt in range(3): # Retry logic
try:
for chunk in client.chat_completion(messages=messages, max_tokens=1500, stream=True, top_p=0.8, seed=42):
token = chunk.choices[0].delta.content
if not token: continue
current_csv_chunk += token
# Detect CSV block start/end (a fence toggles the block state)
if token.strip().startswith("```") and not in_csv_block: # Opening fence
in_csv_block = True
continue
if token.strip().startswith("```") and in_csv_block: # Closing fence
in_csv_block = False
if current_csv_chunk.strip(): # Process accumulated chunk if block just ended
try:
temp_df = parse_csv_df(current_csv_chunk.strip(), csv_header=csv_header)
new_rows = temp_df.iloc[generated_samples:].to_dict('records')
for i, record in enumerate(new_rows):
if generated_samples >= len(indices_to_generate): break
refined_record = refine_data_generic([record])[0]
flags = detect_anomalies(refined_record)
if flags: refined_record['_quality_flags'] = flags
output_index = indices_to_generate[generated_samples]
if output_index < len(output):
output[output_index] = refined_record
generated_samples += 1
yield 1 # Signal progress
except ValueError as e: print(f"CSV parsing error: {e}")
except Exception as e: print(f"CSV chunk processing error: {e}")
finally: current_csv_chunk = "" # Reset chunk
continue
if in_csv_block: # Process incrementally if inside CSV block
try:
temp_df = parse_csv_df(current_csv_chunk.strip(), csv_header=csv_header)
new_rows = temp_df.iloc[generated_samples:].to_dict('records')
for i, record in enumerate(new_rows):
if generated_samples >= len(indices_to_generate): break
refined_record = refine_data_generic([record])[0]
flags = detect_anomalies(refined_record)
if flags: refined_record['_quality_flags'] = flags
output_index = indices_to_generate[generated_samples]
if output_index < len(output):
output[output_index] = refined_record
generated_samples += 1
yield 1
except ValueError: pass # CSV not complete
except Exception as e: print(f"Incremental CSV processing error: {e}")
if generated_samples >= len(indices_to_generate): break # Target reached
print(f"Retrying generation for variant '{variant}' (attempt {attempt+1})...")
time.sleep(2**attempt)
except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
print(f"Connection error (attempt {attempt+1}): {e}. Retrying...")
time.sleep(2**attempt)
except Exception as e:
print(f"Unexpected error (attempt {attempt+1}): {e}. Retrying...")
time.sleep(2**attempt)
def generate_variants(preview_df: pd.DataFrame) -> Iterator[str]:
"""Generates diverse prompts for creating dataset variants."""
label_cols = [col for col in preview_df.columns if "label" in col.lower()]
labels = preview_df[label_cols[0]].unique() if label_cols and len(preview_df[label_cols[0]].unique()) > 1 else []
if labels: # Prioritize label-based generation
rarities = ["pretty obvious", "common/regular", "unexpected but useful", "uncommon but still plausible", "rare/niche but still plausible"]
for rarity in rarities:
for label in labels: yield GENERATE_VARIANTS_WITH_RARITY_AND_LABEL.format(rarity=rarity, label=label)
else: # Fallback to general rarity prompts
rarities = ["obvious", "expected", "common", "regular", "unexpected but useful", "original but useful", "specific but not far-fetched", "uncommon but still plausible", "rare but still plausible", "very niche but still plausible"]
for rarity in rarities: yield GENERATE_VARIANTS_WITH_RARITY.format(rarity=rarity)
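# Illustrative example: for a preview whose 'label' column contains {'spam', 'ham'},
# the first yielded prompt is of the form
# "Focus on generating samples for the label 'spam' and ideally generate pretty obvious samples."
# Without a label column, only the generic rarity prompts are yielded.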
# --- Gradio Interface ---
def whoami(token: str) -> Dict[str, Any]:
"""Fetches user information from Hugging Face Hub API."""
try:
response = requests.get("https://huggingface.co/api/users/me", headers={"Authorization": f"Bearer {token}"}, timeout=5)
response.raise_for_status()
return response.json()
except (requests.exceptions.RequestException, ValueError) as e:
print(f"Error fetching user info: {e}")
return {"name": "User", "orgs": []}
def get_repo_visibility(repo_id: str, token: str) -> str:
"""Determines if a Hugging Face repository is public or private."""
try:
response = requests.get(f"https://huggingface.co/api/repos/{repo_id}", headers={"Authorization": f"Bearer {token}"}, timeout=5)
response.raise_for_status()
return "public" if not response.json().get("private", False) else "private"
except HfHubHTTPError as e:
if e.response.status_code == 404: return "public" # Assume public if repo doesn't exist
print(f"Error checking repo visibility for {repo_id}: {e}")
return "public"
except Exception as e:
print(f"Unexpected error checking repo visibility for {repo_id}: {e}")
return "public"
with gr.Blocks(css=css) as demo:
generated_texts_state = gr.State((landing_page_datasets_generated_text,)) # State for generated dataset names
current_dataset_state = gr.State(None) # State to hold current dataset details for generation
is_real_data_state = gr.State(True) # State to track if real data is being used
current_engine_state = gr.State(None) # State to track the current search engine
selected_engines_state = gr.State(["DuckDuckGo.com", "Bing.com", "Search.Yahoo.com", "Search.Brave.com", "Ecosia.org"]) # Default selected engines
searchEngines = ["AlltheInternet.com", "DuckDuckGo.com", "Google.com", "Bing.com", "Search.Yahoo.com", "Startpage.com", "Qwant.com", "Ecosia.org", "WolframAlpha.com", "Mojeek.co.uk", "Search.Brave.com", "Yandex.com", "Baidu.com", "Gibiru.com", "MetaGer.org", "Swisscows.com", "Presearch.com", "Ekoru.org", "Search.Lilo.org"]
# --- Search Page UI ---
with gr.Column(visible=True, elem_id="search-page") as search_page:
gr.Markdown("# 🤗 Infinite Dataset Hub ♾️\n\nAn endless catalog of datasets, created just for you by an AI model.")
with gr.Row():
search_bar = gr.Textbox(max_lines=1, placeholder="Search datasets, get infinite results", show_label=False, container=False, scale=9)
search_button = gr.Button("🔍", variant="primary", scale=1)
button_groups: list[gr.Group] = [] # Holds the groups for dataset buttons
buttons: list[gr.Button] = [] # Holds the actual dataset name and tag buttons
for i in range(MAX_TOTAL_NB_ITEMS):
if i < len(default_output): # Use default datasets initially
line = default_output[i]
try: dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" (", 1)
except ValueError: dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" ", 1)[0], ""
group_classes, name_classes, tag_classes = "buttonsGroup", "topButton", "bottomButton"
else: # Placeholders for future datasets
dataset_name, tags = "⬜⬜⬜⬜⬜⬜", "░░░░, ░░░░, ░░░░"
group_classes, name_classes, tag_classes = "buttonsGroup insivibleButtonGroup", "topButton linear-background", "bottomButton linear-background"
with gr.Group(elem_classes=group_classes) as button_group:
button_groups.append(button_group)
dataset_btn = gr.Button(dataset_name, elem_classes=name_classes)
tags_btn = gr.Button(tags, elem_classes=tag_classes)
buttons.append(dataset_btn)
buttons.append(tags_btn)
load_more_datasets = gr.Button("Load more datasets")
gr.Markdown(f"_powered by [{model_id}](https://huggingface.co/{model_id})_")
# --- Settings Panel ---
with gr.Column(scale=4, min_width=200):
with gr.Accordion("Settings", open=False, elem_classes="settings"):
gr.Markdown("Manage your Hugging Face account and dataset saving options.")
gr.LoginButton()
select_namespace_dropdown = gr.Dropdown(choices=[NAMESPACE], value=NAMESPACE, label="Hugging Face Namespace", visible=False)
gr.Markdown("Dataset Generation Mode")
refinement_mode = gr.Radio(
["sourceless", "sourced"], value="sourceless", label="Refinement Mode",
info="Sourceless: AI generates data freely. Sourced: AI uses loaded data for context and refinement."
)
with gr.Group(visible=False) as source_group: # Dynamic section for source loading
source_type = gr.Dropdown(
choices=["csv_url", "xlsx_url", "local_csv", "local_xlsx"], value="csv_url",
label="Source Type", info="Select the format of your data source."
)
source_path = gr.Textbox(
label="Source Path/URL", placeholder="Enter URL or local file path",
info="Provide the location of your dataset file."
)
load_source_button = gr.Button("Load Source Data", icon="https://huggingface.co/datasets/huggingface/badges/resolve/main/badge-files/data.svg")
source_status = gr.Markdown("", visible=False)
visibility_radio = gr.Radio(
["public", "private"], value="public", container=False, interactive=False,
label="Dataset Visibility", info="Set visibility for datasets saved to Hugging Face Hub."
)
# Search Engine Settings
gr.Markdown("Search Engine Configuration")
data_source_toggle = gr.Checkbox(label="Use Real Search Data", value=True, info="Toggle to include results from real search engines.")
engine_settings_button = gr.Button("Configure Search Engines", icon="https://img.icons8.com/ios-filled/50/000000/settings--v1.png", size="sm")
# Engine Selection Modal
with gr.Modal("Search Engine Settings", id="engine-modal") as engine_modal:
gr.Markdown("Select which search engines to use for real data retrieval. A diverse selection improves results.")
engine_options_html_comp = gr.HTML(elem_id="engine-options")
with gr.Row():
select_all_engines_btn = gr.Button("Select All")
deselect_all_engines_btn = gr.Button("Deselect All")
save_engines_btn = gr.Button("Save Settings", variant="primary")
# --- Dataset Detail Page UI ---
with gr.Column(visible=False, elem_id="dataset-page") as dataset_page:
gr.Markdown("# 🤗 Infinite Dataset Hub ♾️\n\nAn endless catalog of datasets, created just for you.")
dataset_title_md = gr.Markdown() # Dataset name and tags
dataset_source_badge = gr.Markdown() # Badge indicating real/AI data
dataset_source_info = gr.Markdown() # Details about the data source
dataset_description_md = gr.Markdown() # Dataset description
preview_table_comp = gr.DataFrame(visible=False, interactive=False, wrap=True) # Holds the preview CSV
with gr.Row():
generate_full_dataset_button = gr.Button("Generate Full Dataset", variant="primary")
save_dataset_button = gr.Button("💾 Save Dataset", variant="primary", visible=False)
open_dataset_message = gr.Markdown("", visible=False) # Confirmation message
dataset_share_button = gr.Button("Share Dataset URL")
dataset_share_textbox = gr.Textbox(visible=False, show_copy_button=True, label="Copy this URL:", interactive=False, show_label=True)
full_dataset_section = gr.Column(visible=False) # Container for full dataset and downloads
full_table_comp = gr.DataFrame(visible=False, interactive=False, wrap=True)
with gr.Row():
download_csv_button = gr.Button("Download CSV")
download_json_button = gr.Button("Download JSON")
download_parquet_button = gr.Button("Download Parquet")
back_button = gr.Button("< Back", size="sm")
# --- Event Handlers ---
# Search Logic
def _update_search_results(search_query: str, current_generated_texts: tuple[str], is_real_data: bool, engine: Optional[str]):
"""Handles dataset search and UI updates."""
# Reset UI to loading state
yield {btn: gr.Button("⬜⬜⬜⬜⬜⬜", elem_classes="topButton linear-background") for btn in buttons[::2]}
yield {btn: gr.Button("░░░░, ░░░░, ░░░░", elem_classes="bottomButton linear-background") for btn in buttons[1::2]}
yield {group: gr.Group(elem_classes="buttonsGroup insivibleButtonGroup") for group in button_groups}
generated_count = 0
new_texts = ""
try:
# Generate dataset names from the LLM, buffering streamed tokens into whole lines
line_buffer = ""
for token in generate_dataset_names(search_query, [], is_real_data=is_real_data, engine=engine):
if "I'm sorry" in token or "policy" in token: raise gr.Error("Inappropriate content detected.")
line_buffer += token
while "\n" in line_buffer and generated_count < MAX_NB_ITEMS_PER_GENERATION_CALL:
line, line_buffer = line_buffer.split("\n", 1)
match = re.match(r"^\s*\d+\.\s+(.+?)\s+\((.+?)\)", line) # Parse "1. DatasetName (tag1, tag2)" lines
if not match: continue
dataset_name, tags = (part.strip() for part in match.groups())
new_texts += line + "\n"
# Update buttons with generated data
yield {
buttons[2 * generated_count]: gr.Button(dataset_name, elem_classes="topButton"),
buttons[2 * generated_count + 1]: gr.Button(tags, elem_classes="bottomButton"),
}
generated_count += 1
if generated_count >= MAX_NB_ITEMS_PER_GENERATION_CALL: break
# Update state and make new buttons visible
new_history = (current_generated_texts + (new_texts,)) if current_generated_texts else (landing_page_datasets_generated_text + "\n" + new_texts,)
yield {generated_texts_state: new_history}
yield {group: gr.Group(elem_classes="buttonsGroup") for group in button_groups[:generated_count]}
except gr.Error as e: raise e # Propagate Gradio errors
except Exception as e: raise gr.Error(f"Failed to generate datasets: {str(e)}")
# Attach search handlers
search_button.click(
_update_search_results,
inputs=[search_bar, generated_texts_state, is_real_data_state, current_engine_state],
outputs=buttons + [generated_texts_state] + button_groups
)
search_bar.submit(
_update_search_results,
inputs=[search_bar, generated_texts_state, is_real_data_state, current_engine_state],
outputs=buttons + [generated_texts_state] + button_groups
)
# Load More Datasets
load_more_datasets.click(
_update_search_results,
inputs=[search_bar, generated_texts_state, is_real_data_state, current_engine_state],
outputs=buttons + [generated_texts_state] + button_groups
)
# Display Single Dataset Details
def _show_dataset_details(search_query, dataset_name, tags, is_real_data, engine):
"""Switches to detail view and loads dataset content."""
yield {
search_page: gr.Column(visible=False), dataset_page: gr.Column(visible=True),
dataset_title_md: f"# {dataset_name}\n\n tags: {tags}",
dataset_share_textbox: gr.Textbox(visible=False),
full_dataset_section: gr.Column(visible=False),
save_dataset_button: gr.Button(visible=False),
open_dataset_message: gr.Markdown("", visible=False)
}
# Update source badge and info
if is_real_data:
badge_html = gr.Markdown(f'<span class="px-3 py-1 rounded-full text-xs font-medium bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-200">Real Data</span>', visible=True)
info_html = gr.Markdown(f'This dataset is based on real information queried from <strong>{engine}</strong> for the search term "<strong>{search_query}</strong>". The data has been structured for machine learning use.', visible=True)
else:
badge_html = gr.Markdown('<span class="px-3 py-1 rounded-full text-xs font-medium bg-purple-100 text-purple-800 dark:bg-purple-900 dark:text-purple-200">AI-Generated</span>', visible=True)
info_html = gr.Markdown(f'This is an AI-generated dataset created using {model_id}. The content is synthetic and designed to represent plausible data related to "{search_query}".', visible=True)
yield {dataset_source_badge: badge_html, dataset_source_info: info_html}
# Stream content generation, accumulating tokens so the Markdown shows the full text so far
full_content = ""
for content_chunk in generate_dataset_content(search_query, dataset_name, tags, [], is_real_data=is_real_data, engine=engine):
full_content += content_chunk
yield {dataset_description_md: full_content}
# Link buttons to the detail view function
def _show_dataset_from_button_wrapper(dataset_index, search_query, is_real_data, engine, *buttons_values):
"""Opens the detail view for the dataset whose buttons sit at the given index."""
dataset_name, tags = buttons_values[2 * dataset_index], buttons_values[2 * dataset_index + 1]
yield from _show_dataset_details(search_query, dataset_name, tags, is_real_data, engine if is_real_data else None)
# Wire up click events for all dataset name and tag buttons, binding each pair's index
for i, (name_btn, tag_btn) in enumerate(batched(buttons, 2)):
for btn in (name_btn, tag_btn):
btn.click(
partial(_show_dataset_from_button_wrapper, i),
inputs=[search_bar, is_real_data_state, current_engine_state, *buttons],
outputs=[search_page, dataset_page, dataset_title_md, dataset_description_md, dataset_source_badge, dataset_source_info, dataset_share_textbox, full_dataset_section, save_dataset_button, open_dataset_message]
)
# Back Button Navigation
back_button.click(lambda: (gr.Column(visible=True), gr.Column(visible=False)), outputs=[search_page, dataset_page], js="""
function() {
if ('parentIFrame' in window) { window.parentIFrame.scrollTo({top: 0, behavior:'smooth'}); }
else { window.scrollTo({ top: 0, behavior: 'smooth' }); }
return Array.from(arguments);
}
""")
# Full Dataset Generation
@generate_full_dataset_button.click(
inputs=[dataset_title_md, dataset_description_md, search_bar, select_namespace_dropdown, visibility_radio, refinement_mode, is_real_data_state, current_engine_state],
outputs=[full_table_comp, generate_full_dataset_button, save_dataset_button, full_dataset_section]
)
def _generate_full_dataset(title_md, content_md, search_query, namespace, visibility, mode, is_real_data, engine):
# Extract dataset name and tags from the markdown title
try:
dataset_name = title_md.split('\n')[0].strip('# ')
tags = title_md.split('tags:', 1)[1].strip()
except IndexError:
raise gr.Error("Could not parse dataset title.")
try: csv_header, preview_df = parse_preview_df(content_md)
except ValueError as e: raise gr.Error(f"Failed to parse preview: {e}")
refined_preview_df = refine_preview_data(preview_df, mode)
columns = list(refined_preview_df)
output_data: list[Optional[dict]] = [None] * NUM_ROWS # Initialize output structure
initial_rows = refined_preview_df.to_dict('records')
for i, record in enumerate(initial_rows):
if i < NUM_ROWS: output_data[i] = {"idx": i, **record}
# Update UI: show preview, disable generate, show save button
yield {
full_table_comp: gr.DataFrame(pd.DataFrame([r for r in output_data if r]), visible=True),
generate_full_dataset_button: gr.Button(interactive=False),
save_dataset_button: gr.Button(f"💾 Save {namespace}/{dataset_name}" + (" (private)" if visibility != "public" else ""), visible=True, interactive=False),
full_dataset_section: gr.Column(visible=True)
}
# Prepare generation tasks for variants
generation_tasks = []
variants = islice(generate_variants(refined_preview_df), NUM_VARIANTS)
for i, variant in enumerate(variants):
indices = list(range(len(initial_rows) + i, NUM_ROWS, NUM_VARIANTS))
if indices: # Only create task if there are rows to generate
generation_tasks.append({
"func": generate_partial_dataset,
"kwargs": {
"title": title_md, "content": content_md, "search_query": search_query, "variant": variant,
"csv_header": csv_header, "output": output_data, "indices_to_generate": indices,
"history": [], # Use fresh history for each variant task
"is_real_data": is_real_data, "engine": engine
}
})
# Execute tasks in parallel and update UI progressively
for _ in iflatmap_unordered(lambda func, kwargs: func(**kwargs), generation_tasks): # Dispatch each task's generator with its own kwargs
yield {full_table_comp: pd.DataFrame([r for r in output_data if r])} # Update DataFrame display
yield {save_dataset_button: gr.Button(interactive=True)} # Enable save button
print(f"Full dataset generation complete for {dataset_name}.")
# Save Dataset to Hugging Face Hub
@save_dataset_button.click(
inputs=[dataset_title_md, dataset_description_md, search_bar, full_table_comp, select_namespace_dropdown, visibility_radio],
outputs=[save_dataset_button, open_dataset_message]
)
def _save_dataset(title_md, content_md, search_query, df, namespace, visibility, oauth_token: Optional[gr.OAuthToken] = None):
# Extract dataset name and tags from the markdown title
try:
dataset_name = title_md.split('\n')[0].strip('# ')
tags = title_md.split('tags:', 1)[1].strip()
except IndexError:
raise gr.Error("Could not parse dataset title.")
token = oauth_token.token if oauth_token else save_dataset_hf_token
if not token: raise gr.Error("Login required or set SAVE_DATASET_HF_TOKEN.")
repo_id = f"{namespace}/{dataset_name}"
dataset_url_params = f"q={search_query.replace(' ', '+')}&dataset={dataset_name.replace(' ', '+')}&tags={tags.replace(' ', '+')}"
dataset_url = f"{URL}?{dataset_url_params}"
gr.Info("Saving dataset...")
yield {save_dataset_button: gr.Button(interactive=False)} # Disable button during save
try:
create_repo(repo_id=repo_id, repo_type="dataset", private=visibility!="public", exist_ok=True, token=token)
df.to_csv(f"hf://datasets/{repo_id}/data.csv", storage_options={"token": token}, index=False)
card_content = DATASET_CARD_CONTENT.format(title=title_md, content=content_md, url=URL, dataset_url=dataset_url, model_id=model_id, search_query=search_query)
DatasetCard(card_content).push_to_hub(repo_id=repo_id, repo_type="dataset", token=token)
success_msg = f"# 🎉 Yay! Dataset saved to [{repo_id}](https://huggingface.co/datasets/{repo_id})!\n\n_PS: Check Settings to manage your saved datasets._"
gr.Info("Dataset saved successfully.")
yield {open_dataset_message: gr.Markdown(success_msg, visible=True)}
except HfHubHTTPError as e: raise gr.Error(f"HF Hub error: {e.message}")
except Exception as e: raise gr.Error(f"Save failed: {str(e)}")
finally: yield {save_dataset_button: gr.Button(interactive=True)} # Re-enable button
# Shareable URL Generation
@dataset_share_button.click(inputs=[dataset_title_md, search_bar], outputs=[dataset_share_textbox])
def _show_share_url(title_md, search_query):
try:
dataset_name = title_md.split('\n')[0].strip('# ')
tags = title_md.split('tags:', 1)[1].strip()
except IndexError:
raise gr.Error("Could not parse dataset title.")
share_url = f"{URL}?q={search_query.replace(' ', '+')}&dataset={dataset_name.replace(' ', '+')}&tags={tags.replace(' ', '+')}"
return gr.Textbox(share_url, visible=True)
# Settings Toggles
refinement_mode.change(lambda mode: gr.Group(visible=(mode == "sourced")), inputs=[refinement_mode], outputs=[source_group])
data_source_toggle.change(
lambda use_real, engines: (use_real, (engines[0] if engines else None) if use_real else None), # gr.State outputs take raw values
inputs=[data_source_toggle, selected_engines_state],
outputs=[is_real_data_state, current_engine_state]
)
@load_source_button.click(inputs=[source_type, source_path], outputs=[source_status])
def _load_source_data(source_type, source_path):
if not source_path: raise gr.Error("Source path/URL is required.")
try:
knowledge_base.load_source(source_type, source_path)
gr.Info("Source data loaded.")
return gr.Markdown("✅ Source loaded successfully", visible=True)
except (ConnectionError, ValueError, RuntimeError) as e:
raise gr.Error(f"Failed to load source: {str(e)}")
# Engine Settings Modal Logic
def _populate_engine_options(selected_engines):
engine_options_html = ""
for engine in searchEngines:
is_checked = "checked" if engine in selected_engines else ""
engine_options_html += f"""
<div class="flex items-center">
<input type="checkbox" id="engine-{engine.replace('.', '_')}" class="engine-checkbox mr-2 h-4 w-4" value="{engine}" {is_checked}>
<label for="engine-{engine.replace('.', '_')}" class="cursor-pointer">{engine}</label>
</div>
"""
return engine_options_html # Return the raw HTML string so it works both as an initial value and as an event update
def _save_engine_settings(selected_engines_json):
selected_engines = json.loads(selected_engines_json) if selected_engines_json else []
if not selected_engines:
gr.Warning("At least one search engine must be selected. Using DuckDuckGo as default.")
selected_engines = ["DuckDuckGo.com"]
gr.Info(f"Updated search engines. Using {len(selected_engines)} engine(s).")
return selected_engines, selected_engines[0] # Raw values for the selected-engines and current-engine states
# Refresh the engine options when the modal is opened or when (de)select-all is clicked
engine_settings_button.click(_populate_engine_options, inputs=[selected_engines_state], outputs=[engine_options_html_comp])
select_all_engines_btn.click(lambda: _populate_engine_options(searchEngines), outputs=[engine_options_html_comp])
deselect_all_engines_btn.click(lambda: _populate_engine_options([]), outputs=[engine_options_html_comp])
save_engines_btn.click(
_save_engine_settings,
inputs=[gr.JSON(elem_id="engine-options", visible=False)], # Hidden JSON component intended to carry the checked engines from the modal
outputs=[selected_engines_state, current_engine_state] # gr.Info is raised inside the handler, not returned as an output
)
engine_settings_button.click(lambda: gr.update(visible=True), outputs=[engine_modal])
# Close modal on save or when clicking outside (implicit via Gradio's modal handling)
# Initial App Load Logic
@demo.load(outputs=([search_page, dataset_page, dataset_title_md, dataset_description_md, dataset_source_badge, dataset_source_info, dataset_share_textbox, full_dataset_section, save_dataset_button, open_dataset_message, search_bar] + # Outputs for detail page and search bar
buttons + button_groups + [generated_texts_state, is_real_data_state] + # Outputs for search result buttons, their groups, and state
[select_namespace_dropdown, visibility_radio, source_group, data_source_toggle, current_engine_state, selected_engines_state, engine_options_html_comp])) # Outputs for settings
def _load_app(request: gr.Request, oauth_token: Optional[gr.OAuthToken]):
# Handle user login and namespace selection
if oauth_token:
try:
user_info = whoami(oauth_token.token)
namespaces = [user_info["name"]] + [org["name"] for org in user_info.get("orgs", [])]
yield {
select_namespace_dropdown: gr.Dropdown(choices=namespaces, value=user_info["name"], visible=True),
visibility_radio: gr.Radio(interactive=True),
}
except Exception: # Fallback if user info fails
yield {
select_namespace_dropdown: gr.Dropdown(choices=[NAMESPACE], value=NAMESPACE, visible=True),
visibility_radio: gr.Radio(interactive=True),
}
else: # Default settings if not logged in
yield {
select_namespace_dropdown: gr.Dropdown(choices=[NAMESPACE], value=NAMESPACE, visible=True),
visibility_radio: gr.Radio(interactive=False),
}
# Handle URL parameters for direct search or dataset loading
query_params = dict(request.query_params)
if "dataset" in query_params:
is_real = query_params.get("engine") is not None
engine = query_params.get("engine")
yield from _show_dataset_details(query_params.get("q", query_params["dataset"]), query_params["dataset"], query_params.get("tags", ""), is_real, engine)
yield {is_real_data_state: is_real, current_engine_state: engine}
elif "q" in query_params:
search_query = query_params["q"]
is_real = query_params.get("engine") is not None
engine = query_params.get("engine")
yield {search_bar: search_query}
yield {is_real_data_state: is_real, current_engine_state: engine}
yield from _update_search_results(search_query, (), is_real, engine)
else:
yield {search_page: gr.Column(visible=True)} # Show search page by default
# Initialize with default datasets
initial_outputs = {}
for i, line in enumerate(default_output):
try: dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" (", 1)
except ValueError: dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" ", 1)[0], ""
initial_outputs[buttons[2 * i]] = gr.Button(dataset_name, elem_classes="topButton")
initial_outputs[buttons[2 * i + 1]] = gr.Button(tags, elem_classes="bottomButton")
initial_outputs[button_groups[i]] = gr.Group(elem_classes="buttonsGroup")
yield initial_outputs
yield {generated_texts_state: (landing_page_datasets_generated_text,)}
# Initialize engine settings UI
yield {
data_source_toggle: gr.Checkbox(value=is_real_data_state.value),
engine_options_html_comp: _populate_engine_options(selected_engines_state.value)
}
if __name__ == "__main__":
demo.launch(share=False, server_name="0.0.0.0")