import io
import os
import re
import time
import requests
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Set, TypeVar, Union
from difflib import get_close_matches
from pathlib import Path
from itertools import islice
from functools import partial
from multiprocessing.pool import ThreadPool
from queue import Queue, Empty
import gradio as gr
import pandas as pd
import requests.exceptions
from huggingface_hub import InferenceClient, create_repo, DatasetCard
from huggingface_hub.utils import HfHubHTTPError
import json
# --- Configuration ---
model_id = "microsoft/Phi-3-mini-4k-instruct"
client = InferenceClient(model_id)
save_dataset_hf_token = os.environ.get("SAVE_DATASET_HF_TOKEN")
MAX_TOTAL_NB_ITEMS = 100
MAX_NB_ITEMS_PER_GENERATION_CALL = 10
NUM_ROWS = 100
NUM_VARIANTS = 10
NAMESPACE = "infinite-dataset-hub"
URL = "https://huggingface.co/spaces/infinite-dataset-hub/infinite-dataset-hub"
# --- Prompt Templates ---
GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY = (
"A Machine Learning Practioner is looking for a dataset that matches '{search_query}'. "
f"Generate a list of {MAX_NB_ITEMS_PER_GENERATION_CALL} names of quality datasets that don't exist but sound plausible and would "
"be helpful. Feel free to reuse words from the query '{search_query}' to name the datasets. "
"Every dataset should be about '{search_query}' and have descriptive tags/keywords including the ML task name associated with the dataset (classification, regression, anomaly detection, etc.). Use the following format:\n1. DatasetName1 (tag1, tag2, tag3)\n2. DatasetName2 (tag1, tag2, tag3)"
)
GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS = (
"An ML practitioner is looking for a dataset CSV after the query '{search_query}'. "
"Generate the first 5 rows of a plausible and quality CSV for the dataset '{dataset_name}'. "
"You can get inspiration from related keywords '{tags}' but most importantly the dataset should correspond to the query '{search_query}'. "
"Focus on quality text content and use a 'label' or 'labels' column if it makes sense (invent labels, avoid reusing the keywords, be accurate while labelling texts). "
"Reply using a short description of the dataset with title **Dataset Description:** followed by the CSV content in a code block and with title **CSV Content Preview:**."
)
GENERATE_MORE_ROWS = "Can you give me 10 additional samples in CSV format as well? Use the same CSV header '{csv_header}'."
GENERATE_VARIANTS_WITH_RARITY_AND_LABEL = "Focus on generating samples for the label '{label}' and ideally generate {rarity} samples."
GENERATE_VARIANTS_WITH_RARITY = "Focus on generating {rarity} samples."
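# Illustrative usage (not executed): the prompt templates above are filled in with str.format, e.g.
# GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY.format(search_query="urban air quality")
# GENERATE_MORE_ROWS.format(csv_header="text,label")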
# --- Default Datasets for Landing Page ---
landing_page_datasets_generated_text = """
1. NewsEventsPredict (classification, media, trend)
2. FinancialForecast (economy, stocks, regression)
3. HealthMonitor (science, real-time, anomaly detection)
4. SportsAnalysis (classification, performance, player tracking)
5. SciLiteracyTools (language modeling, science literacy, text classification)
6. RetailSalesAnalyzer (consumer behavior, sales trend, segmentation)
7. SocialSentimentEcho (social media, emotion analysis, clustering)
8. NewsEventTracker (classification, public awareness, topical clustering)
9. HealthVitalSigns (anomaly detection, biometrics, prediction)
10. GameStockPredict (classification, finance, sports contingency)
"""
default_output = landing_page_datasets_generated_text.strip().split("\n")
assert len(default_output) == MAX_NB_ITEMS_PER_GENERATION_CALL
# --- Dataset Card Template ---
DATASET_CARD_CONTENT = """
---
license: mit
tags:
- infinite-dataset-hub
- synthetic
---
{title}
_Note: This is an AI-generated dataset so its content may be inaccurate or false_
{content}
**Source of the data:**
The dataset was generated using the [Infinite Dataset Hub]({url}) and {model_id} using the query '{search_query}':
- **Dataset Generation Page**: {dataset_url}
- **Model**: https://huggingface.co/{model_id}
- **More Datasets**: https://huggingface.co/datasets?other=infinite-dataset-hub
"""
# --- Gradio HTML ---
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Infinite Dataset Hub</title>
<script src="https://cdn.tailwindcss.com"></script>
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/papaparse@5.3.0/papaparse.min.js"></script>
<script>
tailwind.config = {
darkMode: 'class',
theme: {
extend: {
colors: {
primary: '#5D5CDE',
},
}
}
}
</script>
<style>
.shimmer {
background: linear-gradient(90deg, #f0f0f0 25%, #e0e0e0 50%, #f0f0f0 75%);
background-size: 200% 100%;
animation: shimmer 1.5s infinite;
border-radius: 4px;
}
@keyframes shimmer {
0% {
background-position: -200% 0;
}
100% {
background-position: 200% 0;
}
}
/* Dark mode overrides */
.dark .shimmer {
background: linear-gradient(90deg, #2a2a2a 25%, #3a3a3a 50%, #2a2a2a 75%);
background-size: 200% 100%;
}
.dataset-card {
transition: transform 0.2s, box-shadow 0.2s;
}
.dataset-card:hover {
transform: translateY(-2px);
box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);
}
.dark .dataset-card:hover {
box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.3), 0 4px 6px -2px rgba(0, 0, 0, 0.2);
}
/* Table styling */
table {
width: 100%;
border-collapse: collapse;
margin: 1rem 0;
}
table thead th {
background-color: #f3f4f6;
padding: 0.75rem;
text-align: left;
font-weight: 600;
}
.dark table thead th {
background-color: #374151;
}
table tbody td {
padding: 0.75rem;
border-top: 1px solid #e5e7eb;
}
.dark table tbody td {
border-top: 1px solid #4b5563;
}
table tbody tr:nth-child(even) {
background-color: #f9fafb;
}
.dark table tbody tr:nth-child(even) {
background-color: #1f2937;
}
/* Search engine badge */
.engine-badge {
position: absolute;
top: -8px;
right: -8px;
font-size: 0.7rem;
padding: 2px 6px;
border-radius: 9999px;
background-color: #5D5CDE;
color: white;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.dark .engine-badge {
box-shadow: 0 2px 4px rgba(0,0,0,0.3);
}
/* Toggle switch */
.toggle-switch {
position: relative;
display: inline-block;
width: 50px;
height: 24px;
}
.toggle-switch input {
opacity: 0;
width: 0;
height: 0;
}
.toggle-slider {
position: absolute;
cursor: pointer;
top: 0;
left: 0;
right: 0;
bottom: 0;
background-color: #ccc;
transition: .4s;
border-radius: 24px;
}
.toggle-slider:before {
position: absolute;
content: "";
height: 16px;
width: 16px;
left: 4px;
bottom: 4px;
background-color: white;
transition: .4s;
border-radius: 50%;
}
input:checked + .toggle-slider {
background-color: #5D5CDE;
}
input:checked + .toggle-slider:before {
transform: translateX(26px);
}
</style>
</head>
<body class="bg-white dark:bg-gray-900 text-gray-800 dark:text-gray-200 min-h-screen">
<!-- Dark mode detection -->
<script>
if (window.matchMedia && window.matchMedia('(prefers-color-scheme: dark)').matches) {
document.documentElement.classList.add('dark');
}
window.matchMedia('(prefers-color-scheme: dark)').addEventListener('change', event => {
if (event.matches) {
document.documentElement.classList.add('dark');
} else {
document.documentElement.classList.remove('dark');
}
});
</script>
<div class="container mx-auto px-4 py-8">
<!-- Header -->
<header class="text-center mb-8">
<h1 class="text-3xl font-bold mb-2">🤗 Infinite Dataset Hub ♾️</h1>
<p class="text-lg text-gray-600 dark:text-gray-400">Generate datasets from AI and real-world data sources</p>
</header>
<!-- Main Content -->
<main>
<!-- Search Section -->
<div id="search-page" class="mb-8">
<div class="max-w-3xl mx-auto">
<div class="mb-4">
<div class="flex mb-2">
<input id="search-input" type="text" placeholder="Search datasets, get infinite results"
class="flex-grow px-4 py-3 text-base rounded-l-lg border border-gray-300 dark:border-gray-700 focus:outline-none focus:ring-2 focus:ring-primary dark:bg-gray-800">
<button id="search-button" class="bg-primary text-white px-6 py-3 rounded-r-lg hover:bg-opacity-90 transition">
🔍
</button>
</div>
<div class="flex items-center justify-between p-3 bg-gray-100 dark:bg-gray-800 rounded-lg">
<div class="flex items-center">
<label class="toggle-switch mr-3">
<input type="checkbox" id="data-source-toggle" checked>
<span class="toggle-slider"></span>
</label>
<div>
<span id="data-source-text" class="font-medium">Using: Real + AI Data</span>
<p class="text-xs text-gray-500 dark:text-gray-400">Toggle to switch between data sources</p>
</div>
</div>
<button id="engine-settings-button" class="text-primary hover:underline flex items-center">
<svg xmlns="http://www.w3.org/2000/svg" class="h-5 w-5 mr-1" viewBox="0 0 20 20" fill="currentColor">
<path fill-rule="evenodd" d="M11.49 3.17c-.38-1.56-2.6-1.56-2.98 0a1.532 1.532 0 01-2.286.948c-1.372-.836-2.942.734-2.106 2.106.54.886.061 2.042-.947 2.287-1.561.379-1.561 2.6 0 2.978a1.532 1.532 0 01.947 2.287c-.836 1.372.734 2.942 2.106 2.106a1.532 1.532 0 012.287.947c.379 1.561 2.6 1.561 2.978 0a1.533 1.533 0 012.287-.947c1.372.836 2.942-.734 2.106-2.106a1.533 1.533 0 01.947-2.287c1.561-.379 1.561-2.6 0-2.978a1.532 1.532 0 01-.947-2.287c.836-1.372-.734-2.942-2.106-2.106a1.532 1.532 0 01-2.287-.947zM10 13a3 3 0 100-6 3 3 0 000 6z" clip-rule="evenodd" />
</svg>
Search Engines
</button>
</div>
</div>
<!-- Search Engine Selection Modal -->
<div id="engine-modal" class="fixed inset-0 bg-black bg-opacity-50 flex items-center justify-center z-50 hidden">
<div class="bg-white dark:bg-gray-800 rounded-lg p-6 max-w-lg w-full max-h-[80vh] overflow-y-auto">
<div class="flex justify-between items-center mb-4">
<h3 class="text-xl font-bold">Search Engine Settings</h3>
<button id="close-modal-button" class="text-gray-500 hover:text-gray-700 dark:text-gray-400 dark:hover:text-gray-200">
<svg xmlns="http://www.w3.org/2000/svg" class="h-6 w-6" fill="none" viewBox="0 0 24 24" stroke="currentColor">
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M6 18L18 6M6 6l12 12" />
</svg>
</button>
</div>
<p class="mb-4 text-sm text-gray-600 dark:text-gray-400">
Select which search engines to use for real data retrieval. A diverse selection improves results.
</p>
<div id="engine-options" class="space-y-2 mb-6">
<!-- Engine options will be dynamically inserted here -->
</div>
<div class="flex justify-between">
<button id="select-all-engines" class="text-primary hover:underline">Select All</button>
<button id="deselect-all-engines" class="text-primary hover:underline">Deselect All</button>
</div>
<div class="mt-6 flex justify-end">
<button id="save-engines-button" class="bg-primary text-white px-4 py-2 rounded hover:bg-opacity-90 transition">
Save Settings
</button>
</div>
</div>
</div>
<div id="dataset-results" class="grid grid-cols-1 md:grid-cols-2 gap-4 mt-6">
<!-- Dataset cards will be dynamically inserted here -->
</div>
<div id="load-more-container" class="text-center mt-6 hidden">
<button id="load-more-button" class="bg-gray-200 dark:bg-gray-700 px-6 py-3 rounded-lg hover:bg-gray-300 dark:hover:bg-gray-600 transition">
Load more datasets
</button>
</div>
</div>
</div>
<!-- Dataset Detail Page -->
<div id="dataset-page" class="hidden max-w-4xl mx-auto">
<button id="back-button" class="flex items-center text-primary mb-4 hover:underline">
<svg xmlns="http://www.w3.org/2000/svg" class="h-5 w-5 mr-1" viewBox="0 0 20 20" fill="currentColor">
<path fill-rule="evenodd" d="M9.707 14.707a1 1 0 01-1.414 0l-4-4a1 1 0 010-1.414l4-4a1 1 0 011.414 1.414L7.414 9H15a1 1 0 110 2H7.414l2.293 2.293a1 1 0 010 1.414z" clip-rule="evenodd" />
</svg>
Back to Search
</button>
<div id="dataset-header" class="mb-4">
<div class="flex items-center justify-between">
<h2 id="dataset-title" class="text-2xl font-bold"></h2>
<span id="data-source-badge" class="px-3 py-1 rounded-full text-xs font-medium bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-200">
Real Data
</span>
</div>
<div id="dataset-tags" class="text-sm text-gray-600 dark:text-gray-400 mt-1"></div>
</div>
<div id="data-source-info" class="bg-blue-50 dark:bg-blue-900 p-4 rounded-lg mb-6 text-blue-800 dark:text-blue-200">
<h3 class="font-semibold mb-1 flex items-center">
<svg xmlns="http://www.w3.org/2000/svg" class="h-5 w-5 mr-1" viewBox="0 0 20 20" fill="currentColor">
<path fill-rule="evenodd" d="M18 10a8 8 0 11-16 0 8 8 0 0116 0zm-7-4a1 1 0 11-2 0 1 1 0 012 0zM9 9a1 1 0 000 2v3a1 1 0 001 1h1a1 1 0 100-2v-3a1 1 0 00-1-1H9z" clip-rule="evenodd" />
</svg>
Data Source Information
</h3>
<p id="source-details" class="text-sm"></p>
</div>
<div id="dataset-description" class="prose dark:prose-invert prose-sm sm:prose max-w-none mb-6"></div>
<div id="dataset-preview" class="mb-6 overflow-x-auto">
<h3 class="text-xl font-semibold mb-3">Dataset Preview</h3>
<div id="preview-table" class="border dark:border-gray-700 rounded-lg overflow-hidden"></div>
</div>
<div id="generate-actions" class="mb-8">
<button id="generate-full-button" class="bg-primary text-white px-6 py-3 rounded-lg hover:bg-opacity-90 transition mr-3">
Generate Full Dataset
</button>
<div id="generate-status" class="hidden mt-4">
<div class="flex items-center">
<div class="animate-spin rounded-full h-5 w-5 border-b-2 border-primary mr-3"></div>
<span>Generating dataset... <span id="rows-count">0</span> rows created</span>
</div>
<div class="w-full bg-gray-200 dark:bg-gray-700 rounded-full h-2.5 mt-2">
<div id="progress-bar" class="bg-primary h-2.5 rounded-full" style="width: 0%"></div>
</div>
</div>
</div>
<div id="full-dataset" class="hidden mb-6">
<h3 class="text-xl font-semibold mb-3">Full Dataset</h3>
<div id="full-table" class="border dark:border-gray-700 rounded-lg overflow-hidden"></div>
<div class="mt-4 flex flex-wrap gap-3">
<button id="download-csv-button" class="bg-green-600 hover:bg-green-700 text-white px-4 py-2 rounded-lg transition flex items-center">
<svg xmlns="http://www.w3.org/2000/svg" class="h-5 w-5 mr-2" viewBox="0 0 20 20" fill="currentColor">
<path fill-rule="evenodd" d="M3 17a1 1 0 011-1h12a1 1 0 110 2H4a1 1 0 01-1-1zm3.293-7.707a1 1 0 011.414 0L9 10.586V3a1 1 0 112 0v7.586l1.293-1.293a1 1 0 111.414 1.414l-3 3a1 1 0 01-1.414 0l-3-3a1 1 0 010-1.414z" clip-rule="evenodd" />
</svg>
Download CSV
</button>
<button id="download-json-button" class="bg-yellow-600 hover:bg-yellow-700 text-white px-4 py-2 rounded-lg transition flex items-center">
<svg xmlns="http://www.w3.org/2000/svg" class="h-5 w-5 mr-2" viewBox="0 0 20 20" fill="currentColor">
<path fill-rule="evenodd" d="M3 17a1 1 0 011-1h12a1 1 0 110 2H4a1 1 0 01-1-1zm3.293-7.707a1 1 0 011.414 0L9 10.586V3a1 1 0 112 0v7.586l1.293-1.293a1 1 0 111.414 1.414l-3 3a1 1 0 01-1.414 0l-3-3a1 1 0 010-1.414z" clip-rule="evenodd" />
</svg>
Download JSON
</button>
<button id="download-parquet-button" class="bg-blue-600 hover:bg-blue-700 text-white px-4 py-2 rounded-lg transition flex items-center">
<svg xmlns="http://www.w3.org/2000/svg" class="h-5 w-5 mr-2" viewBox="0 0 20 20" fill="currentColor">
<path fill-rule="evenodd" d="M3 17a1 1 0 011-1h12a1 1 0 110 2H4a1 1 0 01-1-1zm3.293-7.707a1 1 0 011.414 0L9 10.586V3a1 1 0 112 0v7.586l1.293-1.293a1 1 0 111.414 1.414l-3 3a1 1 0 01-1.414 0l-3-3a1 1 0 010-1.414z" clip-rule="evenodd" />
</svg>
Download Parquet
</button>
</div>
</div>
</div>
</main>
<!-- Footer -->
<footer class="mt-12 text-center text-sm text-gray-600 dark:text-gray-400">
<p>Powered by Claude-3.7-Sonnet • Datasets generated from real sources and AI</p>
</footer>
</div>
<script>
// Constants and global state
const MAX_DATASETS_PER_PAGE = 10;
const MAX_FULL_DATASET_ROWS = 100;
// List of search engines
const searchEngines = [
"AlltheInternet.com", "DuckDuckGo.com", "Google.com", "Bing.com", "Search.Yahoo.com",
"Startpage.com", "Qwant.com", "Ecosia.org", "WolframAlpha.com", "Mojeek.co.uk",
"Search.Brave.com", "Yandex.com", "Baidu.com", "Gibiru.com", "MetaGer.org",
"Swisscows.com", "Presearch.com", "Ekoru.org", "Search.Lilo.org"
];
let currentDatasets = [];
let currentPage = 1;
let currentSearchQuery = '';
let currentDataset = null;
let fullDatasetRows = [];
let useRealData = true;
let selectedEngines = ["DuckDuckGo.com", "Bing.com", "Search.Yahoo.com", "Search.Brave.com", "Ecosia.org"];
let currentEngine = ""; // Store the engine currently being used
// DOM Elements
const searchInput = document.getElementById('search-input');
const searchButton = document.getElementById('search-button');
const resultsContainer = document.getElementById('dataset-results');
const loadMoreContainer = document.getElementById('load-more-container');
const loadMoreButton = document.getElementById('load-more-button');
const searchPage = document.getElementById('search-page');
const datasetPage = document.getElementById('dataset-page');
const backButton = document.getElementById('back-button');
const datasetTitle = document.getElementById('dataset-title');
const datasetTags = document.getElementById('dataset-tags');
const datasetDescription = document.getElementById('dataset-description');
const previewTable = document.getElementById('preview-table');
const generateFullButton = document.getElementById('generate-full-button');
const generateStatus = document.getElementById('generate-status');
const rowsCount = document.getElementById('rows-count');
const progressBar = document.getElementById('progress-bar');
const fullDatasetSection = document.getElementById('full-dataset');
const fullTable = document.getElementById('full-table');
const downloadCsvButton = document.getElementById('download-csv-button');
const downloadJsonButton = document.getElementById('download-json-button');
const downloadParquetButton = document.getElementById('download-parquet-button');
const dataSourceToggle = document.getElementById('data-source-toggle');
const dataSourceText = document.getElementById('data-source-text');
const dataSourceBadge = document.getElementById('data-source-badge');
const sourceDetails = document.getElementById('source-details');
const engineSettingsButton = document.getElementById('engine-settings-button');
const engineModal = document.getElementById('engine-modal');
const engineOptions = document.getElementById('engine-options');
const closeModalButton = document.getElementById('close-modal-button');
const saveEnginesButton = document.getElementById('save-engines-button');
const selectAllEngines = document.getElementById('select-all-engines');
const deselectAllEngines = document.getElementById('deselect-all-engines');
// Event Listeners
document.addEventListener('DOMContentLoaded', () => {
searchButton.addEventListener('click', performSearch);
searchInput.addEventListener('keypress', (e) => {
if (e.key === 'Enter') performSearch();
});
loadMoreButton.addEventListener('click', loadMoreDatasets);
backButton.addEventListener('click', showSearchPage);
generateFullButton.addEventListener('click', generateFullDataset);
downloadCsvButton.addEventListener('click', () => downloadData('csv'));
downloadJsonButton.addEventListener('click', () => downloadData('json'));
downloadParquetButton.addEventListener('click', () => downloadData('parquet'));
dataSourceToggle.addEventListener('change', toggleDataSource);
engineSettingsButton.addEventListener('click', showEngineModal);
closeModalButton.addEventListener('click', hideEngineModal);
saveEnginesButton.addEventListener('click', saveEngineSettings);
selectAllEngines.addEventListener('click', () => toggleAllEngines(true));
deselectAllEngines.addEventListener('click', () => toggleAllEngines(false));
// Initialize engine options
populateEngineOptions();
// Show initial placeholder datasets
showPlaceholderDatasets();
});
// Search Engine Settings
function populateEngineOptions() {
engineOptions.innerHTML = '';
searchEngines.forEach(engine => {
const isChecked = selectedEngines.includes(engine);
const optionDiv = document.createElement('div');
optionDiv.className = 'flex items-center';
optionDiv.innerHTML = `
<input type="checkbox" id="engine-${engine}" class="engine-checkbox mr-2 h-4 w-4"
value="${engine}" ${isChecked ? 'checked' : ''}>
<label for="engine-${engine}" class="cursor-pointer">${engine}</label>
`;
engineOptions.appendChild(optionDiv);
});
}
function showEngineModal() {
engineModal.classList.remove('hidden');
}
function hideEngineModal() {
engineModal.classList.add('hidden');
}
function saveEngineSettings() {
const checkboxes = document.querySelectorAll('.engine-checkbox:checked');
selectedEngines = Array.from(checkboxes).map(cb => cb.value);
if (selectedEngines.length === 0) {
// Ensure at least one engine is selected
selectedEngines = ["DuckDuckGo.com"];
document.getElementById(`engine-DuckDuckGo.com`).checked = true;
showNotification("At least one search engine must be selected. Using DuckDuckGo as default.");
}
hideEngineModal();
showNotification(`Updated search engine settings. Using ${selectedEngines.length} engines.`);
}
function toggleAllEngines(select) {
const checkboxes = document.querySelectorAll('.engine-checkbox');
checkboxes.forEach(cb => {
cb.checked = select;
});
}
// Toggle data source between real and AI
function toggleDataSource() {
useRealData = dataSourceToggle.checked;
dataSourceText.textContent = useRealData ? "Using: Real + AI Data" : "Using: AI Data Only";
// Show or hide engine settings button
engineSettingsButton.style.display = useRealData ? "flex" : "none";
showNotification(`Switched to ${useRealData ? "combined real and synthetic" : "synthetic-only"} data mode`);
}
// Search functionality
function performSearch() {
const query = searchInput.value.trim();
if (!query) return;
currentSearchQuery = query;
currentPage = 1;
currentDatasets = [];
resultsContainer.innerHTML = '';
showLoadingSkeletons();
if (useRealData) {
// Use real data from search engines + AI
searchWithRealData(query);
} else {
// Use only AI-generated data
searchWithAIData(query);
}
}
function searchWithRealData(query) {
// Randomly select a search engine from the user's selected engines
currentEngine = selectedEngines[Math.floor(Math.random() * selectedEngines.length)];
// Register handler for dataset names based on real search results
window.Poe.registerHandler("real-search-handler", (result) => {
if (result.status === "error") {
showError("Error querying search engines");
return;
}
const message = result.responses[0];
if (message.status === "complete") {
// Parse the dataset names and tags from the response
const datasets = parseDatasetResults(message.content);
datasets.forEach(dataset => {
dataset.isReal = true;
dataset.engine = currentEngine;
});
currentDatasets = datasets;
// Display the datasets
resultsContainer.innerHTML = '';
displayDatasets(datasets);
// Show load more button if we have results
if (datasets.length > 0) {
loadMoreContainer.classList.remove('hidden');
}
}
});
try {
window.Poe.sendUserMessage(
`@Claude-3.7-Sonnet You are a data specialist who can transform real search results into structured datasets.
A user is searching for data about: "${query}"
Imagine you've queried ${currentEngine} and received real search results. Create a list of 10 specific datasets that could be created from these search results.
For each dataset:
1. Give it a clear, specific name related to the search topic
2. Include 3-5 relevant tags in parentheses, with one tag specifying the ML task type (classification, regression, clustering, etc.)
Format each dataset as:
1. DatasetName (tag1, tag2, ml_task_tag)
Make these datasets sound like real collections that could be created from ${currentEngine} search results on "${query}".`,
{
handler: "real-search-handler",
stream: false,
openChat: false
}
);
} catch (err) {
showError("Error sending message: " + err);
// Fall back to AI data
searchWithAIData(query);
}
}
function searchWithAIData(query) {
// Register handler for AI-generated dataset names
window.Poe.registerHandler("dataset-search-handler", (result) => {
if (result.status === "error") {
showError("Error generating datasets");
return;
}
const message = result.responses[0];
if (message.status === "complete") {
// Parse the dataset names and tags from the response
const datasets = parseDatasetResults(message.content);
datasets.forEach(dataset => {
dataset.isReal = false;
});
currentDatasets = datasets;
// Display the datasets
resultsContainer.innerHTML = '';
displayDatasets(datasets);
// Show load more button if we have results
if (datasets.length > 0) {
loadMoreContainer.classList.remove('hidden');
}
}
});
try {
window.Poe.sendUserMessage(
`@Claude-3.7-Sonnet A Machine Learning Practitioner is looking for a dataset that matches '${query}'.
Generate a list of ${MAX_DATASETS_PER_PAGE} names of quality datasets that don't exist but sound plausible and would
be helpful. Feel free to reuse words from the query '${query}' to name the datasets.
Every dataset should be about '${query}' and have descriptive tags/keywords including the ML task name associated with the dataset (classification, regression, anomaly detection, etc.). Use the following format:
1. DatasetName1 (tag1, tag2, tag3)
2. DatasetName2 (tag1, tag2, tag3)`,
{
handler: "dataset-search-handler",
stream: false,
openChat: false
}
);
} catch (err) {
showError("Error sending message: " + err);
}
}
function parseDatasetResults(content) {
const lines = content.split('\n');
const datasets = [];
lines.forEach(line => {
// Match lines that start with a number followed by a period
const match = line.match(/^\s*\d+\.\s+(.+?)\s+\((.+?)\)/);
if (match) {
const name = match[1].trim();
const tags = match[2].split(',').map(tag => tag.trim());
datasets.push({ name, tags });
}
});
return datasets;
}
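// Illustrative example (not executed): given the line
//   "3. HealthMonitor (science, real-time, anomaly detection)"
// parseDatasetResults returns [{ name: "HealthMonitor", tags: ["science", "real-time", "anomaly detection"] }]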
function displayDatasets(datasets) {
datasets.forEach(dataset => {
const card = document.createElement('div');
card.className = 'dataset-card bg-white dark:bg-gray-800 rounded-lg p-4 border border-gray-200 dark:border-gray-700 cursor-pointer relative';
const tagsHtml = dataset.tags.map(tag =>
`<span class="inline-block bg-gray-100 dark:bg-gray-700 text-gray-800 dark:text-gray-300 text-xs px-2 py-1 rounded mr-1 mb-1">${tag}</span>`
).join('');
// Add a badge for real data
let badgeHtml = '';
if (dataset.isReal) {
badgeHtml = `<span class="engine-badge" title="Data from ${dataset.engine}">${dataset.engine.split('.')[0]}</span>`;
}
card.innerHTML = `
${badgeHtml}
<h3 class="text-lg font-semibold mb-2">${dataset.name}</h3>
<div class="flex flex-wrap mt-2">${tagsHtml}</div>
`;
card.addEventListener('click', () => showDatasetDetails(dataset));
resultsContainer.appendChild(card);
});
}
function showLoadingSkeletons() {
for (let i = 0; i < 4; i++) {
const skeleton = document.createElement('div');
skeleton.className = 'bg-white dark:bg-gray-800 rounded-lg p-4 border border-gray-200 dark:border-gray-700';
skeleton.innerHTML = `
<div class="shimmer h-6 w-3/4 mb-2"></div>
<div class="flex flex-wrap mt-2">
<div class="shimmer h-6 w-16 rounded mr-1 mb-1"></div>
<div class="shimmer h-6 w-20 rounded mr-1 mb-1"></div>
<div class="shimmer h-6 w-24 rounded mr-1 mb-1"></div>
</div>
`;
resultsContainer.appendChild(skeleton);
}
}
function loadMoreDatasets() {
currentPage++;
// Use the same data source (real or AI) as the initial search
if (useRealData) {
loadMoreRealDatasets();
} else {
loadMoreAIDatasets();
}
}
function loadMoreRealDatasets() {
// Rotate to a different search engine for variety
const previousEngine = currentEngine;
while (currentEngine === previousEngine && selectedEngines.length > 1) {
currentEngine = selectedEngines[Math.floor(Math.random() * selectedEngines.length)];
}
// Register handler for more datasets
window.Poe.registerHandler("more-real-datasets-handler", (result) => {
if (result.status === "error") {
showError("Error generating more datasets");
return;
}
const message = result.responses[0];
if (message.status === "complete") {
// Parse the dataset names and tags from the response
const datasets = parseDatasetResults(message.content);
datasets.forEach(dataset => {
dataset.isReal = true;
dataset.engine = currentEngine;
});
currentDatasets = [...currentDatasets, ...datasets];
// Display the datasets
displayDatasets(datasets);
}
});
try {
window.Poe.sendUserMessage(
`@Claude-3.7-Sonnet You're a data specialist who can transform real search results into structured datasets.
Continue our previous search for data about: "${currentSearchQuery}"
Now let's use a different search engine: ${currentEngine}
Create 10 more specific datasets that could be created from these search results. Make sure these are different from the previous datasets.
Use the same format:
1. DatasetName (tag1, tag2, ml_task_tag)
Make these datasets sound like real collections that could be created from ${currentEngine} search results on "${currentSearchQuery}".`,
{
handler: "more-real-datasets-handler",
stream: false,
openChat: false
}
);
} catch (err) {
showError("Error sending message: " + err);
// Fall back to AI data
loadMoreAIDatasets();
}
}
function loadMoreAIDatasets() {
// Register handler for more AI datasets
window.Poe.registerHandler("more-datasets-handler", (result) => {
if (result.status === "error") {
showError("Error generating more datasets");
return;
}
const message = result.responses[0];
if (message.status === "complete") {
// Parse the dataset names and tags from the response
const datasets = parseDatasetResults(message.content);
datasets.forEach(dataset => {
dataset.isReal = false;
});
currentDatasets = [...currentDatasets, ...datasets];
// Display the datasets
displayDatasets(datasets);
}
});
try {
window.Poe.sendUserMessage(
`@Claude-3.7-Sonnet Please generate ${MAX_DATASETS_PER_PAGE} more dataset names about '${currentSearchQuery}'. Use the same format as before:
1. DatasetName1 (tag1, tag2, tag3)
Make sure these are completely different from previous suggestions.`,
{
handler: "more-datasets-handler",
stream: false,
openChat: false
}
);
} catch (err) {
showError("Error sending message: " + err);
}
}
function showDatasetDetails(dataset) {
currentDataset = dataset;
searchPage.classList.add('hidden');
datasetPage.classList.remove('hidden');
// Update UI with dataset info
datasetTitle.textContent = dataset.name;
datasetTags.innerHTML = dataset.tags.map(tag =>
`<span class="inline-block bg-gray-100 dark:bg-gray-700 text-gray-800 dark:text-gray-300 text-xs px-2 py-1 rounded mr-1 mb-1">${tag}</span>`
).join('');
// Update source badge
if (dataset.isReal) {
dataSourceBadge.textContent = "Real Data";
dataSourceBadge.className = "px-3 py-1 rounded-full text-xs font-medium bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-200";
sourceDetails.innerHTML = `This dataset is based on real information queried from <strong>${dataset.engine}</strong> for the search term "<strong>${currentSearchQuery}</strong>". The data has been structured for machine learning use.`;
} else {
dataSourceBadge.textContent = "AI-Generated";
dataSourceBadge.className = "px-3 py-1 rounded-full text-xs font-medium bg-purple-100 text-purple-800 dark:bg-purple-900 dark:text-purple-200";
sourceDetails.innerHTML = `This is an AI-generated dataset created using Claude-3.7-Sonnet. The content is synthetic and designed to represent plausible data related to "${currentSearchQuery}".`;
}
// Clear previous content
datasetDescription.innerHTML = '<div class="shimmer h-4 w-full mb-2"></div>'.repeat(3);
previewTable.innerHTML = '';
fullDatasetSection.classList.add('hidden');
generateStatus.classList.add('hidden');
generateFullButton.disabled = false;
// Reset full dataset
fullDatasetRows = [];
// Generate dataset preview - different approach for real vs AI data
if (dataset.isReal) {
generateRealDatasetPreview(dataset);
} else {
generateAIDatasetPreview(dataset);
}
// Scroll to top
window.scrollTo(0, 0);
}
function generateRealDatasetPreview(dataset) {
window.Poe.registerHandler("real-preview-handler", (result) => {
if (result.status === "error") {
datasetDescription.innerHTML = '<p class="text-red-500">Error generating dataset preview</p>';
return;
}
const message = result.responses[0];
if (message.status === "complete") {
const content = message.content;
// Extract description and CSV
const parts = content.split('**CSV Content Preview:**');
let description = "";
let csvContent = "";
if (parts.length > 1) {
description = parts[0].replace('**Dataset Description:**', '').trim();
csvContent = parts[1].trim();
// Clean up CSV content (remove markdown code block markers)
csvContent = csvContent.replace(/```csv\n|```\n|```/g, '').trim();
} else {
description = "No description available";
csvContent = content;
}
// Display description
datasetDescription.innerHTML = marked.parse(description);
// Parse and display CSV preview
try {
const results = Papa.parse(csvContent, {
header: true,
skipEmptyLines: true
});
if (results.data && results.data.length > 0) {
// Create table from CSV data
createTable(previewTable, results.data, results.meta.fields);
} else {
previewTable.innerHTML = '<p class="p-4 text-red-500">No preview data available</p>';
}
} catch (err) {
previewTable.innerHTML = `<p class="p-4 text-red-500">Error parsing CSV: ${err.message}</p>`;
}
}
});
try {
const tagsStr = dataset.tags.join(', ');
window.Poe.sendUserMessage(
`@Claude-3.7-Sonnet You're a specialist in converting web search results into structured data.
Based on search results from ${dataset.engine} about "${currentSearchQuery}",
create a preview of the dataset "${dataset.name}" with tags "${tagsStr}".
First, write a detailed description of what this dataset contains, its structure, and how it was constructed from web search results.
Then, generate a realistic 5-row CSV preview that resembles data you might get if you scraped and structured real results from ${dataset.engine}.
Format your response with:
**Dataset Description:** [detailed description]
**CSV Content Preview:**
\`\`\`csv
[CSV header and 5 rows of realistic data]
\`\`\`
Include relevant columns for the dataset type, with proper labels/categories where appropriate. The data should look like it came from real sources.`,
{
handler: "real-preview-handler",
stream: false,
openChat: false
}
);
} catch (err) {
datasetDescription.innerHTML = `<p class="text-red-500">Error: ${err.message}</p>`;
}
}
function generateAIDatasetPreview(dataset) {
window.Poe.registerHandler("dataset-preview-handler", (result) => {
if (result.status === "error") {
datasetDescription.innerHTML = '<p class="text-red-500">Error generating dataset preview</p>';
return;
}
const message = result.responses[0];
if (message.status === "complete") {
const content = message.content;
// Extract description and CSV
const parts = content.split('**CSV Content Preview:**');
let description = "";
let csvContent = "";
if (parts.length > 1) {
description = parts[0].replace('**Dataset Description:**', '').trim();
csvContent = parts[1].trim();
// Clean up CSV content (remove markdown code block markers)
csvContent = csvContent.replace(/```csv\n|```\n|```/g, '').trim();
} else {
description = "No description available";
csvContent = content;
}
// Display description
datasetDescription.innerHTML = marked.parse(description);
// Parse and display CSV preview
try {
const results = Papa.parse(csvContent, {
header: true,
skipEmptyLines: true
});
if (results.data && results.data.length > 0) {
// Create table from CSV data
createTable(previewTable, results.data, results.meta.fields);
} else {
previewTable.innerHTML = '<p class="p-4 text-red-500">No preview data available</p>';
}
} catch (err) {
previewTable.innerHTML = `<p class="p-4 text-red-500">Error parsing CSV: ${err.message}</p>`;
}
}
});
try {
const tagsStr = dataset.tags.join(', ');
window.Poe.sendUserMessage(
`@Claude-3.7-Sonnet An ML practitioner is looking for a dataset CSV after the query '${currentSearchQuery}'.
Generate the first 5 rows of a plausible and quality CSV for the dataset '${dataset.name}'.
You can get inspiration from related keywords '${tagsStr}' but most importantly the dataset should correspond to the query '${currentSearchQuery}'.
Focus on quality text content and use a 'label' or 'labels' column if it makes sense (invent labels, avoid reusing the keywords, be accurate while labelling texts).
Reply using a short description of the dataset with title **Dataset Description:** followed by the CSV content in a code block and with title **CSV Content Preview:**`,
{
handler: "dataset-preview-handler",
stream: false,
openChat: false
}
);
} catch (err) {
datasetDescription.innerHTML = `<p class="text-red-500">Error: ${err.message}</p>`;
}
}
function createTable(container, data, headers) {
container.innerHTML = '';
const table = document.createElement('table');
table.className = 'w-full';
// Create header
const thead = document.createElement('thead');
const headerRow = document.createElement('tr');
headers.forEach(header => {
const th = document.createElement('th');
th.textContent = header;
headerRow.appendChild(th);
});
thead.appendChild(headerRow);
table.appendChild(thead);
// Create body
const tbody = document.createElement('tbody');
data.forEach(row => {
const tr = document.createElement('tr');
headers.forEach(header => {
const td = document.createElement('td');
td.textContent = row[header] || '';
tr.appendChild(td);
});
tbody.appendChild(tr);
});
table.appendChild(tbody);
container.appendChild(table);
}
function generateFullDataset() {
// Disable button and show status
generateFullButton.disabled = true;
generateStatus.classList.remove('hidden');
rowsCount.textContent = '0';
progressBar.style.width = '0%';
// Set up variables for tracking generation
let csvHeader = '';
const targetRows = MAX_FULL_DATASET_ROWS;
let currentRows = 0;
fullDatasetRows = [];
// Get the CSV header from the preview table
const previewHeaders = Array.from(previewTable.querySelectorAll('thead th')).map(th => th.textContent);
csvHeader = previewHeaders.join(',');
// Add initial rows from preview
const previewRows = Array.from(previewTable.querySelectorAll('tbody tr')).map(tr => {
const row = {};
Array.from(tr.querySelectorAll('td')).forEach((td, index) => {
row[previewHeaders[index]] = td.textContent;
});
return row;
});
fullDatasetRows = [...previewRows];
currentRows = previewRows.length;
updateGenerationProgress(currentRows, targetRows);
// Choose generation method based on dataset type
if (currentDataset.isReal) {
generateFullRealDataset(previewHeaders, csvHeader, currentRows, targetRows);
} else {
generateFullAIDataset(previewHeaders, csvHeader, currentRows, targetRows);
}
}
function generateFullRealDataset(previewHeaders, csvHeader, currentRows, targetRows) {
// Function to generate more rows in batches from "real" search results
const generateBatch = (batchIndex) => {
const batchSize = 15; // Larger batches for efficiency
const startRow = currentRows + batchIndex * batchSize;
if (startRow >= targetRows) {
// We've reached the target, show the full dataset
showFullDataset();
return;
}
window.Poe.registerHandler(`real-batch-${batchIndex}-handler`, (result) => {
if (result.status === "error") {
showError("Error generating dataset rows");
return;
}
const message = result.responses[0];
if (message.status === "complete") {
const content = message.content;
// Extract CSV content (remove markdown code block markers)
let csvContent = content.replace(/```csv\n|```\n|```/g, '').trim();
// If the response contains fenced code blocks, extract the first one as the CSV
if (content.includes('```')) {
const codeBlocks = content.match(/```(?:csv)?\n([\s\S]*?)```/g) || [];
if (codeBlocks.length > 0) {
csvContent = codeBlocks[0].replace(/```(?:csv)?\n|```/g, '').trim();
}
}
try {
// Parse the CSV
const results = Papa.parse(csvContent, {
header: true,
skipEmptyLines: true
});
if (results.data && results.data.length > 0) {
// Add the new rows
fullDatasetRows = [...fullDatasetRows, ...results.data];
currentRows += results.data.length;
// Update progress
updateGenerationProgress(currentRows, targetRows);
// Generate next batch
generateBatch(batchIndex + 1);
} else {
// Try again with a different prompt
generateBatch(batchIndex);
}
} catch (err) {
console.error("Error parsing CSV:", err);
// Try again
generateBatch(batchIndex);
}
}
});
try {
// For variation, rotate through engines for each batch
const engineForBatch = selectedEngines[batchIndex % selectedEngines.length] || currentDataset.engine;
window.Poe.sendUserMessage(
`@Claude-3.7-Sonnet You're expanding a dataset based on search results from ${engineForBatch}.
For the dataset "${currentDataset.name}" about "${currentSearchQuery}", please generate ${batchSize} more rows of data.
Use this exact CSV header: ${csvHeader}
The data should look realistic, as if it came from actual ${engineForBatch} search results for "${currentSearchQuery}".
Include appropriate values for each field, maintaining the same patterns and types as seen in the existing data.
Only include the CSV data in your response (header + ${batchSize} rows), no explanations or additional text.`,
{
handler: `real-batch-${batchIndex}-handler`,
stream: false,
openChat: false
}
);
} catch (err) {
showError("Error sending message: " + err);
}
};
// Start generating batches
generateBatch(0);
}
function generateFullAIDataset(previewHeaders, csvHeader, currentRows, targetRows) {
// Function to generate more rows in batches from AI
const generateBatch = (batchIndex) => {
const batchSize = 10;
const startRow = currentRows + batchIndex * batchSize;
if (startRow >= targetRows) {
// We've reached the target, show the full dataset
showFullDataset();
return;
}
window.Poe.registerHandler(`batch-${batchIndex}-handler`, (result) => {
if (result.status === "error") {
showError("Error generating dataset rows");
return;
}
const message = result.responses[0];
if (message.status === "complete") {
const content = message.content;
// Extract CSV content (remove markdown code block markers)
let csvContent = content.replace(/```csv\n|```\n|```/g, '').trim();
// If the response contains fenced code blocks, extract the first one as the CSV
if (content.includes('```')) {
const codeBlocks = content.match(/```(?:csv)?\n([\s\S]*?)```/g) || [];
if (codeBlocks.length > 0) {
csvContent = codeBlocks[0].replace(/```(?:csv)?\n|```/g, '').trim();
}
}
try {
// Parse the CSV
const results = Papa.parse(csvContent, {
header: true,
skipEmptyLines: true
});
if (results.data && results.data.length > 0) {
// Add the new rows
fullDatasetRows = [...fullDatasetRows, ...results.data];
currentRows += results.data.length;
// Update progress
updateGenerationProgress(currentRows, targetRows);
// Generate next batch
generateBatch(batchIndex + 1);
} else {
// Try again with a different prompt
generateBatch(batchIndex);
}
} catch (err) {
console.error("Error parsing CSV:", err);
// Try again
generateBatch(batchIndex);
}
}
});
try {
const tagsStr = currentDataset.tags.join(', ');
window.Poe.sendUserMessage(
`@Claude-3.7-Sonnet For the dataset '${currentDataset.name}' about '${currentSearchQuery}' with tags '${tagsStr}',
please generate ${batchSize} more sample rows in CSV format. Use the same CSV header: ${csvHeader}
Only include the CSV data in your response, no explanations or additional text.`,
{
handler: `batch-${batchIndex}-handler`,
stream: false,
openChat: false
}
);
} catch (err) {
showError("Error sending message: " + err);
}
};
// Start generating batches
generateBatch(0);
}
function updateGenerationProgress(current, total) {
rowsCount.textContent = current;
const percentage = Math.min(100, Math.floor((current / total) * 100));
progressBar.style.width = `${percentage}%`;
}
function showFullDataset() {
// Hide generation status
generateStatus.classList.add('hidden');
// Show full dataset section
fullDatasetSection.classList.remove('hidden');
// Get headers from the data
const headers = Object.keys(fullDatasetRows[0] || {});
// Create and display the table
createTable(fullTable, fullDatasetRows.slice(0, 10), headers);
// Add a note about showing limited rows
const note = document.createElement('p');
note.className = 'text-sm text-gray-600 dark:text-gray-400 mt-2';
note.textContent = `Showing 10 of ${fullDatasetRows.length} rows. Use the download buttons to get the complete dataset.`;
fullTable.appendChild(note);
}
function downloadData(format) {
if (fullDatasetRows.length === 0) return;
const filename = `${currentDataset.name.replace(/\s+/g, '_')}_dataset`;
switch(format) {
case 'csv':
downloadCsv(filename);
break;
case 'json':
downloadJson(filename);
break;
case 'parquet':
// Show a notification that this format is simulated
showNotification("Parquet format download simulated - actual conversion would require a server component");
downloadJson(filename + "_parquet_simulated");
break;
}
}
function downloadCsv(filename) {
// Convert data to CSV
const csv = Papa.unparse(fullDatasetRows);
// Create a blob and download link
const blob = new Blob([csv], { type: 'text/csv' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `${filename}.csv`;
document.body.appendChild(a);
a.click();
// Clean up
setTimeout(() => {
document.body.removeChild(a);
URL.revokeObjectURL(url);
}, 100);
}
function downloadJson(filename) {
// Convert data to JSON
const json = JSON.stringify(fullDatasetRows, null, 2);
// Create a blob and download link
const blob = new Blob([json], { type: 'application/json' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `${filename}.json`;
document.body.appendChild(a);
a.click();
// Clean up
setTimeout(() => {
document.body.removeChild(a);
URL.revokeObjectURL(url);
}, 100);
}
function showSearchPage() {
searchPage.classList.remove('hidden');
datasetPage.classList.add('hidden');
}
function showError(message) {
console.error(message);
showNotification(message, true);
}
function showNotification(message, isError = false) {
const notification = document.createElement('div');
notification.className = `fixed bottom-4 right-4 px-6 py-3 rounded-lg shadow-lg ${
isError
? 'bg-red-500 text-white'
: 'bg-green-500 text-white'
} z-50 transition-opacity duration-300`;
notification.textContent = message;
document.body.appendChild(notification);
setTimeout(() => {
notification.style.opacity = '0';
setTimeout(() => {
document.body.removeChild(notification);
}, 300);
}, 3000);
}
function showPlaceholderDatasets() {
const placeholders = [
{
name: "NewsEventsPredict",
tags: ["classification", "media", "trend"],
isReal: true,
engine: "AlltheInternet.com"
},
{
name: "FinancialForecast",
tags: ["economy", "stocks", "regression"],
isReal: false
},
{
name: "HealthMonitor",
tags: ["science", "real-time", "anomaly detection"],
isReal: true,
engine: "DuckDuckGo.com"
},
{
name: "SportsAnalysis",
tags: ["classification", "performance", "player tracking"],
isReal: false
},
{
name: "RetailSalesAnalyzer",
tags: ["consumer behavior", "sales trend", "segmentation"],
isReal: true,
engine: "Bing.com"
},
{
name: "SocialMediaSentiment",
tags: ["text classification", "opinion mining", "NLP"],
isReal: false
}
];
currentDatasets = placeholders;
displayDatasets(placeholders);
loadMoreContainer.classList.remove('hidden');
}
</script>
</body>
</html>
"""
# --- Gradio CSS ---
css = """
a { color: var(--body-text-color); }
.datasetButton { justify-content: start; justify-content: left; }
.tags { font-size: var(--button-small-text-size); color: var(--body-text-color-subdued); }
.topButton {
justify-content: start; justify-content: left; text-align: left; background: transparent;
box-shadow: none; padding-bottom: 0;
}
.topButton::before {
content: url("data:image/svg+xml,%3Csvg style='color: rgb(209 213 219)' xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink' aria-hidden='true' focusable='false' role='img' width='1em' height='1em' preserveAspectRatio='xMidYMid meet' viewBox='0 0 25 25'%3E%3Cellipse cx='12.5' cy='5' fill='currentColor' fill-opacity='0.25' rx='7.5' ry='2'%3E%3C/ellipse%3E%3Cpath d='M12.5 15C16.6421 15 20 14.1046 20 13V20C20 21.1046 16.6421 22 12.5 22C8.35786 22 5 21.1046 5 20V13C5 14.1046 8.35786 15 12.5 15Z' fill='currentColor' opacity='0.5'%3E%3C/path%3E%3Cpath d='M12.5 7C16.6421 7 20 6.10457 20 5V11.5C20 12.6046 16.6421 13.5 12.5 13.5C8.35786 13.5 5 12.6046 5 11.5V5C5 6.10457 8.35786 7 12.5 7Z' fill='currentColor' opacity='0.5'%3E%3C/path%3E%3Cpath d='M5.23628 12C5.08204 12.1598 5 12.8273 5 13C5 14.1046 8.35786 15 12.5 15C16.6421 15 20 14.1046 20 13C20 12.8273 19.918 12.1598 19.7637 12C18.9311 12.8626 15.9947 13.5 12.5 13.5C9.0053 13.5 6.06886 12.8626 5.23628 12Z' fill='currentColor'%3E%3C/path%3E%3C/svg%3E");
margin-right: .25rem; margin-left: -.125rem; margin-top: .25rem;
}
.bottomButton {
justify-content: start; justify-content: left; text-align: left; background: transparent;
box-shadow: none; font-size: var(--button-small-text-size); color: var(--body-text-color-subdued);
padding-top: 0; align-items: baseline;
}
.bottomButton::before { content: 'tags:'; margin-right: .25rem; }
.buttonsGroup { background: transparent; }
.buttonsGroup:hover { background: var(--input-background-fill); }
.buttonsGroup div { background: transparent; }
.insivibleButtonGroup { display: none; }
@keyframes placeHolderShimmer { 0%{ background-position: -468px 0 } 100%{ background-position: 468px 0 } }
.linear-background {
animation-duration: 1s; animation-fill-mode: forwards; animation-iteration-count: infinite;
animation-name: placeHolderShimmer; animation-timing-function: linear;
background-image: linear-gradient(to right, var(--body-text-color-subdued) 8%, #dddddd11 18%, var(--body-text-color-subdued) 33%);
background-size: 1000px 104px; color: transparent; background-clip: text;
}
.settings { background: transparent; }
.settings button span { color: var(--body-text-color-subdued); }
"""
# --- Knowledge Base ---
class KnowledgeBase:
"""Manages known entities (materials, colors) and patterns for data refinement."""
def __init__(self):
self.materials: Set[str] = {'Metal', 'Wood', 'Plastic', 'Aluminum', 'Bronze', 'Steel', 'Glass', 'Leather', 'Fabric'}
self.colors: Set[str] = {'Red', 'Black', 'White', 'Silver', 'Bronze', 'Yellow', 'Blue', 'Green', 'Gray', 'Brown'}
self.patterns: Dict[str, List[str]] = {}
self.source_data: Dict[str, Any] = {}
def load_source(self, source_type: str, source_path: str) -> None:
"""Loads data from various sources and extracts knowledge."""
try:
if source_type == 'csv_url':
response = requests.get(source_path, timeout=10)
response.raise_for_status()
df = pd.read_csv(io.StringIO(response.text))
elif source_type == 'xlsx_url':
response = requests.get(source_path, timeout=10)
response.raise_for_status()
df = pd.read_excel(io.BytesIO(response.content))
elif source_type == 'local_csv':
df = pd.read_csv(source_path)
elif source_type == 'local_xlsx':
df = pd.read_excel(source_path)
else:
raise ValueError(f"Unsupported source type: {source_type}")
self._extract_knowledge(df)
self.source_data[source_path] = df.to_dict('records')
except requests.exceptions.RequestException as e:
raise ConnectionError(f"Failed to fetch data from URL: {e}")
except ValueError as e: raise e
except Exception as e:
raise RuntimeError(f"Error loading source {source_path}: {str(e)}")
def _extract_knowledge(self, df: pd.DataFrame) -> None:
"""Extracts known materials, colors, and column patterns."""
for column in df.columns:
if 'material' in column.lower():
values = df[column].dropna().unique()
self.materials.update(v.title() for v in values if isinstance(v, str))
elif 'color' in column.lower():
values = df[column].dropna().unique()
self.colors.update(v.title() for v in values if isinstance(v, str))
if df[column].dtype == 'object': # Store string patterns for fuzzy matching
patterns = df[column].dropna().astype(str).tolist()
self.patterns[column] = patterns
def get_closest_match(self, value: str, field_type: str) -> Optional[str]:
"""Finds the closest known value (material or color) for fuzzy matching."""
known_values = getattr(self, field_type + 's', set())
if not known_values: return None
matches = get_close_matches(value.title(), list(known_values), n=1, cutoff=0.8)
return matches[0] if matches else None
knowledge_base = KnowledgeBase() # Global instance for refinement
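# Illustrative usage (not executed): fuzzy-match noisy values against the known vocabularies.
# knowledge_base.get_closest_match("alumnium", "material")  # -> "Aluminum" (difflib, cutoff 0.8)
# knowledge_base.get_closest_match("turquoise", "color")    # -> None (no sufficiently close known color)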
# --- Data Refinement Utilities ---
def split_compound_field(field: str) -> List[str]:
"""Splits strings like 'Red, Blue' into ['Red', 'Blue']."""
parts = re.split(r'[,;\n]+', field)
return list(set(p.strip().title() for p in parts if p.strip()))
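# Illustrative example (not executed):
# split_compound_field("red, black; metal")  # -> ["Red", "Black", "Metal"] (deduplicated; order may vary
# because the parts are round-tripped through a set)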
def normalize_value(value: Any, field_name: str, mode: str = 'sourceless', kb: Optional[KnowledgeBase] = None) -> Any:
"""Normalizes a single data value based on field name and refinement mode."""
if not isinstance(value, str): return value
value = re.sub(r'\s+', ' ', value.strip()) # Normalize whitespace
value = value.replace('_', ' ') # Replace underscores
# Field-specific normalization logic
if any(term in field_name.lower() for term in ['material']):
parts = split_compound_field(value)
if mode == 'sourced' and kb:
known = [kb.get_closest_match(p, 'material') or p.title() for p in parts]
else:
known = [m for m in parts if m in kb.materials] if kb else parts
return known[0] if len(known) == 1 else known
elif any(term in field_name.lower() for term in ['color']):
parts = split_compound_field(value)
if mode == 'sourced' and kb:
known = [kb.get_closest_match(p, 'color') or p.title() for p in parts]
else:
known = [c for c in parts if c in kb.colors] if kb else parts
return known[0] if len(known) == 1 else known
elif any(term in field_name.lower() for term in ['date', 'time']): return value # Placeholder
elif any(term in field_name.lower() for term in ['type', 'status', 'category', 'description']):
return value.title() # Title case for descriptive fields
return value
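# Illustrative examples (not executed), assuming the default knowledge_base defined above:
# normalize_value("steel, glass", "primary_material", kb=knowledge_base)  # -> ["Steel", "Glass"] (order may vary)
# normalize_value("pending approval", "status")                           # -> "Pending Approval" (title-cased)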
def clean_record(record: Dict[str, Any], mode: str = 'sourceless', kb: Optional[KnowledgeBase] = None) -> Dict[str, Any]:
"""Cleans and normalizes a single record, handling nesting and compound fields."""
cleaned = {}
compound_fields_to_split = {}
# Pass 1: Normalize values and identify compound fields
for key, value in record.items():
clean_key = key.strip().lower().replace(" ", "_")
if isinstance(value, str): # Detect potential compound fields
for material in knowledge_base.materials:
if material.lower() in value.lower():
compound_fields_to_split[clean_key] = value
break
# Recursively clean nested structures
if isinstance(value, list):
cleaned[clean_key] = [normalize_value(v, clean_key, mode, kb) for v in value]
elif isinstance(value, dict):
cleaned[clean_key] = clean_record(value, mode, kb)
else:
cleaned[clean_key] = normalize_value(value, clean_key, mode, kb)
# Pass 2: Split identified compound fields
for key, value in compound_fields_to_split.items():
parts = split_compound_field(value)
materials = [p for p in parts if p in knowledge_base.materials]
if materials:
cleaned['material'] = materials[0] if len(materials) == 1 else materials
remaining = [p for p in parts if p not in materials]
if remaining: cleaned['condition'] = ' '.join(remaining)
elif key not in cleaned: # If not processed and no known materials found
cleaned[key] = value
return cleaned
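# Illustrative example (not executed), using the module-level knowledge_base:
# clean_record({"Item Material": "Steel, scratched"}, kb=knowledge_base)
#   # roughly -> {"item_material": "Steel", "material": "Steel", "condition": "Scratched"}
# (keys are lowercased and underscored; compound values are split into known materials plus a 'condition' remainder)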
def refine_data_generic(dataset: List[Dict[str, Any]], mode: str = 'sourceless', kb: Optional[KnowledgeBase] = None) -> List[Dict[str, Any]]:
"""Applies generic data refinement to a list of records, with optional knowledge base guidance."""
if mode == 'sourced' and kb and kb.patterns: # Apply fuzzy matching if sourced
for record in dataset:
for field, patterns in kb.patterns.items():
if field in record and isinstance(record[field], str):
value = str(record[field])
matches = get_close_matches(value, patterns, n=1, cutoff=0.8)
if matches: record[field] = matches[0]
return [clean_record(entry, mode, kb) for entry in dataset]
def refine_preview_data(df: pd.DataFrame, mode: str = 'sourceless') -> pd.DataFrame:
"""Refines the preview DataFrame based on the selected mode."""
# Remove common auto-generated index columns
cols_to_drop = []
for col_name, values in df.to_dict(orient="series").items():
try:
            if all(isinstance(v, int) and v == i for i, v in enumerate(values)): cols_to_drop.append(col_name)
            elif all(isinstance(v, int) and v == i + 1 for i, v in enumerate(values)): cols_to_drop.append(col_name)
except Exception: pass # Ignore non-sequential columns
if cols_to_drop: df = df.drop(columns=cols_to_drop)
records = df.to_dict('records')
refined_records = refine_data_generic(records, mode=mode, kb=knowledge_base)
return pd.DataFrame(refined_records)
def detect_anomalies(record: Dict[str, Any]) -> List[str]:
"""Detects potential data quality issues (e.g., verbosity, missing values)."""
flags = []
for k, v in record.items():
if isinstance(v, str):
if len(v) > 300: flags.append(f"{k}: Too verbose.")
if v.lower() in ['n/a', 'none', 'undefined', 'null', '']: flags.append(f"{k}: Missing value.")
return flags
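# Illustrative example (not executed):
# detect_anomalies({"text": "A short review.", "label": "n/a"})  # -> ["label: Missing value."]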
def parse_preview_df(content: str) -> tuple[str, pd.DataFrame]:
"""Extracts CSV from response, parses, refines, and adds quality flags."""
csv_lines = []
in_csv_block = False
for line in content.split("\n"): # Extract lines within CSV code blocks
if line.strip().startswith("```") and not in_csv_block: in_csv_block = True; continue # Opening fence
if line.strip().startswith("```") and in_csv_block: in_csv_block = False; continue # Closing fence
if in_csv_block: csv_lines.append(line)
csv_content = "\n".join(csv_lines)
if not csv_content: raise ValueError("No CSV content found.")
csv_header = csv_content.split("\n")[0] if csv_content else ""
df = parse_csv_df(csv_content)
refined_df = refine_preview_data(df, mode='sourceless') # Initial refinement
# Add quality flags
refined_records = refined_df.to_dict('records')
for record in refined_records:
flags = detect_anomalies(record)
if flags: record['_quality_flags'] = flags
return csv_header, pd.DataFrame(refined_records)
def parse_csv_df(csv: str, csv_header: Optional[str] = None) -> pd.DataFrame:
"""Safely parses CSV data using pandas with error handling and common fixes."""
csv = re.sub(r'''(?!")$$(["'][\w\s]+["'][, ]*)+$$(?!")''', lambda m: '"' + m.group(0).replace('"', "'") + '"', csv) # Fix unquoted lists
if csv_header and csv.strip() and not csv.strip().startswith(csv_header.split(',')[0]): csv = csv_header + "\n" + csv # Prepend header if missing
try: return pd.read_csv(io.StringIO(csv), skipinitialspace=True)
except Exception as e: raise ValueError(f"Pandas CSV parsing error: {e}")
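# Illustrative example for parse_csv_df: a payload missing its header is repaired before parsing.
#   parse_csv_df("1,positive\n2,negative", csv_header="id,label")
#   -> DataFrame with columns ['id', 'label'] and two rows.
# The regex also wraps unquoted bracketed lists such as ['a', 'b'] in double quotes
# so pandas treats each list as a single field.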
# --- LLM Interaction Utilities ---
T = TypeVar("T")
def batched(it: Iterable[T], n: int) -> Iterator[list[T]]:
"""Yields chunks of size n from an iterable."""
it = iter(it)
while batch := list(islice(it, n)): yield batch
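# Illustrative example: batched(range(5), 2) yields [0, 1], [2, 3], then [4].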
def stream_response(msg: str, history: list[Dict[str, str]] = [], max_tokens=500) -> Iterator[str]:
"""Streams responses from the LLM client with retry logic."""
messages = [{"role": m["role"], "content": m["content"]} for m in history]
messages.append({"role": "user", "content": msg})
for attempt in range(3): # Retry mechanism
try:
for chunk in client.chat_completion(messages=messages, max_tokens=max_tokens, stream=True, top_p=0.8, seed=42):
content = chunk.choices[0].delta.content
if content: yield content
break # Success
except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
print(f"LLM connection error (attempt {attempt+1}): {e}. Retrying in {2**attempt}s...")
time.sleep(2**attempt)
except Exception as e:
print(f"Unexpected LLM error (attempt {attempt+1}): {e}. Retrying...")
time.sleep(2**attempt)
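# Illustrative usage (assumes the inference endpoint is reachable):
#   reply = "".join(stream_response("Suggest a dataset name about solar power"))
# Tokens are yielded as they arrive; connection errors trigger up to three retries
# with exponential backoff, after which the generator simply ends.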
def generate_dataset_names(search_query: str, history: list[Dict[str, str]], is_real_data: bool = False, engine: Optional[str] = None) -> Iterator[str]:
"""Generates dataset names based on a search query using the LLM."""
query = search_query[:1000] if search_query else ""
if is_real_data and engine:
prompt = (
f"@Claude-3.7-Sonnet You are a data specialist who can transform real search results into structured datasets. "
f"A user is searching for data about: \"{query}\" "
f"Imagine you've queried {engine} and received real search results. Create a list of {MAX_NB_ITEMS_PER_GENERATION_CALL} specific datasets that could be created from these search results. "
f"For each dataset: 1. Give it a clear, specific name related to the search topic. 2. Include 3-5 relevant tags in parentheses, with one tag specifying the ML task type (classification, regression, clustering, etc.). "
f"Format each dataset as: 1. DatasetName (tag1, tag2, ml_task_tag). Make these datasets sound like real collections that could be created from {engine} search results on \"{query}\"."
)
else:
prompt = GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY.format(search_query=query)
full_response = ""
for token in stream_response(prompt, history):
full_response += token
yield token # Yield tokens for real-time display
print(f"Generated dataset names for query '{search_query}'.")
history.append({"role": "assistant", "content": full_response}) # Update history
# No return needed as history is modified in place
def generate_dataset_content(search_query: str, dataset_name: str, tags: str, history: list[Dict[str, str]], is_real_data: bool = False, engine: Optional[str] = None) -> Iterator[str]:
"""Generates the description and CSV preview for a dataset."""
query = search_query[:1000] if search_query else ""
if is_real_data and engine:
prompt = (
f"@Claude-3.7-Sonnet You're a specialist in converting web search results into structured data. "
f"Based on search results from {engine} about \"{query}\", create a preview of the dataset \"{dataset_name}\" with tags \"{tags}\". "
f"First, write a detailed description of what this dataset contains, its structure, and how it was constructed from web search results. "
f"Then, generate a realistic 5-row CSV preview that resembles data you might get if you scraped and structured real results from {engine}. "
f"Format your response with: **Dataset Description:** [detailed description] **CSV Content Preview:** ```csv [CSV header and 5 rows of realistic data] ``` "
f"Include relevant columns for the dataset type, with proper labels/categories where appropriate. The data should look like it came from real sources."
)
else:
prompt = GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(
search_query=query, dataset_name=dataset_name, tags=tags
)
full_response = ""
for token in stream_response(prompt, history):
full_response += token
yield token
print(f"Generated content for dataset '{dataset_name}'.")
history.append({"role": "assistant", "content": full_response}) # Update history
def _write_generator_to_queue(queue: Queue, func: Callable, kwargs: dict) -> None:
"""Helper to run a generator and put results (or errors) into a queue."""
try:
for i, result in enumerate(func(**kwargs)): queue.put((i, result))
except Exception as e: queue.put((-1, str(e))) # Signal error with index -1
finally: queue.put(None) # Signal completion
def iflatmap_unordered(func: Callable, kwargs_iterable: Iterable[dict]) -> Iterable[Any]:
"""Runs generator functions concurrently and yields results as they complete."""
queue = Queue()
kwargs_list = list(kwargs_iterable) # Materialize so the iterable can be sized and iterated safely
pool_size = max(1, min(len(kwargs_list), os.cpu_count() or 4)) # Avoid ThreadPool(0) when there are no tasks
with ThreadPool(pool_size) as pool:
async_results = [pool.apply_async(_write_generator_to_queue, (queue, func, kwargs)) for kwargs in kwargs_list]
completed_generators = 0
while completed_generators < len(async_results):
try:
result = queue.get(timeout=0.1)
if result is None: # Generator finished
completed_generators += 1
continue
index, data = result
if index == -1: # Error occurred
print(f"Generator error: {data}")
continue # Skip this result
yield data # Yield successful result
except Empty: # Timeout occurred, check if all threads are done
if all(res.ready() for res in async_results) and queue.empty(): break
for res in async_results: res.get(timeout=0.1) # Ensure threads finish and raise exceptions
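# Illustrative usage: run several generators concurrently and consume their items
# as they complete (results from different generators interleave in arbitrary order).
#   def count(n):
#       yield from range(n)
#   list(iflatmap_unordered(count, [{"n": 2}, {"n": 3}]))  # e.g. [0, 0, 1, 1, 2]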
def generate_partial_dataset(
title: str, content: str, search_query: str, variant: str, csv_header: str,
output: list[Optional[dict]], indices_to_generate: list[int], history: list[Dict[str, str]],
is_real_data: bool = False, engine: Optional[str] = None
) -> Iterator[int]:
"""Generates a batch of dataset rows for a specific variant."""
dataset_name = title.split("\n")[0].strip("# ")
tags = title.split("tags:", 1)[1].strip() if "tags:" in title else ""
prompt = GENERATE_MORE_ROWS.format(csv_header=csv_header) + " " + variant
# Construct initial messages for context
initial_prompt = ""
if is_real_data and engine:
initial_prompt = (
f"@Claude-3.7-Sonnet You're a specialist in converting web search results into structured data. "
f"Based on search results from {engine} about \"{search_query}\", create a preview of the dataset \"{dataset_name}\" with tags \"{tags}\". "
f"First, write a detailed description of what this dataset contains, its structure, and how it was constructed from web search results. "
f"Then, generate a realistic 5-row CSV preview that resembles data you might get if you scraped and structured real results from {engine}. "
f"Format your response with: **Dataset Description:** [detailed description] **CSV Content Preview:** ```csv [CSV header and 5 rows of realistic data] ``` "
f"Include relevant columns for the dataset type, with proper labels/categories where appropriate. The data should look like it came from real sources."
)
else:
initial_prompt = GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(
search_query=search_query, dataset_name=dataset_name, tags=tags
)
messages = [
{"role": "user", "content": initial_prompt},
{"role": "assistant", "content": title + "\n\n" + content},
{"role": "user", "content": prompt},
]
generated_samples = 0
current_csv_chunk = ""
in_csv_block = False
for attempt in range(3): # Retry logic
try:
for chunk in client.chat_completion(messages=messages, max_tokens=1500, stream=True, top_p=0.8, seed=42):
token = chunk.choices[0].delta.content
if not token: continue
current_csv_chunk += token
# Detect CSV block start/end (a fence toggles the block state)
if token.strip().startswith("```") and not in_csv_block: # Opening fence
in_csv_block = True
continue
if token.strip().startswith("```") and in_csv_block: # Closing fence
in_csv_block = False
if current_csv_chunk.strip(): # Process accumulated chunk if block just ended
try:
temp_df = parse_csv_df(current_csv_chunk.strip(), csv_header=csv_header)
new_rows = temp_df.iloc[generated_samples:].to_dict('records')
for i, record in enumerate(new_rows):
if generated_samples >= len(indices_to_generate): break
refined_record = refine_data_generic([record])[0]
flags = detect_anomalies(refined_record)
if flags: refined_record['_quality_flags'] = flags
output_index = indices_to_generate[generated_samples]
if output_index < len(output):
output[output_index] = refined_record
generated_samples += 1
yield 1 # Signal progress
except ValueError as e: print(f"CSV parsing error: {e}")
except Exception as e: print(f"CSV chunk processing error: {e}")
finally: current_csv_chunk = "" # Reset chunk
continue
if in_csv_block: # Process incrementally if inside CSV block
try:
temp_df = parse_csv_df(current_csv_chunk.strip(), csv_header=csv_header)
new_rows = temp_df.iloc[generated_samples:].to_dict('records')
for i, record in enumerate(new_rows):
if generated_samples >= len(indices_to_generate): break
refined_record = refine_data_generic([record])[0]
flags = detect_anomalies(refined_record)
if flags: refined_record['_quality_flags'] = flags
output_index = indices_to_generate[generated_samples]
if output_index < len(output):
output[output_index] = refined_record
generated_samples += 1
yield 1
except ValueError: pass # CSV not complete
except Exception as e: print(f"Incremental CSV processing error: {e}")
if generated_samples >= len(indices_to_generate): break # Target reached
print(f"Retrying generation for variant '{variant}' (attempt {attempt+1})...")
time.sleep(2**attempt)
except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
print(f"Connection error (attempt {attempt+1}): {e}. Retrying...")
time.sleep(2**attempt)
except Exception as e:
print(f"Unexpected error (attempt {attempt+1}): {e}. Retrying...")
time.sleep(2**attempt)
def generate_variants(preview_df: pd.DataFrame) -> Iterator[str]:
"""Generates diverse prompts for creating dataset variants."""
label_cols = [col for col in preview_df.columns if "label" in col.lower()]
labels = preview_df[label_cols[0]].unique() if label_cols and len(preview_df[label_cols[0]].unique()) > 1 else []
if labels: # Prioritize label-based generation
rarities = ["pretty obvious", "common/regular", "unexpected but useful", "uncommon but still plausible", "rare/niche but still plausible"]
for rarity in rarities:
for label in labels: yield GENERATE_VARIANTS_WITH_RARITY_AND_LABEL.format(rarity=rarity, label=label)
else: # Fallback to general rarity prompts
rarities = ["obvious", "expected", "common", "regular", "unexpected but useful", "original but useful", "specific but not far-fetched", "uncommon but still plausible", "rare but still plausible", "very niche but still plausible"]
for rarity in rarities: yield GENERATE_VARIANTS_WITH_RARITY.format(rarity=rarity)
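# Illustrative example: for a preview whose 'label' column contains {'spam', 'ham'},
# the first yielded prompt is of the form
# "Focus on generating samples for the label 'spam' and ideally generate pretty obvious samples."
# Without a label column, only the generic rarity prompts are yielded.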
# --- Gradio Interface ---
def whoami(token: str) -> Dict[str, Any]:
"""Fetches user information from Hugging Face Hub API."""
try:
response = requests.get("https://huggingface.co/api/users/me", headers={"Authorization": f"Bearer {token}"}, timeout=5)
response.raise_for_status()
return response.json()
except (requests.exceptions.RequestException, ValueError) as e:
print(f"Error fetching user info: {e}")
return {"name": "User", "orgs": []}
def get_repo_visibility(repo_id: str, token: str) -> str:
"""Determines if a Hugging Face repository is public or private."""
try:
response = requests.get(f"https://huggingface.co/api/repos/{repo_id}", headers={"Authorization": f"Bearer {token}"}, timeout=5)
response.raise_for_status()
return "public" if not response.json().get("private", False) else "private"
except HfHubHTTPError as e:
if e.response.status_code == 404: return "public" # Assume public if repo doesn't exist
print(f"Error checking repo visibility for {repo_id}: {e}")
return "public"
except Exception as e:
print(f"Unexpected error checking repo visibility for {repo_id}: {e}")
return "public"
with gr.Blocks(css=css) as demo:
generated_texts_state = gr.State((landing_page_datasets_generated_text,)) # State for generated dataset names
current_dataset_state = gr.State(None) # State to hold current dataset details for generation
is_real_data_state = gr.State(True) # State to track if real data is being used
current_engine_state = gr.State(None) # State to track the current search engine
selected_engines_state = gr.State(["DuckDuckGo.com", "Bing.com", "Search.Yahoo.com", "Search.Brave.com", "Ecosia.org"]) # Default selected engines
searchEngines = ["AlltheInternet.com", "DuckDuckGo.com", "Google.com", "Bing.com", "Search.Yahoo.com", "Startpage.com", "Qwant.com", "Ecosia.org", "WolframAlpha.com", "Mojeek.co.uk", "Search.Brave.com", "Yandex.com", "Baidu.com", "Gibiru.com", "MetaGer.org", "Swisscows.com", "Presearch.com", "Ekoru.org", "Search.Lilo.org"]
# --- Search Page UI ---
with gr.Column(visible=True, elem_id="search-page") as search_page:
gr.Markdown("# 🤗 Infinite Dataset Hub ♾️\n\nAn endless catalog of datasets, created just for you by an AI model.")
with gr.Row():
search_bar = gr.Textbox(max_lines=1, placeholder="Search datasets, get infinite results", show_label=False, container=False, scale=9)
search_button = gr.Button("🔍", variant="primary", scale=1)
button_groups: list[gr.Group] = [] # Holds the groups for dataset buttons
buttons: list[gr.Button] = [] # Holds the actual dataset name and tag buttons
for i in range(MAX_TOTAL_NB_ITEMS):
if i < len(default_output): # Use default datasets initially
line = default_output[i]
try: dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" (", 1)
except ValueError: dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" ", 1)[0], ""
group_classes, name_classes, tag_classes = "buttonsGroup", "topButton", "bottomButton"
else: # Placeholders for future datasets
dataset_name, tags = "⬜⬜⬜⬜⬜⬜", "░░░░, ░░░░, ░░░░"
group_classes, name_classes, tag_classes = "buttonsGroup insivibleButtonGroup", "topButton linear-background", "bottomButton linear-background"
with gr.Group(elem_classes=group_classes) as button_group:
button_groups.append(button_group)
dataset_btn = gr.Button(dataset_name, elem_classes=name_classes)
tags_btn = gr.Button(tags, elem_classes=tag_classes)
buttons.append(dataset_btn)
buttons.append(tags_btn)
load_more_datasets = gr.Button("Load more datasets")
gr.Markdown(f"_powered by [{model_id}](https://huggingface.co/{model_id})_")
# --- Settings Panel ---
with gr.Column(scale=4, min_width=200):
with gr.Accordion("Settings", open=False, elem_classes="settings"):
gr.Markdown("Manage your Hugging Face account and dataset saving options.")
gr.LoginButton()
select_namespace_dropdown = gr.Dropdown(choices=[NAMESPACE], value=NAMESPACE, label="Hugging Face Namespace", visible=False)
gr.Markdown("Dataset Generation Mode")
refinement_mode = gr.Radio(
["sourceless", "sourced"], value="sourceless", label="Refinement Mode",
info="Sourceless: AI generates data freely. Sourced: AI uses loaded data for context and refinement."
)
with gr.Group(visible=False) as source_group: # Dynamic section for source loading
source_type = gr.Dropdown(
choices=["csv_url", "xlsx_url", "local_csv", "local_xlsx"], value="csv_url",
label="Source Type", info="Select the format of your data source."
)
source_path = gr.Textbox(
label="Source Path/URL", placeholder="Enter URL or local file path",
info="Provide the location of your dataset file."
)
load_source_button = gr.Button("Load Source Data", icon="https://huggingface.co/datasets/huggingface/badges/resolve/main/badge-files/data.svg")
source_status = gr.Markdown("", visible=False)
visibility_radio = gr.Radio(
["public", "private"], value="public", container=False, interactive=False,
label="Dataset Visibility", info="Set visibility for datasets saved to Hugging Face Hub."
)
# Search Engine Settings
gr.Markdown("Search Engine Configuration")
data_source_toggle = gr.Checkbox(label="Use Real Search Data", value=True, info="Toggle to include results from real search engines.")
engine_settings_button = gr.Button("Configure Search Engines", icon="https://img.icons8.com/ios-filled/50/000000/settings--v1.png", size="sm")
# Engine Selection Modal
with gr.Modal("Search Engine Settings", id="engine-modal") as engine_modal:
gr.Markdown("Select which search engines to use for real data retrieval. A diverse selection improves results.")
engine_options_html_comp = gr.HTML(elem_id="engine-options")
with gr.Row():
select_all_engines_btn = gr.Button("Select All")
deselect_all_engines_btn = gr.Button("Deselect All")
save_engines_btn = gr.Button("Save Settings", variant="primary")
# --- Dataset Detail Page UI ---
with gr.Column(visible=False, elem_id="dataset-page") as dataset_page:
gr.Markdown("# 🤗 Infinite Dataset Hub ♾️\n\nAn endless catalog of datasets, created just for you.")
dataset_title_md = gr.Markdown() # Dataset name and tags
dataset_source_badge = gr.Markdown() # Badge indicating real/AI data
dataset_source_info = gr.Markdown() # Details about the data source
dataset_description_md = gr.Markdown() # Dataset description
preview_table_comp = gr.DataFrame(visible=False, interactive=False, wrap=True) # Holds the preview CSV
with gr.Row():
generate_full_dataset_button = gr.Button("Generate Full Dataset", variant="primary")
save_dataset_button = gr.Button("💾 Save Dataset", variant="primary", visible=False)
open_dataset_message = gr.Markdown("", visible=False) # Confirmation message
dataset_share_button = gr.Button("Share Dataset URL")
dataset_share_textbox = gr.Textbox(visible=False, show_copy_button=True, label="Copy this URL:", interactive=False, show_label=True)
full_dataset_section = gr.Column(visible=False) # Container for full dataset and downloads
full_table_comp = gr.DataFrame(visible=False, interactive=False, wrap=True)
with gr.Row():
download_csv_button = gr.Button("Download CSV")
download_json_button = gr.Button("Download JSON")
download_parquet_button = gr.Button("Download Parquet")
back_button = gr.Button("< Back", size="sm")
# --- Event Handlers ---
# Search Logic
def _update_search_results(search_query: str, current_generated_texts: tuple[str], is_real_data: bool, engine: Optional[str]):
"""Handles dataset search and UI updates."""
# Reset UI to loading state
yield {btn: gr.Button("⬜⬜⬜⬜⬜⬜", elem_classes="topButton linear-background") for btn in buttons[::2]}
yield {btn: gr.Button("░░░░, ░░░░, ░░░░", elem_classes="bottomButton linear-background") for btn in buttons[1::2]}
yield {group: gr.Group(elem_classes="buttonsGroup insivibleButtonGroup") for group in button_groups}
generated_count = 0
new_texts = ""
try:
# Generate dataset names from the LLM, buffering streamed tokens into whole lines
line_buffer = ""
for token in generate_dataset_names(search_query, [], is_real_data=is_real_data, engine=engine):
if "I'm sorry" in token or "policy" in token: raise gr.Error("Inappropriate content detected.")
line_buffer += token
while "\n" in line_buffer and generated_count < MAX_NB_ITEMS_PER_GENERATION_CALL:
line, line_buffer = line_buffer.split("\n", 1)
match = re.match(r"^\s*\d+\.\s+(.+?)\s+\((.+?)\)", line) # Parse "1. DatasetName (tag1, tag2)" lines
if not match: continue
dataset_name, tags = (part.strip() for part in match.groups())
new_texts += line + "\n"
# Update buttons with generated data
yield {
buttons[2 * generated_count]: gr.Button(dataset_name, elem_classes="topButton"),
buttons[2 * generated_count + 1]: gr.Button(tags, elem_classes="bottomButton"),
}
generated_count += 1
if generated_count >= MAX_NB_ITEMS_PER_GENERATION_CALL: break
# Update state and make new buttons visible
new_history = (current_generated_texts + (new_texts,)) if current_generated_texts else (landing_page_datasets_generated_text + "\n" + new_texts,)
yield {generated_texts_state: new_history}
yield {group: gr.Group(elem_classes="buttonsGroup") for group in button_groups[:generated_count]}
except gr.Error as e: raise e # Propagate Gradio errors
except Exception as e: raise gr.Error(f"Failed to generate datasets: {str(e)}")
# Attach search handlers
search_button.click(
_update_search_results,
inputs=[search_bar, generated_texts_state, is_real_data_state, current_engine_state],
outputs=buttons + [generated_texts_state] + button_groups
)
search_bar.submit(
_update_search_results,
inputs=[search_bar, generated_texts_state, is_real_data_state, current_engine_state],
outputs=buttons + [generated_texts_state] + button_groups
)
# Load More Datasets
load_more_datasets.click(
_update_search_results,
inputs=[search_bar, generated_texts_state, is_real_data_state, current_engine_state],
outputs=buttons + [generated_texts_state] + button_groups
)
# Display Single Dataset Details
def _show_dataset_details(search_query, dataset_name, tags, is_real_data, engine):
"""Switches to detail view and loads dataset content."""
yield {
search_page: gr.Column(visible=False), dataset_page: gr.Column(visible=True),
dataset_title_md: f"# {dataset_name}\n\n tags: {tags}",
dataset_share_textbox: gr.Textbox(visible=False),
full_dataset_section: gr.Column(visible=False),
save_dataset_button: gr.Button(visible=False),
open_dataset_message: gr.Markdown("", visible=False)
}
# Update source badge and info
if is_real_data:
badge_html = gr.Markdown(f'<span class="px-3 py-1 rounded-full text-xs font-medium bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-200">Real Data</span>', visible=True)
info_html = gr.Markdown(f'This dataset is based on real information queried from <strong>{engine}</strong> for the search term "<strong>{search_query}</strong>". The data has been structured for machine learning use.', visible=True)
else:
badge_html = gr.Markdown('<span class="px-3 py-1 rounded-full text-xs font-medium bg-purple-100 text-purple-800 dark:bg-purple-900 dark:text-purple-200">AI-Generated</span>', visible=True)
info_html = gr.Markdown(f'This is an AI-generated dataset created using {model_id}. The content is synthetic and designed to represent plausible data related to "{search_query}".', visible=True)
yield {dataset_source_badge: badge_html, dataset_source_info: info_html}
# Stream content generation, accumulating tokens so the Markdown shows the full text so far
full_content = ""
for content_chunk in generate_dataset_content(search_query, dataset_name, tags, [], is_real_data=is_real_data, engine=engine):
full_content += content_chunk
yield {dataset_description_md: full_content}
# Link buttons to the detail view function
def _show_dataset_from_button_wrapper(dataset_index, search_query, is_real_data, engine, *buttons_values):
"""Opens the detail view for the dataset whose buttons sit at the given index."""
dataset_name, tags = buttons_values[2 * dataset_index], buttons_values[2 * dataset_index + 1]
yield from _show_dataset_details(search_query, dataset_name, tags, is_real_data, engine if is_real_data else None)
# Wire up click events for all dataset name and tag buttons, binding each pair's index
for i, (name_btn, tag_btn) in enumerate(batched(buttons, 2)):
for btn in (name_btn, tag_btn):
btn.click(
partial(_show_dataset_from_button_wrapper, i),
inputs=[search_bar, is_real_data_state, current_engine_state, *buttons],
outputs=[search_page, dataset_page, dataset_title_md, dataset_description_md, dataset_source_badge, dataset_source_info, dataset_share_textbox, full_dataset_section, save_dataset_button, open_dataset_message]
)
# Back Button Navigation
back_button.click(lambda: (gr.Column(visible=True), gr.Column(visible=False)), outputs=[search_page, dataset_page], js="""
function() {
if ('parentIFrame' in window) { window.parentIFrame.scrollTo({top: 0, behavior:'smooth'}); }
else { window.scrollTo({ top: 0, behavior: 'smooth' }); }
return Array.from(arguments);
}
""")
# Full Dataset Generation
@generate_full_dataset_button.click(
inputs=[dataset_title_md, dataset_description_md, search_bar, select_namespace_dropdown, visibility_radio, refinement_mode, is_real_data_state, current_engine_state],
outputs=[full_table_comp, generate_full_dataset_button, save_dataset_button, full_dataset_section]
)
def _generate_full_dataset(title_md, content_md, search_query, namespace, visibility, mode, is_real_data, engine):
# Extract dataset name and tags from the markdown title
try:
dataset_name = title_md.split('\n')[0].strip('# ')
tags = title_md.split('tags:', 1)[1].strip()
except IndexError:
raise gr.Error("Could not parse dataset title.")
try: csv_header, preview_df = parse_preview_df(content_md)
except ValueError as e: raise gr.Error(f"Failed to parse preview: {e}")
refined_preview_df = refine_preview_data(preview_df, mode)
columns = list(refined_preview_df)
output_data: list[Optional[dict]] = [None] * NUM_ROWS # Initialize output structure
initial_rows = refined_preview_df.to_dict('records')
for i, record in enumerate(initial_rows):
if i < NUM_ROWS: output_data[i] = {"idx": i, **record}
# Update UI: show preview, disable generate, show save button
yield {
full_table_comp: gr.DataFrame(pd.DataFrame([r for r in output_data if r]), visible=True),
generate_full_dataset_button: gr.Button(interactive=False),
save_dataset_button: gr.Button(f"💾 Save {namespace}/{dataset_name}" + (" (private)" if visibility != "public" else ""), visible=True, interactive=False),
full_dataset_section: gr.Column(visible=True)
}
# Prepare generation tasks for variants
generation_tasks = []
variants = islice(generate_variants(refined_preview_df), NUM_VARIANTS)
for i, variant in enumerate(variants):
indices = list(range(len(initial_rows) + i, NUM_ROWS, NUM_VARIANTS))
if indices: # Only create task if there are rows to generate
generation_tasks.append({
"func": generate_partial_dataset,
"kwargs": {
"title": title_md, "content": content_md, "search_query": search_query, "variant": variant,
"csv_header": csv_header, "output": output_data, "indices_to_generate": indices,
"history": [], # Use fresh history for each variant task
"is_real_data": is_real_data, "engine": engine
}
})
# Execute tasks in parallel and update UI progressively
for _ in iflatmap_unordered(lambda func, kwargs: func(**kwargs), generation_tasks): # Dispatch each task's generator with its own kwargs
yield {full_table_comp: pd.DataFrame([r for r in output_data if r])} # Update DataFrame display
yield {save_dataset_button: gr.Button(interactive=True)} # Enable save button
print(f"Full dataset generation complete for {dataset_name}.")
# Save Dataset to Hugging Face Hub
@save_dataset_button.click(
inputs=[dataset_title_md, dataset_description_md, search_bar, full_table_comp, select_namespace_dropdown, visibility_radio],
outputs=[save_dataset_button, open_dataset_message]
)
def _save_dataset(title_md, content_md, search_query, df, namespace, visibility, oauth_token: Optional[gr.OAuthToken] = None):
# Extract dataset name and tags from the markdown title
try:
dataset_name = title_md.split('\n')[0].strip('# ')
tags = title_md.split('tags:', 1)[1].strip()
except IndexError:
raise gr.Error("Could not parse dataset title.")
token = oauth_token.token if oauth_token else save_dataset_hf_token
if not token: raise gr.Error("Login required or set SAVE_DATASET_HF_TOKEN.")
repo_id = f"{namespace}/{dataset_name}"
dataset_url_params = f"q={search_query.replace(' ', '+')}&dataset={dataset_name.replace(' ', '+')}&tags={tags.replace(' ', '+')}"
dataset_url = f"{URL}?{dataset_url_params}"
gr.Info("Saving dataset...")
yield {save_dataset_button: gr.Button(interactive=False)} # Disable button during save
try:
create_repo(repo_id=repo_id, repo_type="dataset", private=visibility!="public", exist_ok=True, token=token)
df.to_csv(f"hf://datasets/{repo_id}/data.csv", storage_options={"token": token}, index=False)
card_content = DATASET_CARD_CONTENT.format(title=title_md, content=content_md, url=URL, dataset_url=dataset_url, model_id=model_id, search_query=search_query)
DatasetCard(card_content).push_to_hub(repo_id=repo_id, repo_type="dataset", token=token)
success_msg = f"# 🎉 Yay! Dataset saved to [{repo_id}](https://huggingface.co/datasets/{repo_id})!\n\n_PS: Check Settings to manage your saved datasets._"
gr.Info("Dataset saved successfully.")
yield {open_dataset_message: gr.Markdown(success_msg, visible=True)}
except HfHubHTTPError as e: raise gr.Error(f"HF Hub error: {e.message}")
except Exception as e: raise gr.Error(f"Save failed: {str(e)}")
finally: yield {save_dataset_button: gr.Button(interactive=True)} # Re-enable button
# Shareable URL Generation
@dataset_share_button.click(inputs=[dataset_title_md, search_bar], outputs=[dataset_share_textbox])
def _show_share_url(title_md, search_query):
try:
dataset_name = title_md.split('\n')[0].strip('# ')
tags = title_md.split('tags:', 1)[1].strip()
except IndexError:
raise gr.Error("Could not parse dataset title.")
share_url = f"{URL}?q={search_query.replace(' ', '+')}&dataset={dataset_name.replace(' ', '+')}&tags={tags.replace(' ', '+')}"
return gr.Textbox(share_url, visible=True)
# Settings Toggles
refinement_mode.change(lambda mode: gr.Group(visible=(mode == "sourced")), inputs=[refinement_mode], outputs=[source_group])
data_source_toggle.change(
lambda use_real, engines: (use_real, (engines[0] if engines else None) if use_real else None), # gr.State outputs take raw values
inputs=[data_source_toggle, selected_engines_state],
outputs=[is_real_data_state, current_engine_state]
)
@load_source_button.click(inputs=[source_type, source_path], outputs=[source_status])
def _load_source_data(source_type, source_path):
if not source_path: raise gr.Error("Source path/URL is required.")
try:
knowledge_base.load_source(source_type, source_path)
gr.Info("Source data loaded.")
return gr.Markdown("✅ Source loaded successfully", visible=True)
except (ConnectionError, ValueError, RuntimeError) as e:
raise gr.Error(f"Failed to load source: {str(e)}")
# Engine Settings Modal Logic
def _populate_engine_options(selected_engines):
engine_options_html = ""
for engine in searchEngines:
is_checked = "checked" if engine in selected_engines else ""
engine_options_html += f"""
<div class="flex items-center">
<input type="checkbox" id="engine-{engine.replace('.', '_')}" class="engine-checkbox mr-2 h-4 w-4" value="{engine}" {is_checked}>
<label for="engine-{engine.replace('.', '_')}" class="cursor-pointer">{engine}</label>
</div>
"""
return engine_options_html # Return the raw HTML string so it works both as an initial value and as an event update
def _save_engine_settings(selected_engines_json):
selected_engines = json.loads(selected_engines_json) if selected_engines_json else []
if not selected_engines:
gr.Warning("At least one search engine must be selected. Using DuckDuckGo as default.")
selected_engines = ["DuckDuckGo.com"]
gr.Info(f"Updated search engines. Using {len(selected_engines)} engine(s).")
return selected_engines, selected_engines[0] # Raw values for the selected-engines and current-engine states
# Refresh the engine options when the modal is opened or when (de)select-all is clicked
engine_settings_button.click(_populate_engine_options, inputs=[selected_engines_state], outputs=[engine_options_html_comp])
select_all_engines_btn.click(lambda: _populate_engine_options(searchEngines), outputs=[engine_options_html_comp])
deselect_all_engines_btn.click(lambda: _populate_engine_options([]), outputs=[engine_options_html_comp])
save_engines_btn.click(
_save_engine_settings,
inputs=[gr.JSON(elem_id="engine-options", visible=False)], # Hidden JSON component intended to carry the checked engines from the modal
outputs=[selected_engines_state, current_engine_state] # gr.Info is raised inside the handler, not returned as an output
)
engine_settings_button.click(lambda: gr.update(visible=True), outputs=[engine_modal])
# Close modal on save or when clicking outside (implicit via Gradio's modal handling)
# Initial App Load Logic
@demo.load(outputs=([search_page, dataset_page, dataset_title_md, dataset_description_md, dataset_source_badge, dataset_source_info, dataset_share_textbox, full_dataset_section, save_dataset_button, open_dataset_message, search_bar] + # Outputs for detail page and search bar
buttons + button_groups + [generated_texts_state, is_real_data_state] + # Outputs for search result buttons, their groups, and state
[select_namespace_dropdown, visibility_radio, source_group, data_source_toggle, current_engine_state, selected_engines_state, engine_options_html_comp])) # Outputs for settings
def _load_app(request: gr.Request, oauth_token: Optional[gr.OAuthToken]):
# Handle user login and namespace selection
if oauth_token:
try:
user_info = whoami(oauth_token.token)
namespaces = [user_info["name"]] + [org["name"] for org in user_info.get("orgs", [])]
yield {
select_namespace_dropdown: gr.Dropdown(choices=namespaces, value=user_info["name"], visible=True),
visibility_radio: gr.Radio(interactive=True),
}
except Exception: # Fallback if user info fails
yield {
select_namespace_dropdown: gr.Dropdown(choices=[NAMESPACE], value=NAMESPACE, visible=True),
visibility_radio: gr.Radio(interactive=True),
}
else: # Default settings if not logged in
yield {
select_namespace_dropdown: gr.Dropdown(choices=[NAMESPACE], value=NAMESPACE, visible=True),
visibility_radio: gr.Radio(interactive=False),
}
# Handle URL parameters for direct search or dataset loading
query_params = dict(request.query_params)
if "dataset" in query_params:
is_real = query_params.get("engine") is not None
engine = query_params.get("engine")
yield from _show_dataset_details(query_params.get("q", query_params["dataset"]), query_params["dataset"], query_params.get("tags", ""), is_real, engine)
yield {is_real_data_state: is_real, current_engine_state: engine}
elif "q" in query_params:
search_query = query_params["q"]
is_real = query_params.get("engine") is not None
engine = query_params.get("engine")
yield {search_bar: search_query}
yield {is_real_data_state: is_real, current_engine_state: engine}
yield from _update_search_results(search_query, (), is_real, engine)
else:
yield {search_page: gr.Column(visible=True)} # Show search page by default
# Initialize with default datasets
initial_outputs = {}
for i, line in enumerate(default_output):
try: dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" (", 1)
except ValueError: dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" ", 1)[0], ""
initial_outputs[buttons[2 * i]] = gr.Button(dataset_name, elem_classes="topButton")
initial_outputs[buttons[2 * i + 1]] = gr.Button(tags, elem_classes="bottomButton")
initial_outputs[button_groups[i]] = gr.Group(elem_classes="buttonsGroup")
yield initial_outputs
yield {generated_texts_state: (landing_page_datasets_generated_text,)}
# Initialize engine settings UI
yield {
data_source_toggle: gr.Checkbox(value=is_real_data_state.value),
engine_options_html_comp: _populate_engine_options(selected_engines_state.value)
}
if __name__ == "__main__":
demo.launch(share=False, server_name="0.0.0.0")