|
|
|
|
|
""" |
|
|
UltraData Math Parser - Hugging Face Space Demo |
|
|
A unified HTML parser optimized for extracting mathematical content. |
|
|
""" |
|
|
|
|
|
import gradio as gr |
|
|
import requests |
|
|
from ultradata_math_parser import GeneralParser |
|
|
|
|
|
|
|
|
def fetch_url_content(url: str) -> tuple:
    """Fetch HTML content from a URL.

    Args:
        url: Address to download. Surrounding whitespace is stripped and
            ``https://`` is prepended when no HTTP(S) scheme is present.

    Returns:
        A 2-tuple. On success it is ``(html_text, final_url)``. On any
        failure it is ``("", message)``, where ``message`` describes the
        problem; callers tell the two apart by the empty first element.
    """
    if not url or not url.strip():
        return "", "Please enter a URL"

    url = url.strip()
    # URL schemes are case-insensitive, so "HTTP://host" must not be
    # rewritten to "https://HTTP://host".
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        html_text = response.text
    except requests.exceptions.Timeout:
        return "", f"Request timed out for {url}"
    except requests.exceptions.RequestException as e:
        return "", f"Failed to fetch URL: {e}"

    if not html_text:
        # An empty first element signals failure to callers, so the second
        # element has to be a message here rather than the URL.
        return "", f"Empty response received from {url}"
    return html_text, url
|
|
|
|
|
|
|
|
def fetch_and_parse(url: str, process_math: bool, include_tables: bool, enable_forum: bool, html_type: str) -> tuple:
    """Fetch URL content and parse it in one step.

    Args:
        url: Address to download; handed to ``fetch_url_content``.
        process_math: Forwarded to ``parse_html`` as ``process_math``.
        include_tables: Forwarded to ``parse_html`` as ``include_tables``.
        enable_forum: Forwarded to ``parse_html`` as
            ``enable_forum_assembly``.
        html_type: Forwarded to ``parse_html`` as ``html_type``.

    Returns:
        A 6-tuple ``(raw_html, base_url, title, extracted_html, text,
        markdown)``. When fetching fails, ``raw_html`` and ``base_url`` are
        empty strings and the error is reported through the title and
        markdown slots.
    """
    html_content, base_url = fetch_url_content(url)

    if not html_content:
        # On failure the second element from fetch_url_content is the error
        # message, not a URL. It must not be returned in the base-URL slot,
        # otherwise the Base URL field ends up holding error text.
        error_msg = base_url
        return "", "", f"β {error_msg}", "", "", f"**Error:** {error_msg}"

    result = parse_html(
        html_content=html_content,
        base_url=base_url,
        process_math=process_math,
        include_tables=include_tables,
        enable_forum_assembly=enable_forum,
        html_type=html_type,
    )

    title, extracted_html, text, markdown = format_output(result)
    return html_content, base_url, title, extracted_html, text, markdown
|
|
|
|
|
|
|
|
def parse_html(
    html_content: str,
    base_url: str = "",
    process_math: bool = True,
    include_tables: bool = True,
    enable_forum_assembly: bool = True,
    html_type: str = "unified",
) -> dict:
    """
    Run GeneralParser over an HTML string and normalise the outcome.

    Args:
        html_content: Raw HTML string to parse
        base_url: Base URL for resolving relative links
        process_math: Whether to process and convert math expressions
        include_tables: Whether to preserve table elements
        enable_forum_assembly: Whether to enable forum post assembly
        html_type: Parser type (unified/article/forum)

    Returns:
        Dictionary with the keys ``title``, ``html``, ``text``,
        ``text_length``, ``xp_num``, ``fallback_strategy``,
        ``forum_assembled`` and ``error``. ``error`` is ``None`` on success;
        otherwise it holds a message and the other fields are blank.
    """

    def _failure(message: str) -> dict:
        # Blank result carrying only an error message.
        return {
            "title": "",
            "html": "",
            "text": "",
            "text_length": 0,
            "xp_num": "",
            "fallback_strategy": "",
            "forum_assembled": False,
            "error": message,
        }

    if not (html_content and html_content.strip()):
        return _failure("Please provide HTML content to parse.")

    # Constructed outside the try block: a constructor failure propagates.
    extractor = GeneralParser()

    try:
        extracted = extractor.extract(
            html=html_content,
            base_url=base_url,
            process_math=process_math,
            include_tables=include_tables,
            enable_forum_assembly=enable_forum_assembly,
            html_type=html_type,
        )

        field_defaults = (
            ("title", ""),
            ("html", ""),
            ("text", ""),
            ("text_length", 0),
            ("xp_num", ""),
            ("fallback_strategy", ""),
            ("forum_assembled", False),
        )
        payload = {key: extracted.get(key, fallback) for key, fallback in field_defaults}
        payload["error"] = None
        return payload
    except Exception as exc:
        # Any extraction failure is reported in the result, not raised.
        return _failure(str(exc))
|
|
|
|
|
|
|
|
def format_output(result: dict) -> tuple:
    """Turn a parse result dict into the 4-tuple shown in the UI.

    Args:
        result: Dictionary as produced by ``parse_html``.

    Returns:
        ``(title, html, text, markdown)``. If ``result["error"]`` is truthy,
        the title and markdown slots carry the error and the other two are
        empty strings. Otherwise the markdown slot is the extracted text, or
        a placeholder when no text was extracted.
    """
    failure = result.get("error")
    if failure:
        return (f"β Error: {failure}", "", "", f"**Error:** {failure}")

    body = result.get("text", "")
    preview = body or "_No content extracted_"
    return (result.get("title", ""), result.get("html", ""), body, preview)
|
|
|
|
|
|
|
|
def process_input(html_content, base_url, process_math, include_tables, enable_forum, html_type):
    """Parse pasted HTML and return the formatted 4-tuple for the UI.

    The arguments are forwarded to ``parse_html`` (``enable_forum`` is passed
    as ``enable_forum_assembly``) and the resulting dict is passed through
    ``format_output``.
    """
    parse_options = dict(
        html_content=html_content,
        base_url=base_url,
        process_math=process_math,
        include_tables=include_tables,
        enable_forum_assembly=enable_forum,
        html_type=html_type,
    )
    return format_output(parse_html(**parse_options))
|
|
|
|
|
|
|
|
|
|
|
# Sample document used as the initial value of the "HTML Content" textbox.
# It mixes prose, a MathML formula and a footer/nav section.
# The math symbols (², ±, √) are stored as real Unicode characters; they had
# previously been mis-decoded into sequences such as "Β²" and "Β±".
EXAMPLE_HTML = """<!DOCTYPE html>
<html>
<head>
    <title>Quadratic Formula Example</title>
</head>
<body>
    <article class="post-content">
        <h1>Understanding the Quadratic Formula</h1>
        <p>The quadratic formula is used to solve equations of the form ax² + bx + c = 0.</p>
        <p>The solution is given by:</p>
        <math xmlns="http://www.w3.org/1998/Math/MathML">
            <mi>x</mi>
            <mo>=</mo>
            <mfrac>
                <mrow>
                    <mo>-</mo>
                    <mi>b</mi>
                    <mo>±</mo>
                    <msqrt>
                        <mrow>
                            <msup><mi>b</mi><mn>2</mn></msup>
                            <mo>-</mo>
                            <mn>4</mn>
                            <mi>a</mi>
                            <mi>c</mi>
                        </mrow>
                    </msqrt>
                </mrow>
                <mrow>
                    <mn>2</mn>
                    <mi>a</mi>
                </mrow>
            </mfrac>
        </math>
        <p>Where a, b, and c are coefficients of the quadratic equation.</p>
        <h2>Example Problem</h2>
        <p>Solve: x² - 5x + 6 = 0</p>
        <p>Here, a = 1, b = -5, c = 6</p>
        <p>Using the formula: x = (5 ± √(25-24))/2 = (5 ± 1)/2</p>
        <p>Therefore, x = 3 or x = 2</p>
    </article>
    <footer>
        <nav>Related articles...</nav>
    </footer>
</body>
</html>"""
|
|
|
|
|
|
|
|
|
|
|
# Stylesheet handed to gr.Blocks(css=...) for the demo page.
# The string holds three successive layers:
#   1. the base dark theme (everything before the "Tech Refresh" marker),
#   2. the "Tech Refresh" section,
#   3. the "Premium Clean Overrides" section.
# Layers 2 and 3 re-declare many of the selectors from layer 1, again with
# !important, so the order of the sections matters.
# NOTE(review): presumably the last layer is the one meant to be effective and
# the earlier duplicates are leftovers — confirm before consolidating.
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700&family=JetBrains+Mono:wght@400;500&display=swap');

:root {
    --primary-color: #6366f1;
    --text-light: #f8fafc;
    --text-gray: #cbd5e1;
    --panel-bg: rgba(15, 23, 42, 0.6);
}

body {
    background-color: #0f172a;
    color: var(--text-light);
}

.gradio-container {
    font-family: 'Inter', sans-serif !important;
    background: radial-gradient(circle at top left, #1e1b4b, #0f172a) !important;
    max-width: 95% !important; /* Increased width for better split view */
}

/* Force all text to be light by default to combat Gradio's light theme defaults */
.gradio-container * {
    color: var(--text-light);
}

/* ... (keep existing styles) ... */

/* Output Box Styling - Fixed Height */
.output-textbox, .markdown-box {
    height: 600px !important;
    max-height: 600px !important;
    overflow-y: auto !important;
    background-color: #1e293b !important;
    border: 1px solid #64748b !important;
    border-radius: 8px !important;
    padding: 1rem !important;
}

.output-textbox textarea {
    background-color: transparent !important;
    border: none !important;
    box-shadow: none !important;
    height: 100% !important;
    color: #ffffff !important;
}

.markdown-box {
    background-color: #f8fafc !important; /* Light background for markdown readability */
    color: #0f172a !important; /* Dark text for markdown */
}

.markdown-box * {
    color: #0f172a !important;
}
.main-title {
    font-family: 'Inter', sans-serif !important;
    font-weight: 800 !important;
    font-size: 3rem !important;
    background: linear-gradient(to right, #818cf8, #c084fc, #f472b6) !important;
    -webkit-background-clip: text !important;
    -webkit-text-fill-color: transparent !important;
    text-align: center !important;
    margin-bottom: 0.5rem !important;
    /* Reset color for gradient text */
    color: transparent !important;
}

.subtitle {
    text-align: center !important;
    color: var(--text-gray) !important;
    font-size: 1.1rem !important;
    margin-bottom: 3rem !important;
    font-weight: 300 !important;
}

/* Panels */
.glass-panel {
    background: var(--panel-bg) !important;
    backdrop-filter: blur(12px) !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
    border-radius: 16px !important;
    padding: 24px !important;
    box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1) !important;
}

/* Labels - High Contrast, Clean Black */
.block > label > span,
.form > label > span,
.gr-form > label > span,
.label-wrap > span {
    color: #000000 !important; /* Pure Black */
    font-weight: 700 !important;
    font-size: 1rem !important;
    margin-bottom: 0.5rem !important;
    text-shadow: 0 0 2px #ffffff, 0 0 4px #ffffff !important; /* White glow/outline for visibility */
    background-color: transparent !important;
    padding: 0 !important;
}

/* Info Text (Description) */
span.description, .description {
    color: #000000 !important;
    font-weight: 600 !important;
    text-shadow: 0 0 2px #ffffff !important;
    opacity: 1 !important;
}

/* Fix Radio/Checkbox alignment & styling */
fieldset label span {
    margin-bottom: 0 !important;
    text-shadow: none !important;
    font-weight: 600 !important;
    color: #0f172a !important; /* Dark text for unselected options (white background) */
    display: flex !important;
    align-items: center !important;
}

/* Selected radio button text should be white */
fieldset label.selected span {
    color: #ffffff !important;
}

/* Radio group title (e.g., Difficulty Level) */
fieldset legend, fieldset legend span,
.gr-radio > label, .gr-radio > label span,
.gradio-container .label-wrap, .gradio-container .label-wrap span {
    color: #000000 !important;
    font-weight: 700 !important;
    text-shadow: 0 0 2px #ffffff, 0 0 4px #ffffff !important;
}

/* Inputs & Textareas - Dark Grey Background for Contrast */
.gr-input, textarea, input, .gr-box, .gr-check-radio, .gr-dropdown {
    font-family: 'JetBrains Mono', monospace !important;
    background-color: #1e293b !important; /* Slate 800 - Lighter than bg, darker than text */
    border: 1px solid #64748b !important; /* Visible Slate Border */
    color: #ffffff !important;
    box-shadow: none !important;
}

/* Focus state */
.gr-input:focus, textarea:focus, input:focus {
    border-color: #ffffff !important; /* White border on focus */
    background-color: #334155 !important; /* Slightly lighter on focus */
}

/* Override default block backgrounds */
.gradio-container .block, .gradio-container .panel {
    background-color: transparent !important;
    border: none !important;
}

/* Fix for dropdown options background */
ul.options, .gr-dropdown-options {
    background-color: #1e293b !important;
    color: #ffffff !important;
    border: 1px solid #64748b !important;
}

/* Markdown prose */
.prose, .prose p, .prose h1, .prose h2, .prose h3, .prose strong, .prose li {
    color: var(--text-light) !important;
}

/* Buttons */
.gr-button-primary {
    background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%) !important;
    border: none !important;
    color: white !important;
    font-weight: 600 !important;
    border-radius: 8px !important;
    transition: transform 0.2s, box-shadow 0.2s !important;
}

.gr-button-primary:hover {
    transform: translateY(-1px) !important;
    box-shadow: 0 10px 15px -3px rgba(99, 102, 241, 0.3) !important;
}

.gr-button-secondary {
    background: rgba(30, 41, 59, 0.8) !important;
    border: 1px solid rgba(148, 163, 184, 0.3) !important;
    color: var(--text-gray) !important;
    border-radius: 8px !important;
}

.gr-button-secondary:hover {
    background: rgba(51, 65, 85, 0.8) !important;
    color: var(--text-light) !important;
}

/* Tabs */
.tabs {
    border: none !important;
    background: transparent !important;
    margin-bottom: 2rem !important;
}

.tab-nav {
    border-bottom: 1px solid rgba(148, 163, 184, 0.2) !important;
    justify-content: center !important;
}

.tab-nav button {
    font-weight: 600 !important;
    font-size: 1rem !important;
    color: var(--text-gray) !important;
    transition: all 0.3s ease !important;
}

.tab-nav button.selected {
    color: #818cf8 !important;
    border-bottom: 2px solid #818cf8 !important;
}

/* Section Header */
.section-header {
    color: #818cf8 !important;
    font-weight: 700 !important;
    font-size: 1.5rem !important;
    margin-bottom: 1.5rem !important;
    padding-bottom: 0.5rem !important;
    border-bottom: 2px solid rgba(129, 140, 248, 0.3) !important;
    text-shadow: 0 2px 4px rgba(0,0,0,0.3);
}

/* Footer */
.footer-text {
    text-align: center;
    margin-top: 4rem;
    padding: 2rem;
    color: var(--text-gray);
    font-size: 0.9rem;
    border-top: 1px solid rgba(148, 163, 184, 0.1);
}
.footer-text a {
    color: #818cf8 !important;
}

/* Accordion */
.gr-accordion {
    background-color: rgba(30, 41, 59, 0.4) !important;
    border: 1px solid rgba(148, 163, 184, 0.2) !important;
}

/* ===== Tech Refresh (Cleaner + Single Box Output) ===== */
:root {
    --bg: #0b1020;
    --surface: #0f172a;
    --surface-2: #111827;
    --border: #1f2937;
    --text: #e5e7eb;
    --muted: #94a3b8;
    --accent: #38bdf8;
    --accent-2: #8b5cf6;
}

body {
    background-color: var(--bg) !important;
    color: var(--text) !important;
}

.gradio-container {
    background:
        radial-gradient(circle at 15% 10%, rgba(56, 189, 248, 0.08), transparent 40%),
        radial-gradient(circle at 85% 20%, rgba(139, 92, 246, 0.08), transparent 45%),
        linear-gradient(180deg, #0b1020 0%, #0b1020 100%) !important;
    background-size: auto, auto, auto !important;
}

.main-title {
    background: linear-gradient(90deg, #e5e7eb, #c7d2fe, #38bdf8) !important;
    -webkit-background-clip: text !important;
    -webkit-text-fill-color: transparent !important;
}

.subtitle {
    color: var(--muted) !important;
}

.glass-panel {
    background: linear-gradient(180deg, rgba(15, 23, 42, 0.95), rgba(17, 24, 39, 0.95)) !important;
    border: 1px solid rgba(56, 189, 248, 0.15) !important;
    box-shadow:
        0 0 0 1px rgba(139, 92, 246, 0.12),
        0 20px 40px rgba(2, 6, 23, 0.55) !important;
}

.section-header {
    color: var(--text) !important;
    border-bottom: 1px solid rgba(56, 189, 248, 0.2) !important;
    text-shadow: none !important;
}

/* Labels & descriptions */
.block > label > span,
.form > label > span,
.gr-form > label > span,
.label-wrap > span {
    color: var(--text) !important;
    text-shadow: none !important;
}

span.description, .description {
    color: var(--muted) !important;
    text-shadow: none !important;
}

/* Inputs */
.gr-input, textarea, input, .gr-box, .gr-check-radio, .gr-dropdown {
    background-color: var(--surface-2) !important;
    border: 1px solid var(--border) !important;
    color: var(--text) !important;
}

.gr-input:focus, textarea:focus, input:focus {
    border-color: var(--accent) !important;
    box-shadow: 0 0 0 2px rgba(56, 189, 248, 0.15) !important;
}

/* Buttons */
.gr-button-primary {
    background: linear-gradient(135deg, #2563eb 0%, #7c3aed 100%) !important;
    box-shadow: 0 8px 20px rgba(37, 99, 235, 0.2) !important;
}

.gr-button-primary:hover {
    background: linear-gradient(135deg, #1d4ed8 0%, #6d28d9 100%) !important;
}

.gr-button-secondary {
    background: transparent !important;
    border: 1px solid rgba(148, 163, 184, 0.35) !important;
    color: var(--text) !important;
}

/* Tabs */
.tab-nav button {
    color: var(--muted) !important;
}

.tab-nav button.selected {
    color: var(--text) !important;
    border-bottom: 2px solid var(--accent) !important;
}

/* Output: single box + auto height */
.output-textbox {
    background-color: var(--surface-2) !important;
    border: 1px solid var(--border) !important;
    border-radius: 12px !important;
    padding: 12px !important;
    min-height: 220px !important;
    max-height: 560px !important;
    height: auto !important;
    overflow-y: auto !important;
}

.output-textbox textarea {
    background-color: transparent !important;
    border: none !important;
    box-shadow: none !important;
    color: var(--text) !important;
    min-height: 200px !important;
    max-height: 520px !important;
    height: auto !important;
    overflow-y: auto !important;
}

.markdown-box {
    background: transparent !important;
    border: none !important;
    padding: 0 !important;
}

.markdown-box .prose {
    background-color: var(--surface-2) !important;
    border: 1px solid var(--border) !important;
    border-radius: 12px !important;
    padding: 16px !important;
    min-height: 220px !important;
    max-height: 560px !important;
    overflow-y: auto !important;
}

.markdown-box, .markdown-box * {
    color: var(--text) !important;
}

.markdown-box code, .markdown-box pre {
    background: #1f2937 !important;
}

/* === Premium Clean Overrides === */
:root {
    --bg: #0b1120;
    --surface: #111827;
    --surface-2: #0f172a;
    --border: #1f2937;
    --text: #e5e7eb;
    --muted: #94a3b8;
    --accent: #6366f1;
}

body {
    background-color: var(--bg) !important;
    color: var(--text) !important;
}

.gradio-container {
    background: var(--bg) !important;
    width: 95vw !important;
    max-width: 1400px !important;
    margin: 0 auto !important;
}

.main-title {
    background: none !important;
    -webkit-text-fill-color: unset !important;
    color: var(--text) !important;
    font-size: 2.4rem !important;
    letter-spacing: -0.01em !important;
}

.subtitle {
    color: var(--muted) !important;
    margin-bottom: 2rem !important;
}

.glass-panel {
    background: linear-gradient(180deg, rgba(17, 24, 39, 0.95), rgba(15, 23, 42, 0.95)) !important;
    border: 1px solid var(--border) !important;
    box-shadow: 0 12px 32px rgba(0, 0, 0, 0.35) !important;
}

.section-header {
    color: var(--text) !important;
    border-bottom: 1px solid var(--border) !important;
    text-shadow: none !important;
}

/* Labels & descriptions */
.block > label > span,
.form > label > span,
.gr-form > label > span,
.label-wrap > span,
span.description,
.description {
    color: var(--muted) !important;
    text-shadow: none !important;
}

/* Inputs */
.gr-input, textarea, input, .gr-box, .gr-check-radio, .gr-dropdown {
    background-color: var(--surface-2) !important;
    border: 1px solid var(--border) !important;
    color: var(--text) !important;
}

.gr-input:focus, textarea:focus, input:focus {
    border-color: var(--accent) !important;
    background-color: #111827 !important;
}

/* Buttons */
.gr-button-primary {
    background: var(--accent) !important;
    box-shadow: none !important;
    color: #ffffff !important;
}

.gr-button-primary:hover {
    background: #4f46e5 !important;
}

.gr-button-secondary {
    background: transparent !important;
    border: 1px solid var(--border) !important;
    color: var(--text) !important;
}

.gr-button-secondary:hover {
    background: rgba(31, 41, 55, 0.6) !important;
}

/* Tabs */
.tab-nav button {
    color: var(--muted) !important;
}

.tab-nav button.selected {
    color: var(--text) !important;
    border-bottom: 2px solid var(--accent) !important;
}

/* Output */
.output-textbox, .markdown-box {
    background-color: var(--surface-2) !important;
    border: 1px solid var(--border) !important;
    height: 560px !important;
    max-height: 560px !important;
}

.markdown-box, .markdown-box * {
    color: var(--text) !important;
}

.markdown-box code, .markdown-box pre {
    background: #1f2937 !important;
}
"""
|
|
|
|
|
|
|
|
# Build the Gradio page: an input panel on the left, an output panel on the
# right, then the event wiring and a footer.
# NOTE(review): several label strings below start with what looks like
# mis-decoded emoji bytes (e.g. "π", "β¨"); confirm the file is stored and
# read as UTF-8.
with gr.Blocks(title="UltraData Math Parser", css=custom_css, theme=gr.themes.Soft()) as demo:
    gr.HTML('<h1 class="main-title">π UltraData Math Parser</h1>')
    gr.HTML('<p class="subtitle">Unified HTML Parser for Mathematical Content Extraction</p>')

    with gr.Row():
        # ---- Input panel ----
        with gr.Column(scale=1, elem_classes=["glass-panel"]):
            gr.HTML('<div class="section-header">π₯ Input</div>')

            with gr.Tabs():
                with gr.TabItem("π URL"):
                    url_input = gr.Textbox(
                        label="URL",
                        placeholder="Enter URL to fetch (e.g., https://example.com/math-article)",
                        lines=3,
                        max_lines=5,
                        value="https://math.stackexchange.com/questions/5120625/ode-problem-of-yt-sqrtyt-with-the-inital-value-y0-1-t-geq-0",
                    )
                    fetch_btn = gr.Button("π₯ Fetch & Parse", variant="primary", size="lg")

                # NOTE(review): this tab has no content of its own; the HTML
                # textbox is created after the tab body — confirm the
                # intended nesting.
                with gr.TabItem("π HTML"):
                    pass

            # Pre-filled with the sample document so "Parse HTML" works
            # without fetching anything first.
            html_input = gr.Textbox(
                label="HTML Content",
                placeholder="Paste your HTML content here or fetch from URL above...",
                lines=10,
                max_lines=20,
                value=EXAMPLE_HTML,
            )

            base_url_input = gr.Textbox(
                label="Base URL (Auto-filled from URL fetch)",
                placeholder="https://example.com/page",
                lines=1,
            )

            # Parser switches; their values are passed to fetch_and_parse
            # and process_input by the click handlers below.
            with gr.Accordion("βοΈ Advanced Options", open=False):
                html_type = gr.Radio(
                    choices=["unified", "article", "forum"],
                    value="unified",
                    label="Parser Type",
                    info="Select the parsing strategy",
                )
                process_math = gr.Checkbox(
                    label="Process Math Expressions",
                    value=True,
                    info="Convert MathML and LaTeX to unified format",
                )
                include_tables = gr.Checkbox(
                    label="Include Tables",
                    value=True,
                    info="Preserve table elements in output",
                )
                enable_forum = gr.Checkbox(
                    label="Enable Forum Assembly",
                    value=True,
                    info="Assemble forum posts and comments",
                )

            with gr.Row():
                parse_btn = gr.Button("π Parse HTML", variant="primary", size="lg")
                clear_btn = gr.Button("ποΈ Clear", variant="secondary", size="lg")

        # ---- Output panel ----
        with gr.Column(scale=1, elem_classes=["glass-panel"]):
            gr.HTML('<div class="section-header">π€ Output</div>')

            title_output = gr.Textbox(
                label="Extracted Title",
                lines=1,
                interactive=False,
            )

            with gr.Tabs():
                with gr.TabItem("β¨ Markdown"):
                    # The delimiter list covers $$...$$, $...$, \[...\]
                    # and \(...\) for math rendering.
                    markdown_output = gr.Markdown(
                        value="### Output will appear here...",
                        label="Markdown Preview",
                        elem_classes=["markdown-box"],
                        latex_delimiters=[
                            {"left": "$$", "right": "$$", "display": True},
                            {"left": "$", "right": "$", "display": False},
                            {"left": "\\[", "right": "\\]", "display": True},
                            {"left": "\\(", "right": "\\)", "display": False},
                        ],
                    )
                with gr.TabItem("π Plain Text"):
                    text_output = gr.Textbox(
                        label="Plain Text (w3m rendered)",
                        lines=25,
                        max_lines=30,
                        interactive=False,
                        autoscroll=False,
                        elem_classes=["output-textbox"],
                    )
                with gr.TabItem("π Raw HTML"):
                    html_output = gr.Textbox(
                        label="Extracted HTML",
                        lines=25,
                        max_lines=30,
                        interactive=False,
                        autoscroll=False,
                        elem_classes=["output-textbox"],
                    )

    # fetch_and_parse returns six values; they are listed here in the same
    # order as the tuple it returns (raw HTML, base URL, title, extracted
    # HTML, text, markdown).
    fetch_btn.click(
        fn=fetch_and_parse,
        inputs=[url_input, process_math, include_tables, enable_forum, html_type],
        outputs=[html_input, base_url_input, title_output, html_output, text_output, markdown_output],
    )

    # process_input returns the four values produced by format_output.
    parse_btn.click(
        fn=process_input,
        inputs=[html_input, base_url_input, process_math, include_tables, enable_forum, html_type],
        outputs=[title_output, html_output, text_output, markdown_output],
    )

    def clear_all():
        # One empty string per component listed in clear_btn's outputs.
        return "", "", "", "", "", "", ""

    clear_btn.click(
        fn=clear_all,
        outputs=[url_input, html_input, base_url_input, title_output, html_output, text_output, markdown_output],
    )

    gr.HTML("""
    <div class="footer-text">
        <p>π¬ <strong>UltraData Math Parser</strong> - Part of the UltraData-Math Project</p>
        <p>Specialized in extracting mathematical content from web pages with MathML, LaTeX, and formula support.</p>
    </div>
    """)


# Start the app only when the file is run as a script.
if __name__ == "__main__":
    demo.launch(ssr_mode=False)
|
|
|