Spaces:
Sleeping
Sleeping
import gradio as gr ### | |
import pandas as pd | |
import aiohttp | |
import asyncio | |
import json | |
import os | |
import numpy as np | |
import plotly.express as px | |
import plotly.graph_objects as go | |
from plotly.subplots import make_subplots | |
from typing import Optional, Tuple, Dict, Any, List | |
import logging | |
from datetime import datetime, timedelta | |
import re | |
from jinja2 import Template | |
import markdown | |
import zipfile | |
import io | |
import base64 | |
from scipy import stats | |
import seaborn as sns | |
import warnings | |
# Suppress every warning category for the whole process; library deprecation
# notices would otherwise end up in the app's output.
warnings.filterwarnings(action='ignore')

# Root logger: INFO and above, each record stamped with time, logger name
# and severity.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class AdvancedDataAnalyzer: | |
def __init__(self):
    """Initialise endpoint configuration, upload limits and empty session state."""
    # --- static configuration -------------------------------------------
    self.api_base_url = "https://llm.chutes.ai/v1/chat/completions"
    self.max_file_size = 100 * 1024 * 1024  # upload ceiling in bytes (100 MB)

    # --- per-session state, filled in as files are processed -------------
    self.conversation_history = []
    self.current_df = self.current_charts = None
    self.analysis_cache = {}

    # Extensions (lower-case, with leading dot) accepted by validate_file.
    self.supported_formats = ['.csv', '.xlsx', '.xls', '.json', '.parquet', '.tsv']
def validate_api_key(self, api_key: str) -> Tuple[bool, str]:
    """Check that an API key looks plausible, without contacting the API.

    A key passes when, after trimming whitespace, it has at least 10
    characters and either starts with ``sk-``, ``pk-`` or ``Bearer `` or is
    longer than 20 characters.

    Returns:
        ``(is_valid, message)``.
    """
    token = api_key.strip() if api_key else ""
    if len(token) < 10:
        return False, "API key must be at least 10 characters long"

    # Short keys must carry a recognised prefix; long ones are accepted as-is.
    has_known_prefix = token.startswith(('sk-', 'pk-', 'Bearer '))
    if has_known_prefix or len(token) > 20:
        return True, "Valid API key format"
    return False, "API key format appears invalid"
def validate_file(self, file) -> Tuple[bool, str]:
    """Validate an uploaded file's size and extension before it is parsed.

    Args:
        file: Either an object exposing the on-disk path as ``.name`` or the
            path itself (``str`` / ``os.PathLike``).
            NOTE(review): newer Gradio ``File`` components can hand over a
            plain path string — confirm against the caller.

    Returns:
        ``(is_valid, message)``. Filesystem problems (missing file,
        permission errors, unusable path type) are reported through the
        message rather than raised.
    """
    if not file:
        return False, "No file uploaded"

    # Fall back to treating the argument itself as the path when it has no
    # ``.name`` attribute.
    path = getattr(file, "name", file)
    try:
        file_size = os.path.getsize(path)
        if file_size > self.max_file_size:
            return False, f"File too large. Maximum size: {self.max_file_size // (1024*1024)}MB"
        if file_size == 0:
            return False, "File is empty"

        file_extension = os.path.splitext(path)[1].lower()
        if file_extension not in self.supported_formats:
            return False, f"Unsupported format. Supported: {', '.join(self.supported_formats)}"
        return True, "File validation passed"
    except (OSError, TypeError, ValueError) as e:
        return False, f"File validation error: {str(e)}"
async def analyze_with_chutes(self, api_token: str, data_summary: str, user_question: Optional[str] = None, analysis_type: str = "comprehensive") -> str:
    """Ask the Chutes chat-completions endpoint to analyse a dataset summary.

    The request is sent with ``"stream": True``; the ``data: ...`` lines of
    the response are parsed as JSON chunks and their ``delta.content``
    pieces concatenated. Errors are never raised to the caller: each failure
    path returns a markdown-formatted message string instead.

    Args:
        api_token: API token. A leading ``"Bearer "`` (which
            ``validate_api_key`` accepts) is removed so the header is not
            sent as ``Bearer Bearer ...``.
        data_summary: Text summary of the dataset, embedded in the prompt.
        user_question: Optional question; it is the focus of the
            ``"question"`` prompt and appended to the other prompts.
        analysis_type: ``"comprehensive"``, ``"quick"`` or ``"question"``;
            any other value falls back to ``"comprehensive"``.

    Returns:
        The model's full reply, or an error message string.
    """
    token = api_token.strip()
    if token.startswith("Bearer "):
        # The scheme is added below; keep only the credential itself.
        token = token[len("Bearer "):].strip()

    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
        "User-Agent": "SmartDataAnalyzer/2.0"
    }

    # Create specialized prompts based on analysis type
    prompts = {
        "comprehensive": f"""You are a senior data scientist with 10+ years of experience. Analyze this dataset comprehensively:
{data_summary}
Provide a thorough analysis with:
1. **Executive Summary**: 3-4 key takeaways for stakeholders
2. **Statistical Insights**: Important numbers, distributions, and what they reveal
3. **Pattern Recognition**: Trends, correlations, seasonality, anomalies
4. **Data Quality Assessment**: Completeness, accuracy, consistency issues
5. **Business Intelligence**: Actionable insights and opportunities
6. **Risk Analysis**: Potential data quality issues or business risks
7. **Recommendations**: Specific, prioritized next steps
Use bullet points, specific numbers, and clear explanations.""",
        "quick": f"""Provide a quick but insightful analysis of this dataset:
{data_summary}
Focus on:
- Top 3 most important findings
- Any obvious patterns or anomalies
- Key business insights
- Quick recommendations
Keep it concise but valuable.""",
        "question": f"""Based on this dataset:
{data_summary}
User's specific question: {user_question}
Provide a detailed, data-driven answer with:
- Direct answer to the question
- Supporting evidence from the data
- Additional related insights
- Specific recommendations
- Follow-up questions to consider"""
    }
    prompt = prompts.get(analysis_type, prompts["comprehensive"])
    if user_question and analysis_type != "question":
        prompt += f"\n\nUser's additional question: {user_question}"

    body = {
        "model": "openai/gpt-oss-20b",
        "messages": [
            {
                "role": "system",
                "content": """You are an expert data scientist and business analyst. Provide clear, actionable insights with specific data points. Use markdown formatting for better readability. Always include:
- Specific numbers and percentages
- Clear section headers
- Bullet points for key insights
- Bold text for important findings
- Recommendations with priority levels"""
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        "stream": True,
        "max_tokens": 4000,
        "temperature": 0.3,
        "top_p": 0.9
    }

    try:
        timeout = aiohttp.ClientTimeout(total=45)  # whole-request budget, in seconds
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.post(self.api_base_url, headers=headers, json=body) as response:
                # Map the well-known failure statuses to friendly messages.
                if response.status == 401:
                    return "β **Authentication Error**: Invalid API key. Please verify your Chutes API token."
                elif response.status == 429:
                    return "β³ **Rate Limit Exceeded**: Too many requests. Please wait 30 seconds and try again."
                elif response.status == 503:
                    return "π§ **Service Unavailable**: API temporarily unavailable. Please try again later."
                elif response.status != 200:
                    error_text = await response.text()
                    return f"β **API Error {response.status}**: {error_text[:200]}"

                pieces = []
                async for line in response.content:
                    line = line.decode("utf-8").strip()
                    if not line.startswith("data: "):
                        continue
                    data = line[6:]
                    if data == "[DONE]":
                        break
                    try:
                        chunk_data = json.loads(data)
                    except json.JSONDecodeError:
                        # Skip malformed chunks instead of aborting the stream.
                        continue
                    choices = chunk_data.get("choices") if isinstance(chunk_data, dict) else None
                    if not choices:
                        continue
                    # ``delta`` may be missing or null; treat both as "no content".
                    delta = choices[0].get("delta") or {}
                    content = delta.get("content", "")
                    if content:
                        pieces.append(content)
                full_response = "".join(pieces)

                if not full_response:
                    return "β οΈ **Empty Response**: No analysis received. Please try again."

                # Store a truncated copy in the conversation history.
                self.conversation_history.append({
                    "timestamp": datetime.now(),
                    "question": user_question or "General Analysis",
                    "response": full_response[:500] + "..." if len(full_response) > 500 else full_response
                })
                return full_response
    except asyncio.TimeoutError:
        return "β° **Timeout Error**: Analysis took too long. Try with a smaller file or simpler question."
    except aiohttp.ClientError as e:
        logger.error("HTTP Error: %s", e)
        return "π **Connection Error**: Unable to reach API. Check your internet connection."
    except Exception as e:
        # Last-resort boundary: surface the problem as text rather than crash the UI.
        logger.error("Unexpected API Error: %s", e)
        return f"β **Unexpected Error**: {str(e)}"
def process_file(self, file_path: str, sample_size: Optional[int] = None) -> Tuple[pd.DataFrame, str, str]:
    """Load a data file, clean it, and build its summary and charts.

    Supported extensions: ``.csv``, ``.tsv``, ``.xlsx``, ``.xls``, ``.json``
    and ``.parquet``. CSV files are tried with several encodings and
    separators; the first parse yielding more than one column wins, and a
    single-column parse is kept as a fallback so one-column CSVs load too.

    Args:
        file_path: Path of the file to load.
        sample_size: When set and the frame has more rows, a reproducible
            random sample (``random_state=42``) of this many rows is used.

    Returns:
        ``(dataframe, data_summary, charts_html)``. The frame is also stored
        on ``self.current_df``.

    Raises:
        Exception: wraps any failure with an ``"Error processing file: "``
            prefix; the original error is attached as ``__cause__``.
    """
    try:
        file_extension = os.path.splitext(file_path)[1].lower()

        if file_extension == '.csv':
            df = None
            single_column_fallback = None
            for encoding in ('utf-8', 'latin-1', 'cp1252', 'iso-8859-1'):
                for sep in (',', ';', '\t', '|'):
                    try:
                        candidate = pd.read_csv(file_path, encoding=encoding, sep=sep, low_memory=False)
                    except (UnicodeDecodeError, pd.errors.ParserError):
                        continue
                    if candidate.shape[1] > 1:  # Valid separator found
                        df = candidate
                        break
                    if single_column_fallback is None:
                        # The file parses, but this separator does not split it;
                        # remember it in case no separator ever does.
                        single_column_fallback = candidate
                if df is not None:
                    break
            if df is None:
                df = single_column_fallback
            if df is None:
                raise ValueError("Could not decode CSV file with any supported encoding/separator")
        elif file_extension == '.tsv':
            df = pd.read_csv(file_path, sep='\t', encoding='utf-8')
        elif file_extension in ['.xlsx', '.xls']:
            df = pd.read_excel(file_path, engine='openpyxl' if file_extension == '.xlsx' else 'xlrd')
        elif file_extension == '.json':
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            df = pd.json_normalize(data) if isinstance(data, list) else pd.DataFrame(data)
        elif file_extension == '.parquet':
            df = pd.read_parquet(file_path)
        else:
            # Without this branch ``df`` would be unbound and the caller would
            # see an unrelated UnboundLocalError message.
            raise ValueError(f"Unsupported file format: {file_extension}")

        # Normalise column labels: strings, trimmed, inner whitespace collapsed.
        df.columns = df.columns.astype(str).str.strip().str.replace(r'\s+', ' ', regex=True)

        # Remove completely empty rows and columns
        df = df.dropna(how='all').dropna(axis=1, how='all')

        # Sample large datasets for performance
        original_size = len(df)
        if sample_size and len(df) > sample_size:
            df = df.sample(n=sample_size, random_state=42)
            logger.info("Sampled %s rows from %s total rows", sample_size, original_size)

        # Auto-detect and convert data types
        df = self.auto_detect_types(df)
        self.current_df = df

        data_summary = self.generate_comprehensive_summary(df, original_size)
        charts_html = self.generate_advanced_visualizations(df)
        return df, data_summary, charts_html
    except Exception as e:
        logger.error("File processing error: %s", e)
        raise Exception(f"Error processing file: {str(e)}") from e
def auto_detect_types(self, df: pd.DataFrame) -> pd.DataFrame:
    """Convert ``object`` columns to datetime, numeric or category where they fit.

    Each object-dtype column is tried in this order:

    1. datetime — only for columns whose name contains a date-like keyword,
       and only when every non-missing value parses;
    2. numeric — after removing currency symbols, thousands separators,
       percent signs and whitespace, when more than 70% of the rows parse;
    3. category — when fewer than 50 distinct values cover less than 10% of
       the row count.

    Letters are NOT stripped before the numeric check, so free text that
    merely contains digits ("12 Main St", "A-100") stays text.

    Args:
        df: Frame to convert. It is modified in place.

    Returns:
        The same frame object, for chaining.
    """
    row_count = len(df)
    if row_count == 0:
        # Nothing to infer from, and the ratios below would divide by zero.
        return df

    datetime_keywords = ('date', 'time', 'created', 'updated', 'timestamp')
    # $ , € £ ¥ ₹ % and whitespace; escapes keep the pattern encoding-proof.
    numeric_noise = "[$,\u20ac\u00a3\u00a5\u20b9%\\s]"

    for col in df.columns:
        if df[col].dtype != 'object':
            continue

        # 1. datetime, for columns whose name hints at it
        if any(keyword in str(col).lower() for keyword in datetime_keywords):
            try:
                parsed = pd.to_datetime(df[col], errors='coerce')
            except (ValueError, TypeError, OverflowError):
                parsed = None
            non_null = df[col].notna().sum()
            if parsed is not None and non_null > 0 and parsed.notna().sum() == non_null:
                df[col] = parsed
                continue
            # Not a clean datetime column: fall through to the other checks.

        # 2. numeric
        try:
            cleaned_col = df[col].astype(str).str.replace(numeric_noise, '', regex=True)
            numeric_col = pd.to_numeric(cleaned_col, errors='coerce')
        except (ValueError, TypeError):
            numeric_col = None
        # If more than 70% of values can be converted to numeric, convert
        if numeric_col is not None and numeric_col.notna().sum() / row_count > 0.7:
            df[col] = numeric_col
            continue

        # 3. category, for low-cardinality text
        try:
            distinct = df[col].nunique()
        except TypeError:
            # Unhashable cells (e.g. lists/dicts from JSON) cannot be categorised.
            continue
        if distinct / row_count < 0.1 and distinct < 50:
            df[col] = df[col].astype('category')

    return df
def generate_comprehensive_summary(self, df: pd.DataFrame, original_size: Optional[int] = None) -> str:
    """Build a markdown statistical profile of ``df``.

    Sections, in order: header/metadata, column dtype distribution, missing
    data, numeric columns (first 8), categorical columns (first 8), datetime
    columns (first 3), duplicates and strong correlations (|r| > 0.7, top 5),
    and the first three rows.

    Empty frames and columns without any non-missing value are reported
    instead of raising division/index errors.

    Args:
        df: Frame to describe.
        original_size: Row count before sampling; when it differs from
            ``len(df)`` an "Original Size" line is added.

    Returns:
        The report as a single newline-joined markdown string.
    """
    def _pct(part, whole):
        # Percentage of ``part`` in ``whole``; 0.0 when there is nothing to divide by.
        return part / whole * 100 if whole else 0.0

    row_count = len(df)
    total_cells = df.shape[0] * df.shape[1]
    missing_data = df.isnull().sum()
    total_missing = missing_data.sum()

    summary = []

    # Header with enhanced metadata
    summary.append("# π Advanced Dataset Analysis Report")
    summary.append(f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    summary.append(f"**Dataset Size**: {df.shape[0]:,} rows Γ {df.shape[1]} columns")
    if original_size and original_size != row_count:
        summary.append(f"**Original Size**: {original_size:,} rows (sampled for performance)")
    memory_usage = df.memory_usage(deep=True).sum() / 1024**2
    summary.append(f"**Memory Usage**: {memory_usage:.2f} MB")
    density = 1 - total_missing / total_cells if total_cells else 1.0
    summary.append(f"**Data Density**: {density:.1%} complete\n")

    # Column dtype distribution
    type_counts = df.dtypes.value_counts()
    summary.append("## π Column Type Distribution:")
    for dtype, count in type_counts.items():
        percentage = (count / len(df.columns) * 100)
        summary.append(f"- **{dtype}**: {count} columns ({percentage:.1f}%)")

    # Missing data, worst columns first
    missing_pct = (missing_data / row_count * 100).round(2)
    missing_summary = missing_data[missing_data > 0].sort_values(ascending=False)
    if len(missing_summary) > 0:
        summary.append("\n## β οΈ Data Quality Issues:")
        summary.append(f"**Total Missing Values**: {total_missing:,} ({_pct(total_missing, total_cells):.2f}% of all data)")
        for col, count in missing_summary.head(10).items():
            pct = missing_pct[col]
            severity = "π΄ Critical" if pct > 50 else "π‘ Moderate" if pct > 20 else "π’ Minor"
            summary.append(f"- **{col}**: {count:,} missing ({pct}%) - {severity}")
    else:
        summary.append("\n## β Data Quality: Perfect! No missing values detected")

    # Numerical analysis
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        summary.append(f"\n## π Numerical Analysis ({len(numeric_cols)} columns):")
        for col in numeric_cols[:8]:  # Analyze top 8 numeric columns
            stats_data = df[col].describe()
            values = df[col].dropna()
            skewness = stats.skew(values) if len(values) else float('nan')

            # Outlier detection using IQR method
            Q1 = stats_data['25%']
            Q3 = stats_data['75%']
            IQR = Q3 - Q1
            outlier_mask = (df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))
            outliers = int(outlier_mask.sum())

            # Distribution shape: |skew| < 0.5 is treated as symmetric.
            if np.isnan(skewness):
                distribution = "Undetermined"
            elif abs(skewness) < 0.5:
                distribution = "Normal"
            elif skewness > 0:
                distribution = "Right-skewed"
            else:
                distribution = "Left-skewed"

            summary.append(f"- **{col}**:")
            summary.append(f" - Range: {stats_data['min']:.2f} to {stats_data['max']:.2f}")
            summary.append(f" - Central: ΞΌ={stats_data['mean']:.2f}, median={stats_data['50%']:.2f}")
            summary.append(f" - Spread: Ο={stats_data['std']:.2f}, IQR={IQR:.2f}")
            summary.append(f" - Shape: {distribution} (skew={skewness:.2f})")
            summary.append(f" - Outliers: {outliers} ({_pct(outliers, row_count):.1f}%)")

    # Categorical analysis
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_cols) > 0:
        summary.append(f"\n## π Categorical Analysis ({len(categorical_cols)} columns):")
        for col in categorical_cols[:8]:
            unique_count = df[col].nunique()
            total_count = len(df[col].dropna())

            # Cardinality classification (share of distinct values among non-missing ones)
            cardinality_ratio = unique_count / total_count if total_count else 0.0
            if cardinality_ratio > 0.9:
                cardinality = "π΄ Very High (likely ID field)"
            elif cardinality_ratio > 0.5:
                cardinality = "π‘ High"
            elif cardinality_ratio > 0.1:
                cardinality = "π’ Medium"
            else:
                cardinality = "π΅ Low"

            value_counts = df[col].value_counts()
            summary.append(f"- **{col}**:")
            summary.append(f" - Unique values: {unique_count:,} ({cardinality})")
            if len(value_counts) > 0:
                most_common = value_counts.iloc[0]
                most_common_pct = _pct(most_common, total_count)
                summary.append(f" - Most frequent: '{value_counts.index[0]}' ({most_common:,} times, {most_common_pct:.1f}%)")
            else:
                # Column holds only missing values: there is no mode to report.
                summary.append(" - Most frequent: n/a (no non-missing values)")
            if len(value_counts) > 1:
                entropy = stats.entropy(value_counts.values)
                summary.append(f" - Diversity index: {entropy:.2f}")

    # Date/Time analysis
    datetime_cols = df.select_dtypes(include=['datetime64']).columns
    if len(datetime_cols) > 0:
        summary.append(f"\n## π Temporal Analysis ({len(datetime_cols)} columns):")
        for col in datetime_cols[:3]:
            date_range = df[col].max() - df[col].min()
            summary.append(f"- **{col}**: {df[col].min()} to {df[col].max()} (span: {date_range.days} days)")

    # Advanced data profiling
    summary.append("\n## π Advanced Data Profiling:")
    duplicate_rows = df.duplicated().sum()
    summary.append(f"- **Duplicate rows**: {duplicate_rows:,} ({_pct(duplicate_rows, row_count):.2f}%)")

    # Strongly correlated numeric pairs (top 5 by absolute value)
    if len(numeric_cols) > 1:
        corr_matrix = df[numeric_cols].corr()
        labels = list(corr_matrix.columns)
        high_corr_pairs = []
        for i, left in enumerate(labels):
            for j in range(i + 1, len(labels)):
                corr_val = corr_matrix.iloc[i, j]
                if abs(corr_val) > 0.7:  # Strong correlation threshold
                    high_corr_pairs.append((left, labels[j], corr_val))
        if high_corr_pairs:
            summary.append("- **Strong correlations detected**:")
            for col1, col2, corr_val in sorted(high_corr_pairs, key=lambda x: abs(x[2]), reverse=True)[:5]:
                summary.append(f" - {col1} β {col2}: {corr_val:.3f}")

    # Data sample
    summary.append("\n## π Enhanced Data Sample (First 3 Rows):")
    # Number rows by position: index labels may be strings, dates, or the
    # shuffled labels left behind by sampling.
    for position, (_, row) in enumerate(df.head(3).iterrows(), start=1):
        summary.append(f"\n**Row {position}:**")
        for col, val in row.items():
            # Format values based on type
            if pd.api.types.is_scalar(val) and pd.isna(val):
                formatted_val = "β Missing"
            elif isinstance(val, (bool, np.bool_)):
                formatted_val = str(val)
            elif isinstance(val, (float, np.floating)):
                formatted_val = f"{val:,.2f}"
            elif isinstance(val, (int, np.integer)):
                formatted_val = f"{val:,}"
            else:
                formatted_val = str(val)[:50] + ("..." if len(str(val)) > 50 else "")
            summary.append(f" - **{col}**: {formatted_val}")

    return "\n".join(summary)
def generate_advanced_visualizations(self, df: pd.DataFrame) -> str:
    """Render a fixed sequence of Plotly charts for ``df`` as HTML fragments.

    Sections are produced in this order, each only when its data exists:
    missing-data bars, correlation heatmap (needs 2+ numeric columns),
    histogram + box plot for the first 4 numeric columns, bar + pie for the
    first 3 categorical columns with at most 25 distinct values, a monthly
    time series (first datetime column vs. first numeric column), an
    overview bar chart, and a data-quality gauge. The last two are always
    attempted.

    On success the list of fragments is also stored on
    ``self.current_charts``.

    Args:
        df: Frame to visualise.

    Returns:
        The fragments joined with newlines, or a ``<p>`` element carrying
        the error message if anything in the chart pipeline raises.
    """
    charts_html = []
    try:
        # 1. Enhanced Missing Data Visualization
        missing_data = df.isnull().sum()
        if missing_data.sum() > 0:
            missing_pct = (missing_data / len(df) * 100).round(2)
            # Side-by-side bars: absolute counts (left) and percentages (right).
            # Every column is plotted, including those with nothing missing.
            fig = make_subplots(
                rows=1, cols=2,
                subplot_titles=("Missing Values Count", "Missing Values Percentage"),
                specs=[[{"secondary_y": False}, {"secondary_y": False}]]
            )
            fig.add_trace(
                go.Bar(x=missing_data.index, y=missing_data.values, name="Count",
                       marker_color='rgb(255, 99, 132)'),
                row=1, col=1
            )
            fig.add_trace(
                go.Bar(x=missing_pct.index, y=missing_pct.values, name="Percentage",
                       marker_color='rgb(255, 159, 64)'),
                row=1, col=2
            )
            fig.update_layout(
                title_text="π Comprehensive Missing Data Analysis",
                title_x=0.5,
                height=500,
                showlegend=False
            )
            fig.update_xaxes(tickangle=-45)
            charts_html.append("<h3>π Data Quality Analysis</h3>")
            charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="missing_data_analysis"))

        # 2. Advanced Correlation Analysis
        # numeric_cols is defined outside the `if` because sections 3, 5 and 6 reuse it.
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 1:
            corr_matrix = df[numeric_cols].corr()
            # Mask the diagonal and upper triangle so each pair is shown once.
            mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
            corr_matrix_masked = corr_matrix.mask(mask)
            fig = px.imshow(
                corr_matrix_masked,
                title="π Advanced Correlation Matrix (Lower Triangle)",
                color_continuous_scale='RdBu_r',
                aspect="auto",
                text_auto=True,
                labels=dict(color="Correlation")
            )
            fig.update_layout(
                height=600,
                title_x=0.5,
                font=dict(size=10)
            )
            charts_html.append("<h3>π Statistical Relationships</h3>")
            charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="correlation_matrix"))

        # 3. Advanced Distribution Analysis
        if len(numeric_cols) > 0:
            charts_html.append("<h3>π Statistical Distributions</h3>")
            for i, col in enumerate(numeric_cols[:4]):  # Top 4 numeric columns
                # Create subplot with histogram and box plot
                fig = make_subplots(
                    rows=2, cols=1,
                    subplot_titles=(f"Distribution of {col}", f"Box Plot - {col}"),
                    vertical_spacing=0.12
                )
                # Histogram (30 bins); no density curve is added here.
                fig.add_trace(
                    go.Histogram(x=df[col].dropna(), name="Frequency",
                                 marker_color='rgb(75, 192, 192)', opacity=0.7,
                                 nbinsx=30),
                    row=1, col=1
                )
                # Box plot
                fig.add_trace(
                    go.Box(y=df[col].dropna(), name="Distribution",
                           marker_color='rgb(153, 102, 255)'),
                    row=2, col=1
                )
                # Mark mean (dashed red) and median (dotted blue) on the histogram.
                mean_val = df[col].mean()
                median_val = df[col].median()
                fig.add_vline(x=mean_val, line_dash="dash", line_color="red",
                              annotation_text=f"Mean: {mean_val:.2f}", row=1, col=1)
                fig.add_vline(x=median_val, line_dash="dot", line_color="blue",
                              annotation_text=f"Median: {median_val:.2f}", row=1, col=1)
                fig.update_layout(
                    height=600,
                    title_text=f"π Statistical Analysis: {col}",
                    title_x=0.5,
                    showlegend=False
                )
                charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id=f"distribution_{i}"))

        # 4. Enhanced Categorical Analysis
        # categorical_cols is reused by the overview chart in section 6.
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns
        if len(categorical_cols) > 0:
            charts_html.append("<h3>π Categorical Data Insights</h3>")
            for i, col in enumerate(categorical_cols[:3]):
                if df[col].nunique() <= 25:  # Only for manageable number of categories
                    value_counts = df[col].value_counts().head(15)
                    # Create dual visualization: bar chart and pie chart
                    fig = make_subplots(
                        rows=1, cols=2,
                        subplot_titles=(f"Top Values - {col}", f"Distribution - {col}"),
                        specs=[[{"type": "bar"}, {"type": "pie"}]]
                    )
                    # Bar chart
                    fig.add_trace(
                        go.Bar(x=value_counts.values, y=value_counts.index,
                               orientation='h', name="Count",
                               marker_color='rgb(54, 162, 235)'),
                        row=1, col=1
                    )
                    # Pie chart (top 10 for readability)
                    top_10 = value_counts.head(10)
                    fig.add_trace(
                        go.Pie(labels=top_10.index, values=top_10.values,
                               name="Distribution"),
                        row=1, col=2
                    )
                    fig.update_layout(
                        height=500,
                        title_text=f"π Category Analysis: {col}",
                        title_x=0.5,
                        showlegend=False
                    )
                    charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id=f"categorical_{i}"))

        # 5. Time Series Analysis (if datetime columns exist)
        # datetime_cols is reused by the overview chart in section 6.
        datetime_cols = df.select_dtypes(include=['datetime64']).columns
        if len(datetime_cols) > 0 and len(numeric_cols) > 0:
            charts_html.append("<h3>β° Temporal Analysis</h3>")
            # Only the first datetime column and the first numeric column are plotted.
            date_col = datetime_cols[0]
            value_col = numeric_cols[0]
            # Group by month for time series; work on a copy so the helper
            # column never leaks into the caller's frame.
            df_temp = df.copy()
            df_temp['month_year'] = df_temp[date_col].dt.to_period('M')
            monthly_data = df_temp.groupby('month_year')[value_col].agg(['mean', 'sum', 'count']).reset_index()
            # Periods are converted to strings for the x axis.
            monthly_data['month_year_str'] = monthly_data['month_year'].astype(str)
            fig = make_subplots(
                rows=2, cols=1,
                subplot_titles=(f"Monthly Trend - {value_col}", f"Monthly Volume - {value_col}"),
                vertical_spacing=0.1
            )
            # Trend line (monthly mean)
            fig.add_trace(
                go.Scatter(x=monthly_data['month_year_str'], y=monthly_data['mean'],
                           mode='lines+markers', name="Average",
                           line=dict(color='rgb(75, 192, 192)', width=3)),
                row=1, col=1
            )
            # Volume bars (monthly sum)
            fig.add_trace(
                go.Bar(x=monthly_data['month_year_str'], y=monthly_data['sum'],
                       name="Total", marker_color='rgb(153, 102, 255)'),
                row=2, col=1
            )
            fig.update_layout(
                height=600,
                title_text="π Time Series Analysis",
                title_x=0.5,
                showlegend=False
            )
            fig.update_xaxes(tickangle=-45)
            charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="timeseries_analysis"))

        # 6. Enhanced Dataset Overview Dashboard
        # All metrics share one y axis, so large row counts dwarf the others.
        summary_data = {
            'Metric': ['Total Rows', 'Total Columns', 'Numeric Columns', 'Categorical Columns',
                       'DateTime Columns', 'Missing Values', 'Duplicate Rows', 'Memory (MB)'],
            'Count': [
                len(df),
                len(df.columns),
                len(numeric_cols),
                len(categorical_cols),
                len(datetime_cols),
                df.isnull().sum().sum(),
                df.duplicated().sum(),
                round(df.memory_usage(deep=True).sum() / 1024**2, 2)
            ]
        }
        fig = px.bar(
            summary_data,
            x='Metric',
            y='Count',
            title="π Comprehensive Dataset Overview",
            color='Count',
            color_continuous_scale='Viridis',
            text='Count'
        )
        fig.update_traces(texttemplate='%{text}', textposition='outside')
        fig.update_layout(
            height=500,
            title_x=0.5,
            showlegend=False,
            xaxis_tickangle=-45
        )
        charts_html.append("<h3>π Dataset Dashboard</h3>")
        charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="overview_dashboard"))

        # 7. Data Quality Score Visualization
        # Score = 100 - (% of missing cells) - 10 * (fraction of duplicate rows),
        # floored at 0.
        total_cells = df.shape[0] * df.shape[1]
        missing_cells = df.isnull().sum().sum()
        duplicate_penalty = df.duplicated().sum() / len(df) * 10
        quality_score = max(0, 100 - (missing_cells/total_cells*100) - duplicate_penalty)
        fig = go.Figure(go.Indicator(
            mode = "gauge+number+delta",
            value = quality_score,
            domain = {'x': [0, 1], 'y': [0, 1]},
            title = {'text': "π Data Quality Score"},
            delta = {'reference': 95},  # delta is shown relative to a score of 95
            gauge = {
                'axis': {'range': [None, 100]},
                'bar': {'color': "darkblue"},
                'steps': [
                    {'range': [0, 50], 'color': "lightgray"},
                    {'range': [50, 80], 'color': "yellow"},
                    {'range': [80, 100], 'color': "lightgreen"}
                ],
                'threshold': {
                    'line': {'color': "red", 'width': 4},
                    'thickness': 0.75,
                    'value': 90
                }
            }
        ))
        fig.update_layout(height=400, title_x=0.5)
        charts_html.append("<h3>π― Quality Assessment</h3>")
        charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="quality_score"))

        # Keep the individual fragments; only reached when every chart succeeded.
        self.current_charts = charts_html
        return "\n".join(charts_html) if charts_html else "<p>No charts could be generated for this dataset.</p>"
    except Exception as e:
        # Any failure discards all charts built so far and reports the error as HTML.
        logger.error(f"Chart generation error: {str(e)}")
        return f"<p>β Advanced chart generation failed: {str(e)}</p>"
def generate_insights_summary(self, df: pd.DataFrame) -> str:
    """Produce rule-based markdown insights about ``df`` without calling the AI.

    Rules applied: dataset size (>100,000 or <100 rows), overall share of
    missing cells (>20% or <5%), numeric columns where more than 10% of the
    rows fall outside 1.5×IQR, and categorical columns whose distinct-value
    ratio exceeds 0.8 (reported as likely identifiers).

    Args:
        df: Frame to inspect. A frame with no rows or no columns yields a
            single "Empty Dataset" bullet instead of dividing by zero.

    Returns:
        Newline-joined markdown bullet list, starting with a heading.
    """
    insights = ["## π Quick Automated Insights:"]

    if df.empty:
        # Every ratio below has the row or cell count as its denominator.
        insights.append("- **Empty Dataset**: No rows or columns to analyze")
        return "\n".join(insights)

    row_count = len(df)

    # Data size insights
    if row_count > 100000:
        insights.append("- π **Large Dataset**: This is a substantial dataset that may reveal enterprise-level patterns")
    elif row_count < 100:
        insights.append("- π **Small Dataset**: Consider collecting more data for robust statistical analysis")

    # Missing data insights
    missing_pct = (df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100
    if missing_pct > 20:
        insights.append("- β οΈ **Data Quality Concern**: High percentage of missing data may impact analysis reliability")
    elif missing_pct < 5:
        insights.append("- β **Excellent Data Quality**: Very low missing data percentage")

    # Numerical insights: flag columns with many IQR outliers
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    outlier_cols = []
    for col in numeric_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        outside = (df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))
        if outside.sum() / row_count > 0.1:  # More than 10% outliers
            outlier_cols.append(col)
    if outlier_cols:
        insights.append(f"- π― **Outlier Detection**: {len(outlier_cols)} columns have significant outliers")

    # Categorical insights: near-unique text columns look like identifiers
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    high_cardinality_cols = [col for col in categorical_cols if df[col].nunique() / row_count > 0.8]
    if high_cardinality_cols:
        insights.append(f"- π **ID Fields Detected**: {len(high_cardinality_cols)} columns appear to be identifier fields")

    return "\n".join(insights)
def export_comprehensive_report(self, analysis_text: str, data_summary: str, file_name: str, format_type: str) -> Tuple[Optional[str], str]:
    """Write the analysis to a timestamped report file in the working directory.

    Args:
        analysis_text: The AI analysis text to embed.
        data_summary: The statistical summary to embed.
        file_name: Name of the analysed file. Only its base name (no
            directories, no extension) is used for the report's file name,
            so the report is always created in the current working
            directory; the value is passed unchanged to the report
            generators. Falls back to ``"data_analysis"`` when empty.
        format_type: ``"HTML"`` for the HTML report; any other value
            produces Markdown.

    Returns:
        ``(path, message)`` on success, ``(None, error_message)`` on failure.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Drop directory components (either separator style) so an uploaded
    # file's path cannot redirect where the report is written.
    bare_name = os.path.basename(file_name.replace("\\", "/")) if file_name else ""
    file_base_name = os.path.splitext(bare_name)[0] or "data_analysis"

    try:
        if format_type == "HTML":
            html_content = self.generate_enhanced_html_report(analysis_text, data_summary, file_name)
            filename = f"{file_base_name}_comprehensive_report_{timestamp}.html"
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(html_content)
            return filename, f"β Comprehensive HTML report generated! File: {filename}"
        else:  # Markdown
            report_content = self.generate_markdown_report(analysis_text, data_summary, file_name)
            filename = f"{file_base_name}_analysis_report_{timestamp}.md"
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(report_content)
            return filename, f"β Markdown report generated! File: {filename}"
    except Exception as e:
        # Export is best-effort: report the failure to the UI instead of raising.
        logger.error("Report export error: %s", e)
        return None, f"β Error generating {format_type} report: {str(e)}"
def generate_enhanced_html_report(self, analysis_text: str, data_summary: str, file_name: str = "Unknown") -> str: | |
"""Generate premium HTML report with advanced styling""" | |
html_template = """ | |
<!DOCTYPE html> | |
<html lang="en"> | |
<head> | |
<meta charset="UTF-8"> | |
<meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
<title>Advanced Data Analysis Report</title> | |
<link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css" rel="stylesheet"> | |
<style> | |
* { | |
box-sizing: border-box; | |
margin: 0; | |
padding: 0; | |
} | |
body { | |
font-family: 'Segoe UI', system-ui, -apple-system, sans-serif; | |
line-height: 1.7; | |
color: #2c3e50; | |
background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%); | |
min-height: 100vh; | |
} | |
.container { | |
max-width: 1400px; | |
margin: 0 auto; | |
padding: 20px; | |
} | |
.header { | |
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); | |
color: white; | |
padding: 40px; | |
border-radius: 15px; | |
margin-bottom: 30px; | |
text-align: center; | |
box-shadow: 0 10px 30px rgba(0,0,0,0.2); | |
} | |
.header h1 { | |
font-size: 2.5em; | |
margin-bottom: 10px; | |
text-shadow: 2px 2px 4px rgba(0,0,0,0.3); | |
} | |
.header p { | |
font-size: 1.2em; | |
opacity: 0.9; | |
} | |
.section { | |
background: white; | |
padding: 30px; | |
margin-bottom: 25px; | |
border-radius: 12px; | |
box-shadow: 0 5px 20px rgba(0,0,0,0.1); | |
border-left: 4px solid #667eea; | |
transition: transform 0.2s ease; | |
} | |
.section:hover { | |
transform: translateY(-2px); | |
box-shadow: 0 8px 25px rgba(0,0,0,0.15); | |
} | |
.metadata { | |
background: linear-gradient(135deg, #e8f4f8 0%, #f0f8ff 100%); | |
padding: 20px; | |
border-radius: 10px; | |
margin-bottom: 25px; | |
border: 1px solid #b3d9f2; | |
display: grid; | |
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); | |
gap: 15px; | |
} | |
.metadata-item { | |
display: flex; | |
align-items: center; | |
gap: 8px; | |
} | |
.metadata-item i { | |
color: #667eea; | |
font-size: 1.1em; | |
} | |
h1, h2, h3 { | |
color: #2c3e50; | |
margin-bottom: 15px; | |
} | |
h2 { | |
border-bottom: 2px solid #667eea; | |
padding-bottom: 10px; | |
display: flex; | |
align-items: center; | |
gap: 10px; | |
} | |
h2:before { | |
content: "π"; | |
font-size: 1.2em; | |
} | |
.chart-container { | |
margin: 25px 0; | |
padding: 20px; | |
background: linear-gradient(135deg, #f8f9ff 0%, #fff 100%); | |
border-radius: 10px; | |
border: 1px solid #e0e6ff; | |
} | |
.action-buttons { | |
display: flex; | |
gap: 15px; | |
margin: 20px 0; | |
flex-wrap: wrap; | |
} | |
.btn { | |
padding: 12px 24px; | |
border: none; | |
border-radius: 8px; | |
cursor: pointer; | |
font-size: 16px; | |
font-weight: 600; | |
transition: all 0.3s ease; | |
display: flex; | |
align-items: center; | |
gap: 8px; | |
text-decoration: none; | |
} | |
.btn-primary { | |
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); | |
color: white; | |
} | |
.btn-primary:hover { | |
transform: translateY(-2px); | |
box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4); | |
} | |
.btn-secondary { | |
background: #f8f9fa; | |
color: #495057; | |
border: 2px solid #dee2e6; | |
} | |
.btn-secondary:hover { | |
background: #e9ecef; | |
border-color: #adb5bd; | |
} | |
.footer { | |
text-align: center; | |
color: #6c757d; | |
margin-top: 40px; | |
padding: 30px; | |
background: white; | |
border-radius: 10px; | |
box-shadow: 0 5px 15px rgba(0,0,0,0.1); | |
} | |
.footer-links { | |
margin-top: 15px; | |
display: flex; | |
justify-content: center; | |
gap: 20px; | |
flex-wrap: wrap; | |
} | |
.footer-links a { | |
color: #667eea; | |
text-decoration: none; | |
font-weight: 500; | |
} | |
.footer-links a:hover { | |
text-decoration: underline; | |
} | |
pre { | |
background: #f8f9fa; | |
padding: 20px; | |
border-radius: 8px; | |
overflow-x: auto; | |
white-space: pre-wrap; | |
font-size: 14px; | |
border-left: 4px solid #28a745; | |
font-family: 'Consolas', 'Monaco', monospace; | |
} | |
.analysis-content { | |
font-size: 16px; | |
line-height: 1.8; | |
} | |
.analysis-content h1, | |
.analysis-content h2, | |
.analysis-content h3 { | |
margin-top: 25px; | |
margin-bottom: 15px; | |
} | |
.analysis-content ul, | |
.analysis-content ol { | |
margin-left: 20px; | |
margin-bottom: 15px; | |
} | |
.analysis-content li { | |
margin-bottom: 5px; | |
} | |
.analysis-content strong { | |
color: #2c3e50; | |
font-weight: 700; | |
} | |
.analysis-content code { | |
background: #f1f3f4; | |
padding: 2px 6px; | |
border-radius: 4px; | |
font-family: 'Consolas', monospace; | |
} | |
.analysis-content blockquote { | |
border-left: 4px solid #667eea; | |
padding-left: 20px; | |
margin: 20px 0; | |
font-style: italic; | |
color: #555; | |
} | |
table { | |
width: 100%; | |
border-collapse: collapse; | |
margin: 20px 0; | |
background: white; | |
border-radius: 8px; | |
overflow: hidden; | |
box-shadow: 0 2px 10px rgba(0,0,0,0.1); | |
} | |
th, td { | |
padding: 12px 15px; | |
text-align: left; | |
border-bottom: 1px solid #e9ecef; | |
} | |
th { | |
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); | |
color: white; | |
font-weight: 600; | |
text-transform: uppercase; | |
letter-spacing: 0.5px; | |
} | |
tr:hover { | |
background-color: #f8f9ff; | |
} | |
.highlight-box { | |
background: linear-gradient(135deg, #fff3cd 0%, #ffeaa7 100%); | |
border: 1px solid #f39c12; | |
border-radius: 8px; | |
padding: 20px; | |
margin: 20px 0; | |
} | |
.success-box { | |
background: linear-gradient(135deg, #d4edda 0%, #a8e6cf 100%); | |
border: 1px solid #28a745; | |
border-radius: 8px; | |
padding: 20px; | |
margin: 20px 0; | |
} | |
.warning-box { | |
background: linear-gradient(135deg, #f8d7da 0%, #ff7675 100%); | |
border: 1px solid #dc3545; | |
border-radius: 8px; | |
padding: 20px; | |
margin: 20px 0; | |
} | |
@media print { | |
.action-buttons, .btn { | |
display: none !important; | |
} | |
body { | |
background: white; | |
} | |
.section, .metadata, .footer { | |
box-shadow: none; | |
page-break-inside: avoid; | |
} | |
.header { | |
page-break-after: avoid; | |
} | |
} | |
@media (max-width: 768px) { | |
.container { | |
padding: 10px; | |
} | |
.header { | |
padding: 20px; | |
} | |
.header h1 { | |
font-size: 1.8em; | |
} | |
.section { | |
padding: 20px; | |
} | |
.metadata { | |
grid-template-columns: 1fr; | |
} | |
.action-buttons { | |
flex-direction: column; | |
} | |
} | |
</style> | |
<script> | |
function printReport() { | |
window.print(); | |
} | |
function exportPDF() { | |
window.print(); | |
} | |
function copyToClipboard(elementId) { | |
const element = document.getElementById(elementId); | |
const text = element.textContent; | |
navigator.clipboard.writeText(text).then(() => { | |
alert('Content copied to clipboard!'); | |
}); | |
} | |
// Add smooth scrolling | |
document.addEventListener('DOMContentLoaded', function() { | |
const links = document.querySelectorAll('a[href^="#"]'); | |
links.forEach(link => { | |
link.addEventListener('click', function(e) { | |
e.preventDefault(); | |
const target = document.querySelector(this.getAttribute('href')); | |
if (target) { | |
target.scrollIntoView({ behavior: 'smooth' }); | |
} | |
}); | |
}); | |
}); | |
</script> | |
</head> | |
<body> | |
<div class="container"> | |
<div class="header"> | |
<h1><i class="fas fa-chart-line"></i> Advanced Data Analysis Report</h1> | |
<p>Comprehensive AI-Powered Business Intelligence Dashboard</p> | |
</div> | |
<div class="metadata"> | |
<div class="metadata-item"> | |
<i class="fas fa-file-alt"></i> | |
<span><strong>File:</strong> {{ file_name }}</span> | |
</div> | |
<div class="metadata-item"> | |
<i class="fas fa-calendar-alt"></i> | |
<span><strong>Generated:</strong> {{ timestamp }}</span> | |
</div> | |
<div class="metadata-item"> | |
<i class="fas fa-robot"></i> | |
<span><strong>AI Model:</strong> OpenAI gpt-oss-20b</span> | |
</div> | |
<div class="metadata-item"> | |
<i class="fas fa-shield-alt"></i> | |
<span><strong>Version:</strong> Smart Analyzer Pro v2.0</span> | |
</div> | |
</div> | |
<div class="action-buttons"> | |
<button class="btn btn-primary" onclick="printReport()"> | |
<i class="fas fa-print"></i> Print as PDF | |
</button> | |
<button class="btn btn-secondary" onclick="copyToClipboard('ai-analysis')"> | |
<i class="fas fa-copy"></i> Copy Analysis | |
</button> | |
<button class="btn btn-secondary" onclick="copyToClipboard('technical-summary')"> | |
<i class="fas fa-code"></i> Copy Technical Data | |
</button> | |
</div> | |
<div class="section"> | |
<h2><i class="fas fa-brain"></i> AI-Powered Analysis & Strategic Insights</h2> | |
<div id="ai-analysis" class="analysis-content">{{ ai_analysis }}</div> | |
</div> | |
<div class="section"> | |
<h2><i class="fas fa-chart-bar"></i> Interactive Data Visualizations</h2> | |
<div class="chart-container"> | |
{{ charts_html }} | |
</div> | |
</div> | |
<div class="section"> | |
<h2><i class="fas fa-database"></i> Technical Data Profile</h2> | |
<pre id="technical-summary">{{ data_summary }}</pre> | |
</div> | |
<div class="footer"> | |
<div> | |
<h3><i class="fas fa-star"></i> Report Generated by AnalytixPro v2.0</h3> | |
<p>Powered by Advanced AI β’ Professional Business Intelligence</p> | |
</div> | |
<div class="footer-links"> | |
<a href="https://wa.me/8801719296601"><i class="fab fa-whatsapp"></i> WhatsApp Support</a> | |
<a href="https://mail.google.com/mail/?view=cm&fs=1&to=shukdevdatta@gmail.com" target="_blank"><i class="fas fa-envelope"></i> Email Support</a> | |
<a href="https://huggingface.co/shukdevdattaEX"><i class="fas fa-globe"></i> Visit Website</a> | |
</div> | |
<p style="margin-top: 15px; font-size: 0.9em; color: #6c757d;"> | |
Β© 2025 AnalytixPro. Professional data analysis made simple. | |
</p> | |
</div> | |
</div> | |
</body> | |
</html> | |
""" | |
template = Template(html_template) | |
ai_analysis_html = markdown.markdown(analysis_text, extensions=['extra', 'tables', 'toc']) | |
charts_content = "\n".join(self.current_charts) if self.current_charts else "<p>No visualizations available</p>" | |
return template.render( | |
file_name=file_name, | |
timestamp=datetime.now().strftime('%Y-%m-%d %H:%M:%S'), | |
ai_analysis=ai_analysis_html, | |
charts_html=charts_content, | |
data_summary=data_summary | |
) | |
def generate_pdf_ready_report(self, analysis_text: str, data_summary: str, file_name: str) -> str:
    """Build the PDF-ready report by delegating to the HTML report generator.

    All three arguments are forwarded unchanged to
    ``generate_enhanced_html_report`` and its result is returned as-is.
    """
    html_report = self.generate_enhanced_html_report(analysis_text, data_summary, file_name)
    return html_report
def generate_excel_report(self, analysis_text: str, data_summary: str, filename: str):
    """Write a multi-sheet Excel workbook to ``filename``.

    Sheets are written in this order:

    * ``Original_Data`` - the currently loaded DataFrame (skipped when
      ``self.current_df`` is ``None``).
    * ``Data_Summary`` - ``data_summary`` split on newlines, one row per line.
    * ``AI_Analysis`` - ``analysis_text`` split on newlines, one row per line.
    * ``Statistical_Summary`` - ``describe()`` of the numeric columns (skipped
      when no DataFrame is loaded or it has no numeric columns).
    """
    frame = self.current_df
    with pd.ExcelWriter(filename, engine='openpyxl') as workbook:
        # Sheet 1: the raw dataset, only when something has been loaded.
        if frame is not None:
            frame.to_excel(workbook, sheet_name='Original_Data', index=False)

        # Sheets 2 and 3: each text blob becomes a single-column sheet.
        text_sheets = (
            ('Data_Summary', 'Analysis_Summary', data_summary),
            ('AI_Analysis', 'AI_Analysis', analysis_text),
        )
        for sheet_name, column_name, text in text_sheets:
            lines_frame = pd.DataFrame({column_name: text.split('\n')})
            lines_frame.to_excel(workbook, sheet_name=sheet_name, index=False)

        # Sheet 4: descriptive statistics; the index is kept because it holds
        # the statistic names (count, mean, ...).
        if frame is not None:
            numeric_columns = frame.select_dtypes(include=[np.number]).columns
            if len(numeric_columns) > 0:
                frame[numeric_columns].describe().to_excel(
                    workbook, sheet_name='Statistical_Summary'
                )
def generate_markdown_report(self, analysis_text: str, data_summary: str, file_name: str) -> str:
    """Return the analysis as a Markdown document.

    Args:
        analysis_text: AI analysis, inserted verbatim under the summary heading.
        data_summary: Technical data profile, inserted verbatim under its heading.
        file_name: Name shown in the report header.

    Returns:
        The complete Markdown report as a single string.

    Every ``---`` separator sits between blank lines.  Without the blank
    line, Markdown treats ``---`` directly under a line of text as a setext
    heading underline instead of a horizontal rule.  The stray ``text``
    prefix that used to precede the third separator has been removed for the
    same reason: ``text---`` is rendered as literal text.
    """
    generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    return f"""# π Advanced Data Analysis Report

**File:** {file_name}
**Generated:** {generated_at}
**Analyzer:** AnalytixPro v2.0
**AI Model:** OpenAI gpt-oss-20b via Chutes API

---

## π Executive Summary & AI Insights

{analysis_text}

---

## π Technical Data Profile

{data_summary}

---

## π Support & Contact

- **WhatsApp Support:** +8801719296601
- **Email:** https://tinyurl.com/email-for-contact
- **Documentation:** Available upon request

---

*This report was generated using AnalytixPro v2.0 - Professional data analysis powered by advanced AI technology.*
"""
# Initialize the enhanced analyzer.
# This single module-level instance is the one the handler functions below
# read and mutate (current_df, current_charts, conversation_history, ...).
analyzer = AdvancedDataAnalyzer()
def _coerce_sample_size(sample_size) -> Optional[int]:
    """Convert a UI-supplied sample size into a positive ``int`` or ``None``.

    Accepts ints, floats (a numeric input may deliver e.g. ``1000.0``) and
    numeric strings.  Anything missing, non-numeric, non-finite or below 1
    means "no sampling" and yields ``None``.
    """
    if sample_size is None or isinstance(sample_size, bool):
        return None
    try:
        value = int(float(sample_size))
    except (TypeError, ValueError, OverflowError):
        # Non-numeric text, NaN and infinity all end up here.
        return None
    return value if value >= 1 else None


async def comprehensive_analysis(file, api_key, user_question="", analysis_type="comprehensive", sample_size=None, progress=gr.Progress()):
    """Validate the inputs, profile the file and request the AI analysis.

    Args:
        file: Uploaded file object; its ``name`` attribute is used as the path.
        api_key: API key, checked with ``analyzer.validate_api_key``.
        user_question: Optional question forwarded to the AI call.
        analysis_type: Analysis mode label forwarded to the AI call.
        sample_size: Optional row limit handed to ``analyzer.process_file``.
            Ints, floats and numeric strings are accepted; anything else
            disables sampling.
        progress: Gradio progress callback.

    Returns:
        A 6-tuple ``(response_markdown, data_summary, data_preview_html,
        charts_html, file_path, ai_analysis)``.  On any failure the first
        element carries the error message, the fifth is ``None`` and the
        remaining ones are empty strings.
    """
    # Used to report the real elapsed time instead of a made-up figure.
    started_at = datetime.now()

    # Validation phase
    progress(0.05, desc="π Validating inputs...")
    if not file:
        return "β Please upload a data file.", "", "", "", None, ""
    is_valid_key, key_msg = analyzer.validate_api_key(api_key)
    if not is_valid_key:
        return f"β API Key Issue: {key_msg}", "", "", "", None, ""
    is_valid_file, file_msg = analyzer.validate_file(file)
    if not is_valid_file:
        return f"β File Issue: {file_msg}", "", "", "", None, ""

    progress(0.15, desc="π Loading and processing file...")
    try:
        # Process file with optional sampling.  The previous
        # ``str(sample_size).isdigit()`` test rejected float input such as
        # ``1000.0`` and therefore silently ignored the requested sample size.
        sample_size_int = _coerce_sample_size(sample_size)
        df, data_summary, charts_html = analyzer.process_file(file.name, sample_size_int)

        progress(0.40, desc="π Generating visualizations...")
        # Generate quick insights
        quick_insights = analyzer.generate_insights_summary(df)

        progress(0.60, desc="π€ AI analysis in progress...")
        # Get AI analysis
        ai_analysis = await analyzer.analyze_with_chutes(
            api_key,
            data_summary + "\n" + quick_insights,
            user_question,
            analysis_type
        )

        progress(0.90, desc="β¨ Finalizing results...")
        elapsed_seconds = (datetime.now() - started_at).total_seconds()
        # Format response with enhanced styling.  Blank lines surround the
        # ``---`` rule so Markdown does not read it as a heading underline.
        response = f"""# π― Analysis Complete!
## π Key Findings
{ai_analysis}
{quick_insights}

---

**π Analysis Details:**
- **Processed**: {len(df):,} rows Γ {df.shape[1]} columns
- **Analysis Type**: {analysis_type.title()}
- **Processing Time**: {elapsed_seconds:.1f} seconds
- **AI Model**: OpenAI gpt-oss-20b
- **Generated**: {datetime.now().strftime('%H:%M:%S')}

*π‘ Use the tabs below to explore data preview, download reports, or ask specific questions.*
"""
        # Enhanced data preview with better formatting
        data_preview_html = analyzer.generate_enhanced_preview(df)
        progress(1.0, desc="β Analysis complete!")
        return response, data_summary, data_preview_html, charts_html, file.name, ai_analysis
    except Exception as e:
        # Top-level boundary for the UI: log with traceback, report the message.
        logger.exception("Comprehensive analysis error: %s", e)
        return f"β **Analysis Failed**: {str(e)}", "", "", "", None, ""
def sync_comprehensive_analysis(file, api_key, user_question="", analysis_type="comprehensive", sample_size=None, progress=gr.Progress()):
    """Run ``comprehensive_analysis`` to completion from synchronous code.

    The coroutine is created with the arguments passed through unchanged and
    driven by ``asyncio.run``; its result is returned as-is.
    """
    analysis_coro = comprehensive_analysis(
        file, api_key, user_question, analysis_type, sample_size, progress
    )
    return asyncio.run(analysis_coro)
def quick_question_analysis(file, api_key, question, progress=gr.Progress()):
    """Answer one specific question about the uploaded data.

    Args:
        file: Uploaded file object passed on to ``comprehensive_analysis``.
        api_key: API key passed on to ``comprehensive_analysis``.
        question: The user's question; ``None`` or blank text is rejected.
        progress: Gradio progress callback.

    Returns:
        The Markdown response (first element of the analysis result), or an
        error message when no question was entered.
    """
    # A text input can deliver None as well as ""; treat both as "no question"
    # instead of crashing on ``None.strip()``.
    if not (question or "").strip():
        return "β Please enter a specific question about your data."
    result = asyncio.run(comprehensive_analysis(file, api_key, question, "question", None, progress))
    return result[0]  # Return just the analysis text
# Static stylesheet emitted in front of every preview; kept out of the
# f-string below so the CSS braces do not need doubling.
_PREVIEW_TABLE_STYLE = """<style>
.table {
width: 100%;
border-collapse: collapse;
margin: 20px 0;
font-size: 14px;
background: white;
border-radius: 8px;
overflow: hidden;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
.table th {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 12px 8px;
text-align: left;
font-weight: bold;
position: sticky;
top: 0;
z-index: 10;
}
.table td {
padding: 10px 8px;
border-bottom: 1px solid #dee2e6;
max-width: 200px;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}
.table tr:hover {
background-color: #f8f9ff;
}
.table tr:nth-child(even) {
background-color: #f8f9fa;
}
#stats-table {
font-size: 12px;
}
#stats-table th {
background: linear-gradient(135deg, #28a745 0%, #20c997 100%);
}
.preview-header {
background: linear-gradient(135deg, #e3f2fd 0%, #f3e5f5 100%);
padding: 15px;
border-radius: 8px;
margin-bottom: 15px;
border-left: 4px solid #667eea;
}
</style>"""


def generate_enhanced_preview(df: pd.DataFrame, rows: int = 20) -> str:
    """Render an HTML preview of ``df`` with optional numeric statistics.

    Args:
        df: The DataFrame to preview.
        rows: Number of leading rows to show.  Integral floats (as a slider
            may deliver) are accepted and converted to ``int``.

    Returns:
        An HTML fragment consisting of a stylesheet, a header with row and
        column counts, a ``describe()`` table for the numeric columns (only
        when there are any) and the table of the first ``rows`` rows.

    Cell values are HTML-escaped: the data comes from user-uploaded files, so
    rendering it verbatim would let a crafted cell inject markup or script
    into the page.
    """
    rows = int(rows)  # ``df.head`` rejects float row counts
    preview_df = df.head(rows)

    # Generate basic statistics for numeric columns
    stats_html = ""
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        stats_df = df[numeric_cols].describe().round(2)
        stats_table = stats_df.to_html(classes="table table-striped", table_id="stats-table")
        stats_html = f"""
<div style="margin-bottom: 20px;">
<h4>π Quick Statistics (Numeric Columns)</h4>
{stats_table}
</div>
"""

    # Main data preview (escape=True keeps uploaded values inert)
    preview_html = preview_df.to_html(
        classes="table table-striped table-hover",
        table_id="data-preview-table",
        escape=True
    )
    return f"""
{_PREVIEW_TABLE_STYLE}
<div class="preview-header">
<h4>π Data Preview - First {rows} Rows</h4>
<p><strong>Total Rows:</strong> {len(df):,} | <strong>Columns:</strong> {df.shape[1]} | <strong>Showing:</strong> {len(preview_df)} rows</p>
</div>
{stats_html}
{preview_html}
"""
# Bind the method to the analyzer instance.
# The function is stored as a plain instance attribute, so no ``self`` is
# passed implicitly: callers invoke it as
# ``analyzer.generate_enhanced_preview(df)`` or ``(df, rows)``.
analyzer.generate_enhanced_preview = generate_enhanced_preview
def clear_all_data():
    """Drop everything cached on the analyzer and return blank UI values.

    Returns:
        A 9-tuple of reset values: ``None`` in the first and eighth positions
        and an empty string everywhere else.
    """
    reset_state = {
        "current_df": None,
        "current_charts": None,
        "conversation_history": [],
        "analysis_cache": {},
    }
    for attribute, empty_value in reset_state.items():
        setattr(analyzer, attribute, empty_value)

    blank = ""
    return (None,) + (blank,) * 6 + (None, blank)
def export_report(analysis_text, data_summary, file_name, format_choice, ai_analysis=""):
    """Export the latest analysis in the requested format.

    Args:
        analysis_text: Text currently shown in the analysis panel; used only
            when ``ai_analysis`` is empty.
        data_summary: Technical summary passed on to the exporter.
        file_name: Path of the analysed file.  It is empty or ``None`` until
            an analysis has succeeded.
        format_choice: Export format label passed on to the exporter.
        ai_analysis: Raw AI analysis; preferred over ``analysis_text``.

    Returns:
        ``(path, status_message)`` as produced by
        ``analyzer.export_comprehensive_report``, or ``(None, message)`` when
        there is nothing to export.
    """
    content_to_export = ai_analysis or analysis_text
    # ``file_name`` is checked as well as the text: the analysis panel holds a
    # welcome or error message before any analysis has succeeded, so text
    # alone does not prove there is a report to export.
    if not file_name or not content_to_export:
        return None, "β No analysis data available for download."
    result = analyzer.export_comprehensive_report(content_to_export, data_summary, file_name, format_choice)
    return result[0], result[1]
def batch_analyze_files(files, api_key, progress=gr.Progress()):
    """Run a quick analysis on each uploaded file and merge the results.

    Args:
        files: Uploaded file objects; each one's ``name`` attribute is its path.
        api_key: API key forwarded to ``comprehensive_analysis``.
        progress: Gradio progress callback for the batch as a whole.

    Returns:
        One Markdown document with a section per file, or an error message
        when no files were supplied.  A failure on one file is reported in
        its section and does not stop the batch.
    """
    if not files:
        return "β No files uploaded for batch analysis."
    results = []
    total_files = len(files)
    for index, file in enumerate(files):
        file_name = os.path.basename(file.name)
        # Report the share of files already finished, so the bar does not
        # show 100% while the last file is still being processed.
        progress(index / total_files, desc=f"Processing file {index + 1}/{total_files}: {file_name}")
        try:
            # Each file is limited to a 1000-row "quick" analysis.
            result = asyncio.run(comprehensive_analysis(file, api_key, "", "quick", 1000, gr.Progress()))
            # Blank lines keep ``---`` a rule even when the text above it
            # does not end with a newline.
            results.append(f"## π {file_name}\n\n{result[0]}\n\n---\n")
        except Exception as e:
            logger.exception("Batch analysis failed for %s", file_name)
            results.append(f"## β {file_name}\n\nError: {str(e)}\n\n---\n")
    progress(1.0, desc="Batch analysis complete")
    return "\n".join(results)
# Create the enhanced Gradio interface
# The ``css`` string defines the classes referenced through ``elem_classes``
# (config-section, results-section, upload-area) and by the raw HTML snippets
# (main-header, feature-grid, feature-card).
# NOTE(review): the indentation of this section was lost in the copy this was
# reviewed from; the container nesting below is a reconstruction - confirm it
# against the running UI.
with gr.Blocks(
    title="π AnalytixPro v2.0",
    theme=gr.themes.Ocean(),
    css="""
.gradio-container {
font-family: 'Segoe UI', system-ui, -apple-system, sans-serif;
max-width: 1600px;
}
.main-header {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 30px;
border-radius: 15px;
margin-bottom: 20px;
text-align: center;
}
.upload-area {
border: 2px dashed #667eea;
border-radius: 12px;
padding: 25px;
text-align: center;
background: linear-gradient(135deg, #f8f9ff 0%, #fff 100%);
transition: all 0.3s ease;
}
.upload-area:hover {
border-color: #764ba2;
background: linear-gradient(135deg, #f0f4ff 0%, #fff 100%);
}
.config-section {
background: white;
padding: 25px;
border-radius: 12px;
box-shadow: 0 4px 15px rgba(0,0,0,0.1);
border-left: 4px solid #667eea;
}
.results-section {
background: white;
padding: 25px;
border-radius: 12px;
box-shadow: 0 4px 15px rgba(0,0,0,0.1);
border-left: 4px solid #28a745;
}
.tab-content {
background: white;
border-radius: 8px;
padding: 20px;
margin-top: 10px;
}
.feature-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
gap: 15px;
margin: 20px 0;
}
.feature-card {
background: linear-gradient(135deg, #f8f9ff 0%, #fff 100%);
padding: 20px;
border-radius: 10px;
border: 1px solid #e0e6ff;
text-align: center;
}
"""
) as app:
    # State variables
    # Both are filled by the main analysis handler and read again on export.
    current_file_name = gr.State("")
    current_ai_analysis = gr.State("")
    # Header
    gr.HTML("""
<div class="main-header">
<h1>π AnalytixPro v2.0</h1>
<p>Advanced AI-Powered Data Analysis & Business Intelligence Platform</p>
<p style="opacity: 0.9; margin-top: 10px;">
β¨ Enhanced with Advanced Statistics β’ π― Multi-format Support β’ π Interactive Visualizations β’ π± Mobile Optimized
</p>
</div>
""")
    with gr.Row():
        # Left column: credentials, upload and analysis options.
        with gr.Column(scale=1, elem_classes=["config-section"]):
            gr.Markdown("### βοΈ Configuration & Upload")
            api_key_input = gr.Textbox(
                label="π Chutes API Key",
                placeholder="sk-chutes-your-api-key-here...",
                type="password",
                lines=1,
                info="π Get your free API key from chutes.ai"
            )
            with gr.Group():
                file_input = gr.File(
                    label="π Upload Data File",
                    file_types=[".csv", ".xlsx", ".xls", ".json", ".parquet", ".tsv"],
                    file_count="single",
                    elem_classes=["upload-area"]
                )
            with gr.Row():
                analysis_type = gr.Dropdown(
                    choices=["comprehensive", "quick", "statistical"],
                    value="comprehensive",
                    label="π― Analysis Type",
                    info="Choose analysis depth"
                )
                sample_size = gr.Number(
                    label="π Sample Size",
                    # placeholder="Leave empty for full dataset",
                    minimum=100,
                    maximum=50000,
                    info="Optional: Limit rows for faster processing"
                )
            with gr.Row():
                analyze_btn = gr.Button("π Analyze Data", variant="primary", size="lg")
                clear_btn = gr.Button("ποΈ Clear All", variant="secondary")
            # Enhanced file information panel
            with gr.Group():
                gr.Markdown("### π File Information")
                file_stats = gr.HTML(
                    value="<div style='padding: 15px; background: #f8f9fa; border-radius: 8px; text-align: center;'>π Upload a file to see detailed information...</div>"
                )
        # Right column: the main analysis output.
        with gr.Column(scale=2, elem_classes=["results-section"]):
            gr.Markdown("### π― Analysis Results")
            # The initial value is a welcome message, so this component is
            # non-empty even before any analysis has run.
            analysis_output = gr.Markdown(
                value="""## π Welcome to AnalytixPro v2.0!
**π Enhanced Features:**
- β **Multi-format Support**: CSV, Excel, JSON, Parquet, TSV
- β **Advanced Statistics**: Correlation, outlier detection, distribution analysis
- β **Interactive Visualizations**: Professional charts and dashboards
- β **AI-Powered Insights**: GPT-powered business intelligence
- β **Export Options**: HTML, Markdown
- β **Batch Processing**: Analyze multiple files at once
- β **Mobile Optimized**: Works on all devices
**π How to Get Started:**
1. Enter your Chutes API key
2. Upload your data file
3. Choose analysis type
4. Click "Analyze Data"
5. Explore results in the tabs below!
*Ready for professional-grade data analysis! π―*""",
                show_label=False
            )
    # Enhanced tab interface
    with gr.Tabs():
        with gr.Tab("π¬ Ask Specific Questions", elem_id="questions-tab"):
            gr.Markdown("### π Interactive Data Q&A")
            with gr.Row():
                question_input = gr.Textbox(
                    label="β What would you like to know about your data?",
                    placeholder="""Try asking specific questions like:
β’ What are the top 5 performing segments by revenue?
β’ Are there any seasonal patterns in the sales data?
β’ Which customer segments have the highest lifetime value?
β’ What anomalies or outliers should I be concerned about?
β’ How do different product categories compare in profitability?
β’ What trends do you see in the time series data?""",
                    lines=4
                )
            with gr.Row():
                ask_btn = gr.Button("π Get AI Answer", variant="primary")
                quick_insight_btn = gr.Button("π‘ Quick Insights", variant="secondary")
            question_output = gr.Markdown()
        with gr.Tab("π Data Preview & Statistics"):
            gr.Markdown("### π Dataset Explorer")
            with gr.Row():
                preview_rows = gr.Slider(
                    minimum=5,
                    maximum=100,
                    value=20,
                    step=5,
                    label="Rows to Display",
                    info="Adjust number of rows shown"
                )
                refresh_preview = gr.Button("π Refresh Preview", variant="secondary")
            data_preview = gr.HTML(
                label="Dataset Preview",
                value="<div style='text-align: center; padding: 40px; color: #666;'>π Upload and analyze a file to see preview...</div>"
            )
        # This tab is created hidden (visible=False); charts_display still
        # receives the charts HTML from the main analysis handler.
        with gr.Tab("π Visualizations & Charts", visible=False):
            gr.Markdown("### π¨ Interactive Data Visualizations")
            charts_display = gr.HTML(
                value="<div style='text-align: center; padding: 40px; color: #666;'>π Charts will appear here after analysis...</div>"
            )
        with gr.Tab("π Technical Summary"):
            gr.Markdown("### π Detailed Technical Analysis")
            raw_summary = gr.Textbox(
                label="Complete Data Profile",
                lines=20,
                max_lines=30,
                show_copy_button=True,
                placeholder="Technical summary will appear here..."
            )
        with gr.Tab("πΎ Export & Reports"):
            gr.Markdown("### π₯ Download Professional Reports")
            with gr.Row():
                format_choice = gr.Radio(
                    choices=["HTML", "Markdown"],
                    value="HTML",
                    label="π Report Format",
                    info="Choose your preferred export format"
                )
                # NOTE(review): include_charts is not passed to any handler in
                # the event wiring of this file section - confirm intent.
                include_charts = gr.Checkbox(
                    label="π Include Charts",
                    value=True,
                    info="Include visualizations in report"
                )
            with gr.Row():
                download_btn = gr.Button("π₯ Generate Report", variant="primary", size="lg")
                # NOTE(review): batch_export_btn has no click handler in the
                # event wiring of this file section - confirm intent.
                batch_export_btn = gr.Button("π¦ Batch Export", variant="secondary")
            download_status = gr.Textbox(label="π Export Status", interactive=False)
            download_file = gr.File(label="π Download Your Report", visible=True)
        with gr.Tab("π Batch Analysis"):
            gr.Markdown("### π Analyze Multiple Files")
            gr.Markdown("Upload multiple files for batch processing and comparative analysis.")
            # Batch upload accepts only CSV and Excel, a narrower set than the
            # single-file upload above.
            batch_files = gr.File(
                label="π Upload Multiple Files",
                file_count="multiple",
                file_types=[".csv", ".xlsx", ".xls"]
            )
            batch_analyze_btn = gr.Button("π Batch Analyze", variant="primary")
            batch_results = gr.Markdown()
        # with gr.Tab("π Data Comparison"):
        # gr.Markdown("### βοΈ Compare Datasets")
        # gr.Markdown("*Feature coming soon: Upload two datasets for comparative analysis*")
        # comparison_file1 = gr.File(label="π First Dataset", file_count="single")
        # comparison_file2 = gr.File(label="π Second Dataset", file_count="single")
        # compare_btn = gr.Button("βοΈ Compare Datasets", variant="primary", interactive=False)
        # comparison_results = gr.Markdown(value="*Comparison feature in development*")
# Enhanced helper functions
def update_file_stats(file):
    """Build the HTML "File Details" card shown after a file is selected.

    Args:
        file: The uploaded file - either an object whose ``name`` attribute
            is the path, or the path itself.  A falsy value means no upload.

    Returns:
        An HTML fragment: a placeholder when nothing is uploaded, a details
        card (name, size in MB, format, estimated rows, time, status) on
        success, or an error box when the file cannot be inspected.
    """
    from html import escape  # local import: only this helper needs it

    if not file:
        return "<div style='padding: 15px; background: #f8f9fa; border-radius: 8px; text-align: center;'>π No file uploaded</div>"
    try:
        file_path = getattr(file, "name", file)
        file_size = os.path.getsize(file_path) / (1024 * 1024)
        file_name = os.path.basename(file_path)
        file_ext = os.path.splitext(file_name)[1].upper()
        # Quick file peek for row estimation
        try:
            if file_ext.lower() in ('.csv', '.tsv'):
                # Count lines in binary mode so files that are not valid
                # UTF-8 can still be estimated.
                with open(file_path, 'rb') as f:
                    lines = sum(1 for _ in f)
                # Subtract header; an empty file has zero rows, not -1.
                estimated_rows = max(lines - 1, 0)
            elif file_ext.lower() in ('.xlsx', '.xls'):
                # Header-only read, kept as a check that the workbook opens;
                # the row count itself is not determined here.
                pd.read_excel(file_path, nrows=0)
                estimated_rows = "Reading..."
            else:
                estimated_rows = "Unknown"
        except Exception:
            # Best effort only: the estimate is informational.
            estimated_rows = "Could not estimate"
        # The file name comes from the upload, so escape it before it is
        # placed into HTML.
        return f"""
<div style='padding: 20px; background: linear-gradient(135deg, #e8f4f8 0%, #f0f8ff 100%); border-radius: 10px; border: 1px solid #b3d9f2;'>
<h4 style='color: #2c3e50; margin-bottom: 15px;'>π File Details</h4>
<div style='display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 10px;'>
<div><strong>π Name:</strong><br>{escape(file_name)}</div>
<div><strong>π Size:</strong><br>{file_size:.2f} MB</div>
<div><strong>π§ Format:</strong><br>{escape(file_ext[1:])} File</div>
<div><strong>π Est. Rows:</strong><br>{estimated_rows}</div>
<div><strong>β° Uploaded:</strong><br>{datetime.now().strftime('%H:%M:%S')}</div>
<div><strong>β Status:</strong><br>Ready to analyze</div>
</div>
</div>
"""
    except Exception as e:
        # The message can contain the (user-controlled) path, so escape it too.
        return f"""
<div style='padding: 15px; background: #f8d7da; border-radius: 8px; border: 1px solid #dc3545;'>
β <strong>File Error:</strong> {escape(str(e))}
</div>
"""
def handle_main_analysis(file, api_key, analysis_type, sample_size, progress=gr.Progress()):
    """Run the main analysis and return exactly six output values.

    Args:
        file: Uploaded file object.
        api_key: API key entered by the user.
        analysis_type: Selected analysis mode.
        sample_size: Optional row limit from the UI.
        progress: Gradio progress callback.

    Returns:
        A 6-tuple taken from ``sync_comprehensive_analysis``.  Longer results
        are truncated to their first six values; shorter ones are padded with
        empty strings.
    """
    result = sync_comprehensive_analysis(file, api_key, "", analysis_type, sample_size, progress)
    # Pad instead of indexing position by position: the previous fallback
    # branch raised IndexError for results shorter than three values.
    values = list(result[:6])
    values.extend([""] * (6 - len(values)))
    return tuple(values)
def refresh_data_preview(rows):
    """Re-render the data preview for the requested number of rows.

    Returns the preview HTML for ``analyzer.current_df``, or a placeholder
    message when no DataFrame is currently loaded.
    """
    loaded_df = analyzer.current_df
    if loaded_df is None:
        return "<div style='text-align: center; padding: 40px; color: #666;'>π No data loaded</div>"
    return analyzer.generate_enhanced_preview(loaded_df, rows)
# Event handlers
# NOTE(review): these calls reference components created in the
# ``with gr.Blocks(...) as app:`` section and presumably belong inside that
# block's body - confirm the nesting.

# Main analysis: fills the result panel, technical summary, preview and charts
# and stores the file path and raw AI analysis in the two State components.
analyze_btn.click(
    fn=handle_main_analysis,
    inputs=[file_input, api_key_input, analysis_type, sample_size],
    outputs=[analysis_output, raw_summary, data_preview, charts_display, current_file_name, current_ai_analysis],
    show_progress=True
)
# Free-form question about the uploaded file.
ask_btn.click(
    fn=quick_question_analysis,
    inputs=[file_input, api_key_input, question_input],
    outputs=[question_output],
    show_progress=True
)
# Canned "quick insights" request; only the first element of the analysis
# result (the Markdown response) is displayed.
quick_insight_btn.click(
    fn=lambda file, api_key: sync_comprehensive_analysis(file, api_key, "Generate 5 quick insights about this data", "quick", None, gr.Progress())[0],
    inputs=[file_input, api_key_input],
    outputs=[question_output],
    show_progress=True
)
# Refresh the file information card whenever the selected file changes.
file_input.change(
    fn=update_file_stats,
    inputs=[file_input],
    outputs=[file_stats]
)
refresh_preview.click(
    fn=refresh_data_preview,
    inputs=[preview_rows],
    outputs=[data_preview]
)
# clear_all_data returns nine values, one per output listed here, in order.
clear_btn.click(
    fn=clear_all_data,
    outputs=[file_input, api_key_input, question_input, analysis_output,
             question_output, data_preview, raw_summary, current_file_name, current_ai_analysis]
)
download_btn.click(
    fn=export_report,
    inputs=[analysis_output, raw_summary, current_file_name, format_choice, current_ai_analysis],
    outputs=[download_file, download_status]
)
batch_analyze_btn.click(
    fn=batch_analyze_files,
    inputs=[batch_files, api_key_input],
    outputs=[batch_results],
    show_progress=True
)
# Enhanced features section
# Static marketing grid; uses the feature-grid / feature-card CSS classes
# defined in the Blocks ``css`` string.
# NOTE(review): presumably still inside the ``with gr.Blocks(...) as app:``
# body - confirm the nesting.
gr.HTML("""
<div style="margin-top: 30px;">
<h3 style="text-align: center; color: #2c3e50; margin-bottom: 20px;">π Key Features & Capabilities</h3>
<div class="feature-grid">
<div class="feature-card">
<h4>π§ Advanced File Support</h4>
<p>CSV, Excel, JSON, Parquet, TSV with intelligent type detection</p>
</div>
<div class="feature-card">
<h4>π Statistical Analysis</h4>
<p>Correlation matrices, outlier detection, distribution analysis</p>
</div>
<div class="feature-card">
<h4>π€ AI-Powered Insights</h4>
<p>GPT-powered business intelligence and recommendations</p>
</div>
<div class="feature-card">
<h4>π Interactive Charts</h4>
<p>Professional visualizations with hover effects and zoom</p>
</div>
<div class="feature-card">
<h4>πΎ Multiple Export Formats</h4>
<p>HTML, Markdown with embedded charts</p>
</div>
<div class="feature-card">
<h4>π Batch Processing</h4>
<p>Analyze multiple files simultaneously for comparison</p>
</div>
</div>
</div>
""")
# Collapsed-by-default help section with static usage guidance.
with gr.Accordion("π‘ Pro Tips", open=False):
    gr.Markdown("""
### π― Data Preparation:
- β Use descriptive column names (e.g., "Monthly_Revenue" instead of "Col1")
- β Ensure consistent date formats (YYYY-MM-DD recommended)
- β Remove completely empty rows/columns before upload
- β For large files (>10MB), consider using sample size option
### π Analysis Optimization:
- **Comprehensive**: Full statistical analysis with AI insights (recommended for business reports)
- **Quick**: Fast overview for initial data exploration
- **Statistical**: Focus on mathematical relationships and patterns
### π Question Examples for Better AI Responses:
- "What factors most strongly correlate with customer churn?"
- "Which time periods show the highest sales performance?"
- "Are there any data quality issues I should address?"
- "What are the key business opportunities in this dataset?"
### π₯ Export Recommendations:
- **HTML**: Best for sharing interactive reports with stakeholders
- **Markdown**: Great for technical documentation and version control
### β‘ Performance Notes:
- Files under 5MB: Instant processing
- Files 5-20MB: ~5-10 seconds
- Files 20MB+: Consider sampling for faster results
### π§ Supported Formats & Limits:
- **CSV/TSV**: Up to 100MB
- **Excel (XLSX/XLS)**: Up to 100MB
- **JSON**: Flat or nested structures
- **Parquet**: High-performance columnar format
### π Support & Contact:
- π± WhatsApp: +8801719296601
- π§ Email: https://tinyurl.com/email-for-contact
- π Response Time: Within 24 hours
""")
if __name__ == "__main__":
    # Queue settings: up to 20 waiting requests, 5 concurrent workers per
    # event by default, and the API endpoints kept closed.
    queue_options = {
        "max_size": 20,
        "default_concurrency_limit": 5,
        "api_open": False,
    }
    app.queue(**queue_options)
    # Start the server and request a public share link.
    app.launch(share=True)