# Spaces:
#
# shukdevdattaEX
# /
#
# AnalytixPro-v2.0
#
# Sleeping
#
# File size: 82,051 Bytes

import gradio as gr ###
import pandas as pd
import aiohttp
import asyncio
import json
import os
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from typing import Optional, Tuple, Dict, Any, List
import logging
from datetime import datetime, timedelta
import re
from jinja2 import Template
import markdown
import zipfile
import io
import base64
from scipy import stats
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Configure logging once at import time: INFO level, with every record
# prefixed by its timestamp, logger name and severity.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger, named after this module, used by the methods below
# to report errors.
logger = logging.getLogger(__name__)

class AdvancedDataAnalyzer:
    def __init__(self):
        self.api_base_url = "https://llm.chutes.ai/v1/chat/completions"
        self.max_file_size = 100 * 1024 * 1024  # Increased to 100MB
        self.conversation_history = []
        self.current_df = None
        self.current_charts = None
        self.analysis_cache = {}
        self.supported_formats = ['.csv', '.xlsx', '.xls', '.json', '.parquet', '.tsv']
        
    def validate_api_key(self, api_key: str) -> Tuple[bool, str]:
        """Enhanced API key validation"""
        if not api_key or len(api_key.strip()) < 10:
            return False, "API key must be at least 10 characters long"
        
        # Check for common API key patterns
        api_key = api_key.strip()
        if not (api_key.startswith(('sk-', 'pk-', 'Bearer ')) or len(api_key) > 20):
            return False, "API key format appears invalid"
        
        return True, "Valid API key format"
    
    def validate_file(self, file) -> Tuple[bool, str]:
        """Enhanced file validation with better error messages"""
        if not file:
            return False, "No file uploaded"
        
        try:
            file_size = os.path.getsize(file.name)
            if file_size > self.max_file_size:
                return False, f"File too large. Maximum size: {self.max_file_size // (1024*1024)}MB"
            
            if file_size == 0:
                return False, "File is empty"
            
            file_extension = os.path.splitext(file.name)[1].lower()
            if file_extension not in self.supported_formats:
                return False, f"Unsupported format. Supported: {', '.join(self.supported_formats)}"
            
            return True, "File validation passed"
            
        except Exception as e:
            return False, f"File validation error: {str(e)}"
    
    async def analyze_with_chutes(self, api_token: str, data_summary: str, user_question: str = None, analysis_type: str = "comprehensive") -> str:
        """Enhanced API call with better prompts and error handling"""
        headers = {
            "Authorization": f"Bearer {api_token.strip()}",
            "Content-Type": "application/json",
            "User-Agent": "SmartDataAnalyzer/2.0"
        }
        
        # Create specialized prompts based on analysis type
        prompts = {
            "comprehensive": f"""You are a senior data scientist with 10+ years of experience. Analyze this dataset comprehensively:

{data_summary}

Provide a thorough analysis with:
1. **Executive Summary**: 3-4 key takeaways for stakeholders
2. **Statistical Insights**: Important numbers, distributions, and what they reveal
3. **Pattern Recognition**: Trends, correlations, seasonality, anomalies
4. **Data Quality Assessment**: Completeness, accuracy, consistency issues
5. **Business Intelligence**: Actionable insights and opportunities
6. **Risk Analysis**: Potential data quality issues or business risks
7. **Recommendations**: Specific, prioritized next steps

Use bullet points, specific numbers, and clear explanations.""",

            "quick": f"""Provide a quick but insightful analysis of this dataset:
{data_summary}

Focus on:
- Top 3 most important findings
- Any obvious patterns or anomalies
- Key business insights
- Quick recommendations

Keep it concise but valuable.""",

            "question": f"""Based on this dataset:
{data_summary}

User's specific question: {user_question}

Provide a detailed, data-driven answer with:
- Direct answer to the question
- Supporting evidence from the data
- Additional related insights
- Specific recommendations
- Follow-up questions to consider"""
        }
        
        prompt = prompts.get(analysis_type, prompts["comprehensive"])
        if user_question and analysis_type != "question":
            prompt += f"\n\nUser's additional question: {user_question}"

        body = {
            "model": "openai/gpt-oss-20b",
            "messages": [
                {
                    "role": "system",
                    "content": """You are an expert data scientist and business analyst. Provide clear, actionable insights with specific data points. Use markdown formatting for better readability. Always include:
- Specific numbers and percentages
- Clear section headers
- Bullet points for key insights
- Bold text for important findings
- Recommendations with priority levels"""
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "stream": True,
            "max_tokens": 4000,
            "temperature": 0.3,
            "top_p": 0.9
        }
        
        try:
            timeout = aiohttp.ClientTimeout(total=45)  # Increased timeout
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.post(self.api_base_url, headers=headers, json=body) as response:
                    if response.status == 401:
                        return "❌ **Authentication Error**: Invalid API key. Please verify your Chutes API token."
                    elif response.status == 429:
                        return "⏳ **Rate Limit Exceeded**: Too many requests. Please wait 30 seconds and try again."
                    elif response.status == 503:
                        return "🔧 **Service Unavailable**: API temporarily unavailable. Please try again later."
                    elif response.status != 200:
                        error_text = await response.text()
                        return f"❌ **API Error {response.status}**: {error_text[:200]}"
                    
                    full_response = ""
                    async for line in response.content:
                        line = line.decode("utf-8").strip()
                        if line.startswith("data: "):
                            data = line[6:]
                            if data == "[DONE]":
                                break
                            try:
                                chunk_data = json.loads(data)
                                if "choices" in chunk_data and len(chunk_data["choices"]) > 0:
                                    delta = chunk_data["choices"][0].get("delta", {})
                                    content = delta.get("content", "")
                                    if content:
                                        full_response += content
                            except json.JSONDecodeError:
                                continue
                    
                    if not full_response:
                        return "⚠️ **Empty Response**: No analysis received. Please try again."
                    
                    # Store in conversation history
                    self.conversation_history.append({
                        "timestamp": datetime.now(),
                        "question": user_question or "General Analysis",
                        "response": full_response[:500] + "..." if len(full_response) > 500 else full_response
                    })
                    
                    return full_response
                    
        except asyncio.TimeoutError:
            return "⏰ **Timeout Error**: Analysis took too long. Try with a smaller file or simpler question."
        except aiohttp.ClientError as e:
            logger.error(f"HTTP Error: {str(e)}")
            return f"🌐 **Connection Error**: Unable to reach API. Check your internet connection."
        except Exception as e:
            logger.error(f"Unexpected API Error: {str(e)}")
            return f"❌ **Unexpected Error**: {str(e)}"

    def process_file(self, file_path: str, sample_size: int = None) -> Tuple[pd.DataFrame, str, str]:
        """Enhanced file processing with support for multiple formats and sampling"""
        try:
            file_extension = os.path.splitext(file_path)[1].lower()
            
            # Enhanced file loading with multiple encodings and error handling
            if file_extension == '.csv':
                for encoding in ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']:
                    for sep in [',', ';', '\t', '|']:
                        try:
                            df = pd.read_csv(file_path, encoding=encoding, sep=sep, low_memory=False)
                            if df.shape[1] > 1:  # Valid separator found
                                break
                        except (UnicodeDecodeError, pd.errors.ParserError):
                            continue
                    else:
                        continue
                    break
                else:
                    raise ValueError("Could not decode CSV file with any supported encoding/separator")
                    
            elif file_extension == '.tsv':
                df = pd.read_csv(file_path, sep='\t', encoding='utf-8')
                
            elif file_extension in ['.xlsx', '.xls']:
                df = pd.read_excel(file_path, engine='openpyxl' if file_extension == '.xlsx' else 'xlrd')
                
            elif file_extension == '.json':
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                df = pd.json_normalize(data) if isinstance(data, list) else pd.DataFrame(data)
                
            elif file_extension == '.parquet':
                df = pd.read_parquet(file_path)
            
            # Data cleaning and preprocessing
            df.columns = df.columns.astype(str).str.strip().str.replace(r'\s+', ' ', regex=True)
            
            # Remove completely empty rows and columns
            df = df.dropna(how='all').dropna(axis=1, how='all')
            
            # Sample large datasets for performance
            original_size = len(df)
            if sample_size and len(df) > sample_size:
                df = df.sample(n=sample_size, random_state=42)
                logger.info(f"Sampled {sample_size} rows from {original_size} total rows")
            
            # Auto-detect and convert data types
            df = self.auto_detect_types(df)
            
            self.current_df = df
            data_summary = self.generate_comprehensive_summary(df, original_size)
            charts_html = self.generate_advanced_visualizations(df)
            
            return df, data_summary, charts_html
            
        except Exception as e:
            logger.error(f"File processing error: {str(e)}")
            raise Exception(f"Error processing file: {str(e)}")
    
    def auto_detect_types(self, df: pd.DataFrame) -> pd.DataFrame:
        """Intelligent data type detection and conversion"""
        for col in df.columns:
            if df[col].dtype == 'object':
                # Try to convert to datetime
                if any(keyword in col.lower() for keyword in ['date', 'time', 'created', 'updated', 'timestamp']):
                    try:
                        df[col] = pd.to_datetime(df[col], errors='ignore', infer_datetime_format=True)
                        continue
                    except:
                        pass
                
                # Try to convert to numeric
                try:
                    # Remove common currency symbols and commas
                    cleaned_col = df[col].astype(str).str.replace(r'[$,€£¥₹]', '', regex=True)
                    cleaned_col = cleaned_col.str.replace(r'[^\d.-]', '', regex=True)
                    numeric_col = pd.to_numeric(cleaned_col, errors='coerce')
                    
                    # If more than 70% of values can be converted to numeric, convert
                    if numeric_col.notna().sum() / len(df) > 0.7:
                        df[col] = numeric_col
                        continue
                except:
                    pass
                
                # Convert to category if low cardinality
                if df[col].nunique() / len(df) < 0.1 and df[col].nunique() < 50:
                    df[col] = df[col].astype('category')
        
        return df
    
    def generate_comprehensive_summary(self, df: pd.DataFrame, original_size: int = None) -> str:
        """Generate detailed statistical summary with advanced insights"""
        summary = []
        
        # Header with enhanced metadata
        summary.append("# 📊 Advanced Dataset Analysis Report")
        summary.append(f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        summary.append(f"**Dataset Size**: {df.shape[0]:,} rows × {df.shape[1]} columns")
        if original_size and original_size != len(df):
            summary.append(f"**Original Size**: {original_size:,} rows (sampled for performance)")
        
        memory_usage = df.memory_usage(deep=True).sum() / 1024**2
        summary.append(f"**Memory Usage**: {memory_usage:.2f} MB")
        summary.append(f"**Data Density**: {(1 - df.isnull().sum().sum() / (df.shape[0] * df.shape[1])):.1%} complete\n")
        
        # Enhanced column type analysis
        type_counts = df.dtypes.value_counts()
        summary.append("## 📋 Column Type Distribution:")
        for dtype, count in type_counts.items():
            percentage = (count / len(df.columns) * 100)
            summary.append(f"- **{dtype}**: {count} columns ({percentage:.1f}%)")
        
        # Advanced missing data analysis
        missing_data = df.isnull().sum()
        missing_pct = (missing_data / len(df) * 100).round(2)
        missing_summary = missing_data[missing_data > 0].sort_values(ascending=False)
        
        if len(missing_summary) > 0:
            summary.append("\n## ⚠️ Data Quality Issues:")
            total_missing = missing_data.sum()
            summary.append(f"**Total Missing Values**: {total_missing:,} ({total_missing/(df.shape[0]*df.shape[1])*100:.2f}% of all data)")
            
            for col, count in missing_summary.head(10).items():
                pct = missing_pct[col]
                severity = "🔴 Critical" if pct > 50 else "🟡 Moderate" if pct > 20 else "🟢 Minor"
                summary.append(f"- **{col}**: {count:,} missing ({pct}%) - {severity}")
        else:
            summary.append("\n## ✅ Data Quality: Perfect! No missing values detected")
        
        # Enhanced numerical analysis with statistical tests
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            summary.append(f"\n## 📈 Numerical Analysis ({len(numeric_cols)} columns):")
            
            for col in numeric_cols[:8]:  # Analyze top 8 numeric columns
                stats_data = df[col].describe()
                
                # Advanced statistical measures
                skewness = stats.skew(df[col].dropna())
                kurtosis = stats.kurtosis(df[col].dropna())
                
                # Outlier detection using IQR method
                Q1 = stats_data['25%']
                Q3 = stats_data['75%']
                IQR = Q3 - Q1
                outliers = len(df[(df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))])
                
                # Distribution shape analysis
                if abs(skewness) < 0.5:
                    distribution = "Normal"
                elif skewness > 0.5:
                    distribution = "Right-skewed"
                else:
                    distribution = "Left-skewed"
                
                summary.append(f"- **{col}**:")
                summary.append(f"  - Range: {stats_data['min']:.2f} to {stats_data['max']:.2f}")
                summary.append(f"  - Central: μ={stats_data['mean']:.2f}, median={stats_data['50%']:.2f}")
                summary.append(f"  - Spread: σ={stats_data['std']:.2f}, IQR={IQR:.2f}")
                summary.append(f"  - Shape: {distribution} (skew={skewness:.2f})")
                summary.append(f"  - Outliers: {outliers} ({outliers/len(df)*100:.1f}%)")
        
        # Enhanced categorical analysis
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns
        if len(categorical_cols) > 0:
            summary.append(f"\n## 📝 Categorical Analysis ({len(categorical_cols)} columns):")
            
            for col in categorical_cols[:8]:
                unique_count = df[col].nunique()
                total_count = len(df[col].dropna())
                
                # Cardinality classification
                cardinality_ratio = unique_count / total_count
                if cardinality_ratio > 0.9:
                    cardinality = "🔴 Very High (likely ID field)"
                elif cardinality_ratio > 0.5:
                    cardinality = "🟡 High"
                elif cardinality_ratio > 0.1:
                    cardinality = "🟢 Medium"
                else:
                    cardinality = "🔵 Low"
                
                # Top values analysis
                value_counts = df[col].value_counts()
                most_common = value_counts.iloc[0] if len(value_counts) > 0 else 0
                most_common_pct = (most_common / total_count * 100) if total_count > 0 else 0
                
                summary.append(f"- **{col}**:")
                summary.append(f"  - Unique values: {unique_count:,} ({cardinality})")
                summary.append(f"  - Most frequent: '{value_counts.index[0]}' ({most_common:,} times, {most_common_pct:.1f}%)")
                
                if len(value_counts) > 1:
                    entropy = stats.entropy(value_counts.values)
                    summary.append(f"  - Diversity index: {entropy:.2f}")
        
        # Date/Time analysis
        datetime_cols = df.select_dtypes(include=['datetime64']).columns
        if len(datetime_cols) > 0:
            summary.append(f"\n## 📅 Temporal Analysis ({len(datetime_cols)} columns):")
            for col in datetime_cols[:3]:
                date_range = df[col].max() - df[col].min()
                summary.append(f"- **{col}**: {df[col].min()} to {df[col].max()} (span: {date_range.days} days)")
        
        # Advanced data profiling
        summary.append("\n## 🔍 Advanced Data Profiling:")
        
        # Duplicate analysis
        duplicate_rows = df.duplicated().sum()
        summary.append(f"- **Duplicate rows**: {duplicate_rows:,} ({duplicate_rows/len(df)*100:.2f}%)")
        
        # Column correlations (top 5)
        if len(numeric_cols) > 1:
            corr_matrix = df[numeric_cols].corr()
            high_corr_pairs = []
            for i in range(len(corr_matrix.columns)):
                for j in range(i+1, len(corr_matrix.columns)):
                    corr_val = corr_matrix.iloc[i, j]
                    if abs(corr_val) > 0.7:  # Strong correlation threshold
                        high_corr_pairs.append((corr_matrix.columns[i], corr_matrix.columns[j], corr_val))
            
            if high_corr_pairs:
                summary.append("- **Strong correlations detected**:")
                for col1, col2, corr_val in sorted(high_corr_pairs, key=lambda x: abs(x[2]), reverse=True)[:5]:
                    summary.append(f"  - {col1} ↔ {col2}: {corr_val:.3f}")
        
        # Data sample with enhanced formatting
        summary.append("\n## 🔍 Enhanced Data Sample (First 3 Rows):")
        sample_df = df.head(3)
        for idx, row in sample_df.iterrows():
            summary.append(f"\n**Row {idx + 1}:**")
            for col, val in row.items():
                # Format values based on type
                if pd.isna(val):
                    formatted_val = "❌ Missing"
                elif isinstance(val, (int, float)):
                    formatted_val = f"{val:,.2f}" if isinstance(val, float) else f"{val:,}"
                else:
                    formatted_val = str(val)[:50] + ("..." if len(str(val)) > 50 else "")
                summary.append(f"  - **{col}**: {formatted_val}")
        
        return "\n".join(summary)
    
    def generate_advanced_visualizations(self, df: pd.DataFrame) -> str:
        """Render a set of Plotly charts for ``df`` as HTML fragments.

        Sections are added in this order, each only when its data exists:
        missing-value bars, correlation heatmap, per-column histogram/box
        plots (first 4 numeric columns), bar/pie charts (first 3 categorical
        columns with at most 25 distinct values), a monthly time series
        (first datetime column against first numeric column), a dataset
        overview bar chart, and a data-quality gauge. The last two are
        always produced.

        Side effect: the list of HTML fragments is stored on
        ``self.current_charts``.

        Returns:
            All fragments joined with newlines. If anything raises while
            building the charts, the error is logged and a single ``<p>``
            element describing the failure is returned instead; fragments
            built before the failure are discarded.
        """
        charts_html = []

        try:
            # 1. Missing data: count and percentage per column, side by side.
            #    Skipped entirely when the frame has no missing values.
            missing_data = df.isnull().sum()
            if missing_data.sum() > 0:
                missing_pct = (missing_data / len(df) * 100).round(2)

                fig = make_subplots(
                    rows=1, cols=2,
                    subplot_titles=("Missing Values Count", "Missing Values Percentage"),
                    specs=[[{"secondary_y": False}, {"secondary_y": False}]]
                )

                fig.add_trace(
                    go.Bar(x=missing_data.index, y=missing_data.values, name="Count", 
                          marker_color='rgb(255, 99, 132)'),
                    row=1, col=1
                )

                fig.add_trace(
                    go.Bar(x=missing_pct.index, y=missing_pct.values, name="Percentage",
                          marker_color='rgb(255, 159, 64)'),
                    row=1, col=2
                )

                fig.update_layout(
                    title_text="🔍 Comprehensive Missing Data Analysis",
                    title_x=0.5,
                    height=500,
                    showlegend=False
                )
                fig.update_xaxes(tickangle=-45)

                charts_html.append("<h3>📊 Data Quality Analysis</h3>")
                # include_plotlyjs='cdn' makes the fragment load plotly.js
                # from a CDN instead of embedding the library in the HTML.
                charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="missing_data_analysis"))

            # 2. Correlation heatmap over all numeric columns (needs >= 2).
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            if len(numeric_cols) > 1:
                corr_matrix = df[numeric_cols].corr()

                # Hide the upper triangle and the diagonal (np.triu keeps the
                # diagonal in the mask), leaving only the lower triangle.
                mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
                corr_matrix_masked = corr_matrix.mask(mask)

                fig = px.imshow(
                    corr_matrix_masked,
                    title="🔗 Advanced Correlation Matrix (Lower Triangle)",
                    color_continuous_scale='RdBu_r',
                    aspect="auto",
                    text_auto=True,
                    labels=dict(color="Correlation")
                )

                fig.update_layout(
                    height=600,
                    title_x=0.5,
                    font=dict(size=10)
                )

                charts_html.append("<h3>📈 Statistical Relationships</h3>")
                charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="correlation_matrix"))

            # 3. Distribution of each of the first 4 numeric columns.
            if len(numeric_cols) > 0:
                charts_html.append("<h3>📊 Statistical Distributions</h3>")

                for i, col in enumerate(numeric_cols[:4]):  # First 4 numeric columns
                    # Histogram on top, box plot underneath.
                    fig = make_subplots(
                        rows=2, cols=1,
                        subplot_titles=(f"Distribution of {col}", f"Box Plot - {col}"),
                        vertical_spacing=0.12
                    )

                    # Histogram of the non-missing values (30 bins). No
                    # density curve is drawn.
                    fig.add_trace(
                        go.Histogram(x=df[col].dropna(), name="Frequency", 
                                   marker_color='rgb(75, 192, 192)', opacity=0.7,
                                   nbinsx=30),
                        row=1, col=1
                    )

                    # Box plot of the same values.
                    fig.add_trace(
                        go.Box(y=df[col].dropna(), name="Distribution",
                              marker_color='rgb(153, 102, 255)'),
                        row=2, col=1
                    )

                    # Mark mean (dashed red) and median (dotted blue) on the
                    # histogram.
                    mean_val = df[col].mean()
                    median_val = df[col].median()

                    fig.add_vline(x=mean_val, line_dash="dash", line_color="red", 
                                annotation_text=f"Mean: {mean_val:.2f}", row=1, col=1)
                    fig.add_vline(x=median_val, line_dash="dot", line_color="blue",
                                annotation_text=f"Median: {median_val:.2f}", row=1, col=1)

                    fig.update_layout(
                        height=600,
                        title_text=f"📊 Statistical Analysis: {col}",
                        title_x=0.5,
                        showlegend=False
                    )

                    charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id=f"distribution_{i}"))

            # 4. Categorical columns: first 3 object/category columns.
            categorical_cols = df.select_dtypes(include=['object', 'category']).columns
            if len(categorical_cols) > 0:
                charts_html.append("<h3>📝 Categorical Data Insights</h3>")

                for i, col in enumerate(categorical_cols[:3]):
                    if df[col].nunique() <= 25:  # Only for a manageable number of categories
                        value_counts = df[col].value_counts().head(15)

                        # Horizontal bar chart next to a pie chart.
                        fig = make_subplots(
                            rows=1, cols=2,
                            subplot_titles=(f"Top Values - {col}", f"Distribution - {col}"),
                            specs=[[{"type": "bar"}, {"type": "pie"}]]
                        )

                        # Bar chart of the (up to 15) most frequent values.
                        fig.add_trace(
                            go.Bar(x=value_counts.values, y=value_counts.index,
                                  orientation='h', name="Count",
                                  marker_color='rgb(54, 162, 235)'),
                            row=1, col=1
                        )

                        # Pie chart limited to the top 10 for readability.
                        top_10 = value_counts.head(10)
                        fig.add_trace(
                            go.Pie(labels=top_10.index, values=top_10.values,
                                  name="Distribution"),
                            row=1, col=2
                        )

                        fig.update_layout(
                            height=500,
                            title_text=f"📊 Category Analysis: {col}",
                            title_x=0.5,
                            showlegend=False
                        )

                        charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id=f"categorical_{i}"))

            # 5. Time series: needs at least one datetime and one numeric
            #    column; only the first of each is plotted.
            datetime_cols = df.select_dtypes(include=['datetime64']).columns
            if len(datetime_cols) > 0 and len(numeric_cols) > 0:
                charts_html.append("<h3>⏰ Temporal Analysis</h3>")

                date_col = datetime_cols[0]
                value_col = numeric_cols[0]

                # Aggregate per calendar month on a copy so the caller's
                # frame does not gain the helper column.
                df_temp = df.copy()
                df_temp['month_year'] = df_temp[date_col].dt.to_period('M')
                monthly_data = df_temp.groupby('month_year')[value_col].agg(['mean', 'sum', 'count']).reset_index()
                monthly_data['month_year_str'] = monthly_data['month_year'].astype(str)

                fig = make_subplots(
                    rows=2, cols=1,
                    subplot_titles=(f"Monthly Trend - {value_col}", f"Monthly Volume - {value_col}"),
                    vertical_spacing=0.1
                )

                # Monthly mean as a line.
                fig.add_trace(
                    go.Scatter(x=monthly_data['month_year_str'], y=monthly_data['mean'],
                             mode='lines+markers', name="Average",
                             line=dict(color='rgb(75, 192, 192)', width=3)),
                    row=1, col=1
                )

                # Monthly sum as bars.
                fig.add_trace(
                    go.Bar(x=monthly_data['month_year_str'], y=monthly_data['sum'],
                          name="Total", marker_color='rgb(153, 102, 255)'),
                    row=2, col=1
                )

                fig.update_layout(
                    height=600,
                    title_text="📈 Time Series Analysis",
                    title_x=0.5,
                    showlegend=False
                )
                fig.update_xaxes(tickangle=-45)

                charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="timeseries_analysis"))

            # 6. Dataset overview: one bar per headline metric. All metrics
            #    share a single axis even though their units differ.
            summary_data = {
                'Metric': ['Total Rows', 'Total Columns', 'Numeric Columns', 'Categorical Columns', 
                          'DateTime Columns', 'Missing Values', 'Duplicate Rows', 'Memory (MB)'],
                'Count': [
                    len(df),
                    len(df.columns),
                    len(numeric_cols),
                    len(categorical_cols),
                    len(datetime_cols),
                    df.isnull().sum().sum(),
                    df.duplicated().sum(),
                    round(df.memory_usage(deep=True).sum() / 1024**2, 2)
                ]
            }

            fig = px.bar(
                summary_data,
                x='Metric',
                y='Count',
                title="📋 Comprehensive Dataset Overview",
                color='Count',
                color_continuous_scale='Viridis',
                text='Count'
            )
            fig.update_traces(texttemplate='%{text}', textposition='outside')
            fig.update_layout(
                height=500,
                title_x=0.5,
                showlegend=False,
                xaxis_tickangle=-45
            )

            charts_html.append("<h3>📊 Dataset Dashboard</h3>")
            charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="overview_dashboard"))

            # 7. Quality score gauge: 100 minus the percentage of missing
            #    cells minus 10 points per unit fraction of duplicated rows,
            #    floored at 0.
            total_cells = df.shape[0] * df.shape[1]
            missing_cells = df.isnull().sum().sum()
            duplicate_penalty = df.duplicated().sum() / len(df) * 10

            quality_score = max(0, 100 - (missing_cells/total_cells*100) - duplicate_penalty)

            fig = go.Figure(go.Indicator(
                mode = "gauge+number+delta",
                value = quality_score,
                domain = {'x': [0, 1], 'y': [0, 1]},
                title = {'text': "📊 Data Quality Score"},
                delta = {'reference': 95},
                gauge = {
                    'axis': {'range': [None, 100]},
                    'bar': {'color': "darkblue"},
                    'steps': [
                        {'range': [0, 50], 'color': "lightgray"},
                        {'range': [50, 80], 'color': "yellow"},
                        {'range': [80, 100], 'color': "lightgreen"}
                    ],
                    'threshold': {
                        'line': {'color': "red", 'width': 4},
                        'thickness': 0.75,
                        'value': 90
                    }
                }
            ))

            fig.update_layout(height=400, title_x=0.5)
            charts_html.append("<h3>🎯 Quality Assessment</h3>")
            charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="quality_score"))

            # Keep the individual fragments on the instance.
            self.current_charts = charts_html
            return "\n".join(charts_html) if charts_html else "<p>No charts could be generated for this dataset.</p>"

        except Exception as e:
            # Any failure discards the fragments built so far and is shown
            # to the user as a single error paragraph.
            logger.error(f"Chart generation error: {str(e)}")
            return f"<p>❌ Advanced chart generation failed: {str(e)}</p>"

    def generate_insights_summary(self, df: pd.DataFrame) -> str:
        """Build a Markdown bullet list of rule-based insights (no AI call).

        The checks are simple heuristics over ``df``:

        * dataset size (more than 100,000 rows, or fewer than 100 rows),
        * overall share of missing cells (above 20% or below 5%),
        * numeric columns where more than 10% of the rows fall outside the
          ``1.5 * IQR`` fences around the quartiles,
        * object/category columns whose distinct-value ratio exceeds 0.8,
          which are reported as likely identifier fields.

        Args:
            df: The dataset to inspect.

        Returns:
            A newline-joined Markdown string that always starts with the
            "Quick Automated Insights" heading. When ``df`` has no rows or no
            columns, only the size insight is reported because every other
            check is a ratio over the number of rows/cells.
        """
        insights = ["## 🚀 Quick Automated Insights:"]
        row_count = len(df)

        # Data size insights
        if row_count > 100000:
            insights.append("- 📈 **Large Dataset**: This is a substantial dataset that may reveal enterprise-level patterns")
        elif row_count < 100:
            insights.append("- 📉 **Small Dataset**: Consider collecting more data for robust statistical analysis")

        # Every remaining check divides by the row or cell count; with an
        # empty frame those ratios are undefined, so stop here instead of
        # raising ZeroDivisionError.
        total_cells = df.shape[0] * df.shape[1]
        if total_cells == 0:
            return "\n".join(insights)

        # Missing data insights
        missing_pct = (df.isnull().sum().sum() / total_cells) * 100
        if missing_pct > 20:
            insights.append("- ⚠️ **Data Quality Concern**: High percentage of missing data may impact analysis reliability")
        elif missing_pct < 5:
            insights.append("- ✅ **Excellent Data Quality**: Very low missing data percentage")

        # Numerical insights: flag columns where more than 10% of the rows
        # lie outside the 1.5 * IQR fences.
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        outlier_cols = []
        for col in numeric_cols:
            q1 = df[col].quantile(0.25)
            q3 = df[col].quantile(0.75)
            iqr = q3 - q1
            outside_fences = (df[col] < (q1 - 1.5 * iqr)) | (df[col] > (q3 + 1.5 * iqr))
            if outside_fences.sum() / row_count > 0.1:
                outlier_cols.append(col)

        if outlier_cols:
            insights.append(f"- 🎯 **Outlier Detection**: {len(outlier_cols)} columns have significant outliers")

        # Categorical insights: almost-all-unique text columns look like IDs.
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns
        high_cardinality_cols = [col for col in categorical_cols if df[col].nunique() / row_count > 0.8]
        if high_cardinality_cols:
            insights.append(f"- 🔍 **ID Fields Detected**: {len(high_cardinality_cols)} columns appear to be identifier fields")

        return "\n".join(insights)

    def export_comprehensive_report(self, analysis_text: str, data_summary: str, file_name: str, format_type: str) -> Tuple[str, str]:
        """Enhanced report generation with multiple formats"""
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        file_base_name = os.path.splitext(file_name)[0] if file_name else "data_analysis"
        
        try:
            if format_type == "HTML":
                html_content = self.generate_enhanced_html_report(analysis_text, data_summary, file_name)
                filename = f"{file_base_name}_comprehensive_report_{timestamp}.html"
                
                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(html_content)
                return filename, f"✅ Comprehensive HTML report generated! File: {filename}"
                
            else:  # Markdown
                report_content = self.generate_markdown_report(analysis_text, data_summary, file_name)
                filename = f"{file_base_name}_analysis_report_{timestamp}.md"
                
                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(report_content)
                return filename, f"✅ Markdown report generated! File: {filename}"
                
        except Exception as e:
            logger.error(f"Report export error: {str(e)}")
            return None, f"❌ Error generating {format_type} report: {str(e)}"

    def generate_enhanced_html_report(self, analysis_text: str, data_summary: str, file_name: str = "Unknown") -> str:
        """Render the styled, self-contained HTML analysis report.

        The report is a Jinja2 template with inline CSS/JS. It shows the file
        metadata, the AI analysis (converted from Markdown to HTML), the
        chart fragments currently stored in ``self.current_charts`` and the
        technical data summary inside a ``<pre>`` block.

        Args:
            analysis_text: Analysis text in Markdown; converted with the
                ``extra``, ``tables`` and ``toc`` Markdown extensions.
            data_summary: Plain-text data profile shown verbatim.
            file_name: Name of the analysed file shown in the metadata bar.

        Returns:
            The rendered HTML document as a string.
        """
        # NOTE: `file_name` and `data_summary` are plain text and are passed
        # through the `e` (escape) filter, because this Template does not
        # autoescape: otherwise text such as "<NA>" or "a < b" would be parsed
        # as markup. `ai_analysis` and `charts_html` are already HTML and are
        # intentionally inserted as-is.
        html_template = """
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <meta name="viewport" content="width=device-width, initial-scale=1.0">
            <title>Advanced Data Analysis Report</title>
            <link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css" rel="stylesheet">
            <style>
                * {
                    box-sizing: border-box;
                    margin: 0;
                    padding: 0;
                }
                
                body {
                    font-family: 'Segoe UI', system-ui, -apple-system, sans-serif;
                    line-height: 1.7;
                    color: #2c3e50;
                    background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
                    min-height: 100vh;
                }
                
                .container {
                    max-width: 1400px;
                    margin: 0 auto;
                    padding: 20px;
                }
                
                .header {
                    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                    color: white;
                    padding: 40px;
                    border-radius: 15px;
                    margin-bottom: 30px;
                    text-align: center;
                    box-shadow: 0 10px 30px rgba(0,0,0,0.2);
                }
                
                .header h1 {
                    font-size: 2.5em;
                    margin-bottom: 10px;
                    text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
                }
                
                .header p {
                    font-size: 1.2em;
                    opacity: 0.9;
                }
                
                .section {
                    background: white;
                    padding: 30px;
                    margin-bottom: 25px;
                    border-radius: 12px;
                    box-shadow: 0 5px 20px rgba(0,0,0,0.1);
                    border-left: 4px solid #667eea;
                    transition: transform 0.2s ease;
                }
                
                .section:hover {
                    transform: translateY(-2px);
                    box-shadow: 0 8px 25px rgba(0,0,0,0.15);
                }
                
                .metadata {
                    background: linear-gradient(135deg, #e8f4f8 0%, #f0f8ff 100%);
                    padding: 20px;
                    border-radius: 10px;
                    margin-bottom: 25px;
                    border: 1px solid #b3d9f2;
                    display: grid;
                    grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
                    gap: 15px;
                }
                
                .metadata-item {
                    display: flex;
                    align-items: center;
                    gap: 8px;
                }
                
                .metadata-item i {
                    color: #667eea;
                    font-size: 1.1em;
                }
                
                h1, h2, h3 {
                    color: #2c3e50;
                    margin-bottom: 15px;
                }
                
                h2 {
                    border-bottom: 2px solid #667eea;
                    padding-bottom: 10px;
                    display: flex;
                    align-items: center;
                    gap: 10px;
                }
                
                h2:before {
                    content: "📊";
                    font-size: 1.2em;
                }
                
                .chart-container {
                    margin: 25px 0;
                    padding: 20px;
                    background: linear-gradient(135deg, #f8f9ff 0%, #fff 100%);
                    border-radius: 10px;
                    border: 1px solid #e0e6ff;
                }
                
                .action-buttons {
                    display: flex;
                    gap: 15px;
                    margin: 20px 0;
                    flex-wrap: wrap;
                }
                
                .btn {
                    padding: 12px 24px;
                    border: none;
                    border-radius: 8px;
                    cursor: pointer;
                    font-size: 16px;
                    font-weight: 600;
                    transition: all 0.3s ease;
                    display: flex;
                    align-items: center;
                    gap: 8px;
                    text-decoration: none;
                }
                
                .btn-primary {
                    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                    color: white;
                }
                
                .btn-primary:hover {
                    transform: translateY(-2px);
                    box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4);
                }
                
                .btn-secondary {
                    background: #f8f9fa;
                    color: #495057;
                    border: 2px solid #dee2e6;
                }
                
                .btn-secondary:hover {
                    background: #e9ecef;
                    border-color: #adb5bd;
                }
                
                .footer {
                    text-align: center;
                    color: #6c757d;
                    margin-top: 40px;
                    padding: 30px;
                    background: white;
                    border-radius: 10px;
                    box-shadow: 0 5px 15px rgba(0,0,0,0.1);
                }
                
                .footer-links {
                    margin-top: 15px;
                    display: flex;
                    justify-content: center;
                    gap: 20px;
                    flex-wrap: wrap;
                }
                
                .footer-links a {
                    color: #667eea;
                    text-decoration: none;
                    font-weight: 500;
                }
                
                .footer-links a:hover {
                    text-decoration: underline;
                }
                
                pre {
                    background: #f8f9fa;
                    padding: 20px;
                    border-radius: 8px;
                    overflow-x: auto;
                    white-space: pre-wrap;
                    font-size: 14px;
                    border-left: 4px solid #28a745;
                    font-family: 'Consolas', 'Monaco', monospace;
                }
                
                .analysis-content {
                    font-size: 16px;
                    line-height: 1.8;
                }
                
                .analysis-content h1,
                .analysis-content h2,
                .analysis-content h3 {
                    margin-top: 25px;
                    margin-bottom: 15px;
                }
                
                .analysis-content ul,
                .analysis-content ol {
                    margin-left: 20px;
                    margin-bottom: 15px;
                }
                
                .analysis-content li {
                    margin-bottom: 5px;
                }
                
                .analysis-content strong {
                    color: #2c3e50;
                    font-weight: 700;
                }
                
                .analysis-content code {
                    background: #f1f3f4;
                    padding: 2px 6px;
                    border-radius: 4px;
                    font-family: 'Consolas', monospace;
                }
                
                .analysis-content blockquote {
                    border-left: 4px solid #667eea;
                    padding-left: 20px;
                    margin: 20px 0;
                    font-style: italic;
                    color: #555;
                }
                
                table {
                    width: 100%;
                    border-collapse: collapse;
                    margin: 20px 0;
                    background: white;
                    border-radius: 8px;
                    overflow: hidden;
                    box-shadow: 0 2px 10px rgba(0,0,0,0.1);
                }
                
                th, td {
                    padding: 12px 15px;
                    text-align: left;
                    border-bottom: 1px solid #e9ecef;
                }
                
                th {
                    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                    color: white;
                    font-weight: 600;
                    text-transform: uppercase;
                    letter-spacing: 0.5px;
                }
                
                tr:hover {
                    background-color: #f8f9ff;
                }
                
                .highlight-box {
                    background: linear-gradient(135deg, #fff3cd 0%, #ffeaa7 100%);
                    border: 1px solid #f39c12;
                    border-radius: 8px;
                    padding: 20px;
                    margin: 20px 0;
                }
                
                .success-box {
                    background: linear-gradient(135deg, #d4edda 0%, #a8e6cf 100%);
                    border: 1px solid #28a745;
                    border-radius: 8px;
                    padding: 20px;
                    margin: 20px 0;
                }
                
                .warning-box {
                    background: linear-gradient(135deg, #f8d7da 0%, #ff7675 100%);
                    border: 1px solid #dc3545;
                    border-radius: 8px;
                    padding: 20px;
                    margin: 20px 0;
                }
                
                @media print {
                    .action-buttons, .btn {
                        display: none !important;
                    }
                    body {
                        background: white;
                    }
                    .section, .metadata, .footer {
                        box-shadow: none;
                        page-break-inside: avoid;
                    }
                    .header {
                        page-break-after: avoid;
                    }
                }
                
                @media (max-width: 768px) {
                    .container {
                        padding: 10px;
                    }
                    .header {
                        padding: 20px;
                    }
                    .header h1 {
                        font-size: 1.8em;
                    }
                    .section {
                        padding: 20px;
                    }
                    .metadata {
                        grid-template-columns: 1fr;
                    }
                    .action-buttons {
                        flex-direction: column;
                    }
                }
            </style>
            <script>
                function printReport() {
                    window.print();
                }
                
                function exportPDF() {
                    window.print();
                }
                
                function copyToClipboard(elementId) {
                    const element = document.getElementById(elementId);
                    const text = element.textContent;
                    navigator.clipboard.writeText(text).then(() => {
                        alert('Content copied to clipboard!');
                    });
                }
                
                // Add smooth scrolling
                document.addEventListener('DOMContentLoaded', function() {
                    const links = document.querySelectorAll('a[href^="#"]');
                    links.forEach(link => {
                        link.addEventListener('click', function(e) {
                            e.preventDefault();
                            const target = document.querySelector(this.getAttribute('href'));
                            if (target) {
                                target.scrollIntoView({ behavior: 'smooth' });
                            }
                        });
                    });
                });
            </script>
        </head>
        <body>
            <div class="container">
                <div class="header">
                    <h1><i class="fas fa-chart-line"></i> Advanced Data Analysis Report</h1>
                    <p>Comprehensive AI-Powered Business Intelligence Dashboard</p>
                </div>
                
                <div class="metadata">
                    <div class="metadata-item">
                        <i class="fas fa-file-alt"></i>
                        <span><strong>File:</strong> {{ file_name | e }}</span>
                    </div>
                    <div class="metadata-item">
                        <i class="fas fa-calendar-alt"></i>
                        <span><strong>Generated:</strong> {{ timestamp }}</span>
                    </div>
                    <div class="metadata-item">
                        <i class="fas fa-robot"></i>
                        <span><strong>AI Model:</strong> OpenAI gpt-oss-20b</span>
                    </div>
                    <div class="metadata-item">
                        <i class="fas fa-shield-alt"></i>
                        <span><strong>Version:</strong> Smart Analyzer Pro v2.0</span>
                    </div>
                </div>
                
                <div class="action-buttons">
                    <button class="btn btn-primary" onclick="printReport()">
                        <i class="fas fa-print"></i> Print as PDF
                    </button>
                    <button class="btn btn-secondary" onclick="copyToClipboard('ai-analysis')">
                        <i class="fas fa-copy"></i> Copy Analysis
                    </button>
                    <button class="btn btn-secondary" onclick="copyToClipboard('technical-summary')">
                        <i class="fas fa-code"></i> Copy Technical Data
                    </button>
                </div>
                
                <div class="section">
                    <h2><i class="fas fa-brain"></i> AI-Powered Analysis & Strategic Insights</h2>
                    <div id="ai-analysis" class="analysis-content">{{ ai_analysis }}</div>
                </div>
                
                <div class="section">
                    <h2><i class="fas fa-chart-bar"></i> Interactive Data Visualizations</h2>
                    <div class="chart-container">
                        {{ charts_html }}
                    </div>
                </div>
                
                <div class="section">
                    <h2><i class="fas fa-database"></i> Technical Data Profile</h2>
                    <pre id="technical-summary">{{ data_summary | e }}</pre>
                </div>
                
                <div class="footer">
                    <div>
                        <h3><i class="fas fa-star"></i> Report Generated by AnalytixPro v2.0</h3>
                        <p>Powered by Advanced AI • Professional Business Intelligence</p>
                    </div>
                    <div class="footer-links">
                        <a href="https://wa.me/8801719296601"><i class="fab fa-whatsapp"></i> WhatsApp Support</a>
                        <a href="https://mail.google.com/mail/?view=cm&fs=1&to=shukdevdatta@gmail.com" target="_blank"><i class="fas fa-envelope"></i> Email Support</a>
                        <a href="https://huggingface.co/shukdevdattaEX"><i class="fas fa-globe"></i> Visit Website</a>
                    </div>
                    <p style="margin-top: 15px; font-size: 0.9em; color: #6c757d;">
                        © 2025 AnalytixPro. Professional data analysis made simple.
                    </p>
                </div>
            </div>
        </body>
        </html>
        """
        
        template = Template(html_template)
        # Pass an empty string when no analysis text is available so the
        # Markdown converter always receives a str.
        ai_analysis_html = markdown.markdown(analysis_text or "", extensions=['extra', 'tables', 'toc'])
        # Chart fragments are stored on the instance by the chart generator.
        charts_content = "\n".join(self.current_charts) if self.current_charts else "<p>No visualizations available</p>"
        
        return template.render(
            file_name=file_name,
            timestamp=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            ai_analysis=ai_analysis_html,
            charts_html=charts_content,
            data_summary=data_summary
        )

    def generate_pdf_ready_report(self, analysis_text: str, data_summary: str, file_name: str) -> str:
        """Return the HTML report intended for printing to PDF.

        This is a thin delegate: it renders the same document as
        ``generate_enhanced_html_report`` and does no PDF-specific work.

        Args:
            analysis_text: Analysis text to embed.
            data_summary: Technical data summary to embed.
            file_name: Name of the analysed file.

        Returns:
            The rendered HTML document as a string.
        """
        printable_html = self.generate_enhanced_html_report(
            analysis_text=analysis_text,
            data_summary=data_summary,
            file_name=file_name,
        )
        return printable_html

    def generate_excel_report(self, analysis_text: str, data_summary: str, filename: str):
        """Write a multi-sheet Excel workbook for the current analysis.

        Sheets, in order:

        1. ``Original_Data`` - ``self.current_df`` (only when a frame is loaded)
        2. ``Data_Summary`` - one row per line of ``data_summary``
        3. ``AI_Analysis`` - one row per line of ``analysis_text``
        4. ``Statistical_Summary`` - ``describe()`` of the numeric columns
           (only when a frame is loaded and it has numeric columns)

        Args:
            analysis_text: Analysis text, split on newlines.
            data_summary: Data summary text, split on newlines.
            filename: Path of the workbook to create (written via openpyxl).
        """
        frame = self.current_df
        # (sheet name, column header, text) for the two line-per-row sheets.
        text_sheets = (
            ('Data_Summary', 'Analysis_Summary', data_summary),
            ('AI_Analysis', 'AI_Analysis', analysis_text),
        )

        with pd.ExcelWriter(filename, engine='openpyxl') as writer:
            if frame is not None:
                frame.to_excel(writer, sheet_name='Original_Data', index=False)

            for sheet_name, column_header, text in text_sheets:
                lines_df = pd.DataFrame({column_header: text.split('\n')})
                lines_df.to_excel(writer, sheet_name=sheet_name, index=False)

            if frame is not None:
                numeric_cols = frame.select_dtypes(include=[np.number]).columns
                if len(numeric_cols) > 0:
                    # Index is kept here: it holds the statistic names.
                    frame[numeric_cols].describe().to_excel(writer, sheet_name='Statistical_Summary')

    def generate_markdown_report(self, analysis_text: str, data_summary: str, file_name: str) -> str:
        """Assemble the Markdown version of the analysis report.

        The document contains a metadata header, the analysis text, the
        technical data summary and a support/contact footer, separated by
        horizontal rules.

        Args:
            analysis_text: Analysis text inserted under the executive summary.
            data_summary: Data profile inserted under the technical section.
            file_name: Name of the analysed file shown in the header.

        Returns:
            The complete Markdown document, ending with a newline.
        """
        generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # Two trailing spaces are a Markdown hard line break; they keep each
        # metadata entry on its own line. Spelled out explicitly so editors
        # that strip trailing whitespace cannot remove them.
        hard_break = "  "

        lines = [
            "# 📊 Advanced Data Analysis Report",
            "",
            f"**File:** {file_name}{hard_break}",
            f"**Generated:** {generated_at}{hard_break}",
            f"**Analyzer:** AnalytixPro v2.0{hard_break}",
            f"**AI Model:** OpenAI gpt-oss-20b via Chutes API{hard_break}",
            "",
            "---",
            "",
            "## 🚀 Executive Summary & AI Insights",
            "",
            f"{analysis_text}",
            "",
            "---",
            "",
            "## 📋 Technical Data Profile",
            "",
            f"{data_summary}",
            "",
            # A stray literal "text---" used to be emitted here instead of a
            # proper horizontal rule.
            "---",
            "",
            "## 📞 Support & Contact",
            "",
            "- **WhatsApp Support:** +8801719296601",
            "- **Email:** https://tinyurl.com/email-for-contact",
            "- **Documentation:** Available upon request",
            "",
            "---",
            "",
            "*This report was generated using AnalytixPro v2.0 - Professional data analysis powered by advanced AI technology.*",
            "",
        ]
        return "\n".join(lines)

# Module-level analyzer instance shared by the handler functions below.
# State stored on it (current_df, current_charts, conversation_history,
# analysis_cache) stays in place until clear_all_data() resets it.
analyzer = AdvancedDataAnalyzer()

def _coerce_sample_size(sample_size) -> Optional[int]:
    """Convert a UI-supplied sample size into a positive int, or None to disable sampling."""
    if sample_size is None or isinstance(sample_size, bool):
        return None
    try:
        # Go through float so numeric inputs such as 1000.0 or "1000.0" are accepted.
        parsed = int(float(sample_size))
    except (TypeError, ValueError, OverflowError):
        return None
    return parsed if parsed > 0 else None


async def comprehensive_analysis(file, api_key, user_question="", analysis_type="comprehensive", sample_size=None, progress=gr.Progress()):
    """Run the full pipeline: validate inputs, load the file, get the AI analysis.

    Args:
        file: Uploaded file object; its ``.name`` attribute is used as the path.
        api_key: API key forwarded to the analysis backend after validation.
        user_question: Optional question forwarded to the AI analysis.
        analysis_type: Analysis mode forwarded to the AI analysis; also shown
            (title-cased) in the response.
        sample_size: Optional row-sample size. Accepts ints, floats and
            numeric strings; anything non-numeric or non-positive disables
            sampling.
        progress: Gradio progress callback.

    Returns:
        A 6-tuple ``(response_markdown, data_summary, data_preview_html,
        charts_html, file_path, ai_analysis)``. On a validation failure or
        any exception the tuple is ``(error_message, "", "", "", None, "")``.
    """
    # Used to report the real elapsed time in the response.
    started_at = datetime.now()

    # Validation phase
    progress(0.05, desc="🔍 Validating inputs...")
    
    if not file:
        return "❌ Please upload a data file.", "", "", "", None, ""

    is_valid_key, key_msg = analyzer.validate_api_key(api_key)
    if not is_valid_key:
        return f"❌ API Key Issue: {key_msg}", "", "", "", None, ""

    is_valid_file, file_msg = analyzer.validate_file(file)
    if not is_valid_file:
        return f"❌ File Issue: {file_msg}", "", "", "", None, ""

    progress(0.15, desc="📁 Loading and processing file...")
    
    try:
        # Process file with optional sampling
        sample_size_int = _coerce_sample_size(sample_size)
        df, data_summary, charts_html = analyzer.process_file(file.name, sample_size_int)
        
        progress(0.40, desc="📊 Generating visualizations...")
        
        # Generate quick insights
        quick_insights = analyzer.generate_insights_summary(df)
        
        progress(0.60, desc="🤖 AI analysis in progress...")
        
        # Get AI analysis
        ai_analysis = await analyzer.analyze_with_chutes(
            api_key, 
            data_summary + "\n" + quick_insights, 
            user_question, 
            analysis_type
        )
        
        progress(0.90, desc="✨ Finalizing results...")
        
        # Measured wall-clock duration of this call (previously a made-up
        # number derived from the current second).
        elapsed_seconds = (datetime.now() - started_at).total_seconds()
        
        # Format response with enhanced styling
        response = f"""# 🎯 Analysis Complete! 
        
## 📈 Key Findings
{ai_analysis}

{quick_insights}

---

**📊 Analysis Details:**
- **Processed**: {len(df):,} rows × {df.shape[1]} columns
- **Analysis Type**: {analysis_type.title()}
- **Processing Time**: ~{elapsed_seconds:.1f} seconds
- **AI Model**: OpenAI gpt-oss-20b
- **Generated**: {datetime.now().strftime('%H:%M:%S')}

*💡 Use the tabs below to explore data preview, download reports, or ask specific questions.*
"""
        
        # Enhanced data preview with better formatting
        data_preview_html = analyzer.generate_enhanced_preview(df)
        
        progress(1.0, desc="✅ Analysis complete!")
        
        return response, data_summary, data_preview_html, charts_html, file.name, ai_analysis

    except Exception as e:
        # Boundary handler: surface the failure to the UI instead of raising.
        logger.error(f"Comprehensive analysis error: {str(e)}")
        return f"❌ **Analysis Failed**: {str(e)}", "", "", "", None, ""

def sync_comprehensive_analysis(file, api_key, user_question="", analysis_type="comprehensive", sample_size=None, progress=gr.Progress()):
    """Blocking entry point that runs ``comprehensive_analysis`` to completion.

    All arguments are forwarded unchanged; the coroutine is executed with
    ``asyncio.run`` and its result tuple is returned as-is.
    """
    pending_analysis = comprehensive_analysis(
        file,
        api_key,
        user_question,
        analysis_type,
        sample_size,
        progress,
    )
    return asyncio.run(pending_analysis)

def quick_question_analysis(file, api_key, question, progress=gr.Progress()):
    """Answer one specific question about the uploaded data.

    Args:
        file: Uploaded file object forwarded to ``comprehensive_analysis``.
        api_key: API key forwarded to ``comprehensive_analysis``.
        question: The user's question. ``None``, empty and whitespace-only
            values are rejected with a prompt message.
        progress: Gradio progress callback.

    Returns:
        The analysis text (first element of the analysis result tuple), or a
        prompt asking for a question when none was given.
    """
    # `question` may be None as well as blank; guard before calling .strip().
    if not question or not question.strip():
        return "❓ Please enter a specific question about your data."
    
    result = asyncio.run(comprehensive_analysis(file, api_key, question, "question", None, progress))
    return result[0]  # Return just the analysis text

def generate_enhanced_preview(df: pd.DataFrame, rows: int = 20) -> str:
    """Build a styled HTML preview of ``df`` with quick numeric statistics.

    Args:
        df: The dataset to preview.
        rows: Maximum number of leading rows to show.

    Returns:
        An HTML fragment containing an inline stylesheet, a header with the
        row/column counts, a ``describe()`` table for numeric columns (when
        there are any) and a table of the first ``rows`` rows.
    """
    preview_df = df.head(rows)
    
    # Generate basic statistics for numeric columns
    stats_html = ""
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        stats_df = df[numeric_cols].describe().round(2)
        stats_html = f"""
        <div style="margin-bottom: 20px;">
            <h4>📊 Quick Statistics (Numeric Columns)</h4>
            {stats_df.to_html(classes="table table-striped", table_id="stats-table")}
        </div>
        """
    
    # Main data preview. Cell values come from the uploaded file, so they
    # must be HTML-escaped: with escape=False a value like "<script>..." or
    # "<NA>" would be interpreted as markup by the browser.
    preview_html = preview_df.to_html(
        classes="table table-striped table-hover",
        table_id="data-preview-table",
        escape=True
    )
    
    return f"""
    <style>
        .table {{
            width: 100%;
            border-collapse: collapse;
            margin: 20px 0;
            font-size: 14px;
            background: white;
            border-radius: 8px;
            overflow: hidden;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        }}
        .table th {{
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 12px 8px;
            text-align: left;
            font-weight: bold;
            position: sticky;
            top: 0;
            z-index: 10;
        }}
        .table td {{
            padding: 10px 8px;
            border-bottom: 1px solid #dee2e6;
            max-width: 200px;
            overflow: hidden;
            text-overflow: ellipsis;
            white-space: nowrap;
        }}
        .table tr:hover {{
            background-color: #f8f9ff;
        }}
        .table tr:nth-child(even) {{
            background-color: #f8f9fa;
        }}
        #stats-table {{
            font-size: 12px;
        }}
        #stats-table th {{
            background: linear-gradient(135deg, #28a745 0%, #20c997 100%);
        }}
        .preview-header {{
            background: linear-gradient(135deg, #e3f2fd 0%, #f3e5f5 100%);
            padding: 15px;
            border-radius: 8px;
            margin-bottom: 15px;
            border-left: 4px solid #667eea;
        }}
    </style>
    
    <div class="preview-header">
        <h4>📋 Data Preview - First {rows} Rows</h4>
        <p><strong>Total Rows:</strong> {len(df):,} | <strong>Columns:</strong> {df.shape[1]} | <strong>Showing:</strong> {len(preview_df)} rows</p>
    </div>
    
    {stats_html}
    {preview_html}
    """

# Expose the module-level function as an attribute of the shared analyzer so
# it can be called as analyzer.generate_enhanced_preview(df). It is assigned on
# the instance (not the class), so it stays a plain function and no `self` is
# passed when it is called.
analyzer.generate_enhanced_preview = generate_enhanced_preview

def clear_all_data():
    """Reset the shared analyzer's state and return blank output values.

    Returns:
        The 9-tuple ``(None, "", "", "", "", "", "", None, "")``, presumably
        one value per UI component being cleared (the component wiring is
        not shown here - verify against the caller).
    """
    fresh_state = {
        "current_df": None,
        "current_charts": None,
        "conversation_history": [],
        "analysis_cache": {},
    }
    for attribute, value in fresh_state.items():
        setattr(analyzer, attribute, value)

    blank = ""
    return (None, blank, blank, blank, blank, blank, blank, None, blank)

def export_report(analysis_text, data_summary, file_name, format_choice, ai_analysis=""):
    """Export the current analysis as a report file.

    ``ai_analysis`` is exported when it is non-empty; otherwise
    ``analysis_text`` is used.

    Args:
        analysis_text: Fallback analysis text.
        data_summary: Technical data summary to embed in the report.
        file_name: Name/path of the analysed file.
        format_choice: Report format forwarded to the analyzer.
        ai_analysis: Preferred analysis text.

    Returns:
        ``(path, status_message)`` from the analyzer, or
        ``(None, error_message)`` when there is nothing to export.
    """
    if not (analysis_text or ai_analysis):
        return None, "❌ No analysis data available for download."

    # Prefer the raw AI analysis over the formatted response text.
    chosen_text = ai_analysis or analysis_text
    outcome = analyzer.export_comprehensive_report(chosen_text, data_summary, file_name, format_choice)
    return outcome[0], outcome[1]

def batch_analyze_files(files, api_key, progress=gr.Progress()):
    """Analyze each uploaded file in turn and join the results into one report.

    Args:
        files: Uploaded file objects; each must expose its path as ``.name``.
        api_key: API key forwarded to ``comprehensive_analysis``.
        progress: Gradio progress tracker updated once per file.

    Returns:
        str: Markdown with one section per file (the first element of the
        analysis result, or the error message if that file failed), or an
        error message when no files were supplied.
    """
    if not files:
        return "❌ No files uploaded for batch analysis."

    total_files = len(files)
    sections = []

    for position, file in enumerate(files, start=1):
        display_name = os.path.basename(file.name)
        progress(position / total_files, desc=f"Processing file {position}/{total_files}: {display_name}")

        # A failure in one file (including a malformed result) is reported in
        # that file's section and must not abort the rest of the batch.
        try:
            outcome = asyncio.run(comprehensive_analysis(file, api_key, "", "quick", 1000, gr.Progress()))
            section = f"## 📄 {display_name}\n{outcome[0]}\n---\n"
        except Exception as e:
            section = f"## ❌ {display_name}\nError: {e!s}\n---\n"
        sections.append(section)

    return "\n".join(sections)

# Create the enhanced Gradio interface
# Everything created inside this context manager is registered on `app`,
# which is queued and launched under the `__main__` guard at the end of the
# file.
with gr.Blocks(
    title="🚀 AnalytixPro v2.0", 
    theme=gr.themes.Ocean(),
    # Custom stylesheet. Several selectors below are attached to components
    # through `elem_classes` (config-section, results-section, upload-area) or
    # appear in the raw HTML snippets (main-header, feature-grid,
    # feature-card).
    css="""
    .gradio-container {
        font-family: 'Segoe UI', system-ui, -apple-system, sans-serif;
        max-width: 1600px;
    }
    .main-header {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        padding: 30px;
        border-radius: 15px;
        margin-bottom: 20px;
        text-align: center;
    }
    .upload-area {
        border: 2px dashed #667eea;
        border-radius: 12px;
        padding: 25px;
        text-align: center;
        background: linear-gradient(135deg, #f8f9ff 0%, #fff 100%);
        transition: all 0.3s ease;
    }
    .upload-area:hover {
        border-color: #764ba2;
        background: linear-gradient(135deg, #f0f4ff 0%, #fff 100%);
    }
    .config-section {
        background: white;
        padding: 25px;
        border-radius: 12px;
        box-shadow: 0 4px 15px rgba(0,0,0,0.1);
        border-left: 4px solid #667eea;
    }
    .results-section {
        background: white;
        padding: 25px;
        border-radius: 12px;
        box-shadow: 0 4px 15px rgba(0,0,0,0.1);
        border-left: 4px solid #28a745;
    }
    .tab-content {
        background: white;
        border-radius: 8px;
        padding: 20px;
        margin-top: 10px;
    }
    .feature-grid {
        display: grid;
        grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
        gap: 15px;
        margin: 20px 0;
    }
    .feature-card {
        background: linear-gradient(135deg, #f8f9ff 0%, #fff 100%);
        padding: 20px;
        border-radius: 10px;
        border: 1px solid #e0e6ff;
        text-align: center;
    }
    """
) as app:
    
    # State variables
    # Both are outputs of the "Analyze Data" handler, are reset by "Clear All",
    # and are passed as inputs to the report export handler.
    current_file_name = gr.State("")
    current_ai_analysis = gr.State("")
    
    # Header
    # Static banner; styled by the `.main-header` rule in the CSS above.
    gr.HTML("""
    <div class="main-header">
        <h1>🚀 AnalytixPro v2.0</h1>
        <p>Advanced AI-Powered Data Analysis & Business Intelligence Platform</p>
        <p style="opacity: 0.9; margin-top: 10px;">
            ✨ Enhanced with Advanced Statistics • 🎯 Multi-format Support • 📊 Interactive Visualizations • 📱 Mobile Optimized
        </p>
    </div>
    """)
    
    # Top row: a narrow configuration/upload column (scale=1) next to a wider
    # results column (scale=2).
    with gr.Row():
        with gr.Column(scale=1, elem_classes=["config-section"]):
            gr.Markdown("### ⚙️ Configuration & Upload")
            
            # Rendered as a password field so the key is masked on screen.
            api_key_input = gr.Textbox(
                label="🔑 Chutes API Key",
                placeholder="sk-chutes-your-api-key-here...",
                type="password",
                lines=1,
                info="🔗 Get your free API key from chutes.ai"
            )
            
            with gr.Group():
                # Single-file upload; changing it refreshes the file
                # information panel below (see the `file_input.change` handler).
                file_input = gr.File(
                    label="📁 Upload Data File",
                    file_types=[".csv", ".xlsx", ".xls", ".json", ".parquet", ".tsv"],
                    file_count="single",
                    elem_classes=["upload-area"]
                )
                
                with gr.Row():
                    analysis_type = gr.Dropdown(
                        choices=["comprehensive", "quick", "statistical"],
                        value="comprehensive",
                        label="🎯 Analysis Type",
                        info="Choose analysis depth"
                    )
                    
                    # No initial value is set, so the field starts empty;
                    # the value is forwarded unchanged to the analysis handler.
                    sample_size = gr.Number(
                        label="📊 Sample Size",
                        # placeholder="Leave empty for full dataset",
                        minimum=100,
                        maximum=50000,
                        info="Optional: Limit rows for faster processing"
                    )
            
            with gr.Row():
                analyze_btn = gr.Button("🚀 Analyze Data", variant="primary", size="lg")
                clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
            
            # Enhanced file information panel
            # Filled by `update_file_stats` whenever `file_input` changes.
            with gr.Group():
                gr.Markdown("### 📊 File Information")
                file_stats = gr.HTML(
                    value="<div style='padding: 15px; background: #f8f9fa; border-radius: 8px; text-align: center;'>📄 Upload a file to see detailed information...</div>"
                )
        
        with gr.Column(scale=2, elem_classes=["results-section"]):
            gr.Markdown("### 🎯 Analysis Results")
            # Shows a static welcome text until the first analysis overwrites
            # it; its current value is also an input to the export handler.
            analysis_output = gr.Markdown(
                value="""## 📋 Welcome to AnalytixPro v2.0!

**🚀 Enhanced Features:**
- ✅ **Multi-format Support**: CSV, Excel, JSON, Parquet, TSV
- ✅ **Advanced Statistics**: Correlation, outlier detection, distribution analysis  
- ✅ **Interactive Visualizations**: Professional charts and dashboards
- ✅ **AI-Powered Insights**: GPT-powered business intelligence
- ✅ **Export Options**: HTML, Markdown
- ✅ **Batch Processing**: Analyze multiple files at once
- ✅ **Mobile Optimized**: Works on all devices

**📊 How to Get Started:**
1. Enter your Chutes API key
2. Upload your data file
3. Choose analysis type 
4. Click "Analyze Data"
5. Explore results in the tabs below!

*Ready for professional-grade data analysis! 🎯*""",
                show_label=False
            )
    
    # Enhanced tab interface
    # Tabbed area below the main row: Q&A, data preview, charts, technical
    # summary, report export and batch analysis.
    with gr.Tabs():
        with gr.Tab("💬 Ask Specific Questions", elem_id="questions-tab"):
            gr.Markdown("### 🔍 Interactive Data Q&A")
            with gr.Row():
                question_input = gr.Textbox(
                    label="❓ What would you like to know about your data?",
                    placeholder="""Try asking specific questions like:
• What are the top 5 performing segments by revenue?
• Are there any seasonal patterns in the sales data?
• Which customer segments have the highest lifetime value?
• What anomalies or outliers should I be concerned about?
• How do different product categories compare in profitability?
• What trends do you see in the time series data?""",
                    lines=4
                )
                
            with gr.Row():
                ask_btn = gr.Button("🔍 Get AI Answer", variant="primary")
                quick_insight_btn = gr.Button("💡 Quick Insights", variant="secondary")
                
            # Shared output for both buttons above.
            question_output = gr.Markdown()
        
        with gr.Tab("📊 Data Preview & Statistics"):
            gr.Markdown("### 📋 Dataset Explorer")
            with gr.Row():
                # Row count is only applied when "Refresh Preview" is clicked;
                # moving the slider alone has no handler attached.
                preview_rows = gr.Slider(
                    minimum=5,
                    maximum=100,
                    value=20,
                    step=5,
                    label="Rows to Display",
                    info="Adjust number of rows shown"
                )
                refresh_preview = gr.Button("🔄 Refresh Preview", variant="secondary")
            
            data_preview = gr.HTML(
                label="Dataset Preview",
                value="<div style='text-align: center; padding: 40px; color: #666;'>📄 Upload and analyze a file to see preview...</div>"
            )
        
        # NOTE(review): this tab is created with visible=False, yet
        # `charts_display` is still an output of the analyze handler —
        # confirm whether hiding the charts is intentional.
        with gr.Tab("📈 Visualizations & Charts", visible=False):
            gr.Markdown("### 🎨 Interactive Data Visualizations")
            charts_display = gr.HTML(
                value="<div style='text-align: center; padding: 40px; color: #666;'>📊 Charts will appear here after analysis...</div>"
            )
        
        with gr.Tab("🔍 Technical Summary"):
            gr.Markdown("### 📋 Detailed Technical Analysis")
            # Also passed to the export handler as the data summary.
            raw_summary = gr.Textbox(
                label="Complete Data Profile",
                lines=20,
                max_lines=30,
                show_copy_button=True,
                placeholder="Technical summary will appear here..."
            )
        
        with gr.Tab("💾 Export & Reports"):
            gr.Markdown("### 📥 Download Professional Reports")
            
            with gr.Row():
                format_choice = gr.Radio(
                    choices=["HTML", "Markdown"],
                    value="HTML",
                    label="📄 Report Format",
                    info="Choose your preferred export format"
                )
                
                # NOTE(review): this checkbox is not listed as an input of any
                # event handler in this file, so toggling it has no effect.
                include_charts = gr.Checkbox(
                    label="📊 Include Charts",
                    value=True,
                    info="Include visualizations in report"
                )
            
            with gr.Row():
                download_btn = gr.Button("📥 Generate Report", variant="primary", size="lg")
                # NOTE(review): no click handler is registered for this button.
                batch_export_btn = gr.Button("📦 Batch Export", variant="secondary")
            
            download_status = gr.Textbox(label="📋 Export Status", interactive=False)
            download_file = gr.File(label="📄 Download Your Report", visible=True)
        
        with gr.Tab("🔄 Batch Analysis"):
            gr.Markdown("### 📁 Analyze Multiple Files")
            gr.Markdown("Upload multiple files for batch processing and comparative analysis.")
            
            # Accepts only CSV/Excel, a narrower set than the single-file
            # upload in the configuration column.
            batch_files = gr.File(
                label="📁 Upload Multiple Files",
                file_count="multiple",
                file_types=[".csv", ".xlsx", ".xls"]
            )
            
            batch_analyze_btn = gr.Button("🔄 Batch Analyze", variant="primary")
            batch_results = gr.Markdown()
        
        # with gr.Tab("📊 Data Comparison"):
        #     gr.Markdown("### ⚖️ Compare Datasets")
        #     gr.Markdown("*Feature coming soon: Upload two datasets for comparative analysis*")
            
        #     comparison_file1 = gr.File(label="📄 First Dataset", file_count="single")
        #     comparison_file2 = gr.File(label="📄 Second Dataset", file_count="single")
        #     compare_btn = gr.Button("⚖️ Compare Datasets", variant="primary", interactive=False)
        #     comparison_results = gr.Markdown(value="*Comparison feature in development*")

    # Enhanced helper functions
    def update_file_stats(file):
        """Render the "File Information" panel for the currently selected upload.

        Args:
            file: Upload object whose ``name`` attribute is the path of the
                file on disk, or a falsy value when nothing is selected.

        Returns:
            str: An HTML fragment -- a placeholder when ``file`` is falsy, a
            details card (name, size in MB, format, estimated rows, current
            time, status) when the file can be inspected, or an error card
            carrying the exception message otherwise.
        """
        # Imported locally so this block does not rely on a module-level import.
        from html import escape

        if not file:
            return "<div style='padding: 15px; background: #f8f9fa; border-radius: 8px; text-align: center;'>📄 No file uploaded</div>"

        try:
            file_size = os.path.getsize(file.name) / (1024 * 1024)
            file_name = os.path.basename(file.name)
            file_ext = os.path.splitext(file_name)[1].lower()

            # Quick file peek for row estimation. Best effort only: a failure
            # here degrades to a label instead of hiding the whole card.
            try:
                if file_ext in ('.csv', '.tsv'):
                    # errors='replace' keeps the count working for files that
                    # are not valid UTF-8; only the line breaks matter here.
                    with open(file.name, 'r', encoding='utf-8', errors='replace') as f:
                        lines = sum(1 for _ in f)
                    # Subtract the header row; an empty file has 0 rows, not -1.
                    estimated_rows = max(lines - 1, 0)
                elif file_ext in ('.xlsx', '.xls'):
                    # Only verifies that the workbook can be opened; the rows
                    # themselves are not counted at this stage.
                    pd.read_excel(file.name, nrows=0)
                    estimated_rows = "Reading..."
                else:
                    estimated_rows = "Unknown"
            except Exception:
                estimated_rows = "Could not estimate"

            # The file name comes from the user's upload, so it is escaped
            # before being placed into the HTML.
            safe_name = escape(file_name)
            safe_format = escape(file_ext[1:].upper())

            return f"""
            <div style='padding: 20px; background: linear-gradient(135deg, #e8f4f8 0%, #f0f8ff 100%); border-radius: 10px; border: 1px solid #b3d9f2;'>
                <h4 style='color: #2c3e50; margin-bottom: 15px;'>📊 File Details</h4>
                <div style='display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 10px;'>
                    <div><strong>📄 Name:</strong><br>{safe_name}</div>
                    <div><strong>📏 Size:</strong><br>{file_size:.2f} MB</div>
                    <div><strong>🔧 Format:</strong><br>{safe_format} File</div>
                    <div><strong>📊 Est. Rows:</strong><br>{estimated_rows}</div>
                    <div><strong>⏰ Uploaded:</strong><br>{datetime.now().strftime('%H:%M:%S')}</div>
                    <div><strong>✅ Status:</strong><br>Ready to analyze</div>
                </div>
            </div>
            """
        except Exception as e:
            # Exception text may echo the (user-controlled) path, so escape it.
            return f"""
            <div style='padding: 15px; background: #f8d7da; border-radius: 8px; border: 1px solid #dc3545;'>
                ❌ <strong>File Error:</strong> {escape(str(e))}
            </div>
            """
    
    def handle_main_analysis(file, api_key, analysis_type, sample_size, progress=gr.Progress()):
        """Run the main analysis and shape its result for the six UI outputs.

        Args:
            file: Uploaded file object from the file input.
            api_key: API key entered by the user.
            analysis_type: Selected analysis type.
            sample_size: Optional sample size from the number field.
            progress: Gradio progress tracker forwarded to the analysis.

        Returns:
            tuple: Exactly six values, in the order of the "Analyze Data"
            outputs (analysis_output, raw_summary, data_preview,
            charts_display, current_file_name, current_ai_analysis). Extra
            items in the analysis result are dropped; missing trailing items
            are filled with empty strings.
        """
        expected_outputs = 6
        result = sync_comprehensive_analysis(file, api_key, "", analysis_type, sample_size, progress)

        # Pad uniformly so a short result (e.g. from an early error path) can
        # never raise IndexError or return fewer values than Gradio expects.
        values = list(result[:expected_outputs])
        values.extend([""] * (expected_outputs - len(values)))
        return tuple(values)
    
    def refresh_data_preview(rows):
        """Re-render the dataset preview using the requested number of rows.

        Args:
            rows: Row count taken from the "Rows to Display" slider.

        Returns:
            str: Preview HTML for the dataset held by the analyzer, or a
            placeholder message when no dataset has been loaded yet.
        """
        loaded_df = analyzer.current_df
        if loaded_df is None:
            return "<div style='text-align: center; padding: 40px; color: #666;'>📄 No data loaded</div>"
        return analyzer.generate_enhanced_preview(loaded_df, rows)
    
    # Event handlers
    # Main analysis: `handle_main_analysis` returns six values, one per
    # component listed in `outputs` (same order).
    analyze_btn.click(
        fn=handle_main_analysis,
        inputs=[file_input, api_key_input, analysis_type, sample_size],
        outputs=[analysis_output, raw_summary, data_preview, charts_display, current_file_name, current_ai_analysis],
        show_progress=True
    )
    
    # Free-form question about the uploaded file.
    ask_btn.click(
        fn=quick_question_analysis,
        inputs=[file_input, api_key_input, question_input],
        outputs=[question_output],
        show_progress=True
    )
    
    # Canned "quick insights" request: runs the "quick" analysis with a fixed
    # question, no sample size and its own gr.Progress(), and shows only the
    # first element of the result.
    quick_insight_btn.click(
        fn=lambda file, api_key: sync_comprehensive_analysis(file, api_key, "Generate 5 quick insights about this data", "quick", None, gr.Progress())[0],
        inputs=[file_input, api_key_input],
        outputs=[question_output],
        show_progress=True
    )
    
    # Refresh the file information panel whenever the upload changes.
    file_input.change(
        fn=update_file_stats,
        inputs=[file_input],
        outputs=[file_stats]
    )
    
    # Re-render the preview with the row count currently set on the slider.
    refresh_preview.click(
        fn=refresh_data_preview,
        inputs=[preview_rows],
        outputs=[data_preview]
    )
    
    # `clear_all_data` takes no inputs and returns nine values; they map
    # positionally onto the nine components listed here.
    clear_btn.click(
        fn=clear_all_data,
        outputs=[file_input, api_key_input, question_input, analysis_output, 
                question_output, data_preview, raw_summary, current_file_name, current_ai_analysis]
    )
    
    # Report export: returns (file, status) for the two outputs below.
    download_btn.click(
        fn=export_report,
        inputs=[analysis_output, raw_summary, current_file_name, format_choice, current_ai_analysis],
        outputs=[download_file, download_status]
    )
    
    # Batch analysis of every file uploaded in the batch tab.
    batch_analyze_btn.click(
        fn=batch_analyze_files,
        inputs=[batch_files, api_key_input],
        outputs=[batch_results],
        show_progress=True
    )
    
    # Enhanced features section
    # Static feature overview; layout comes from the `.feature-grid` and
    # `.feature-card` rules in the CSS passed to gr.Blocks.
    gr.HTML("""
    <div style="margin-top: 30px;">
        <h3 style="text-align: center; color: #2c3e50; margin-bottom: 20px;">🌟 Key Features & Capabilities</h3>
        <div class="feature-grid">
            <div class="feature-card">
                <h4>🔧 Advanced File Support</h4>
                <p>CSV, Excel, JSON, Parquet, TSV with intelligent type detection</p>
            </div>
            <div class="feature-card">
                <h4>📊 Statistical Analysis</h4>
                <p>Correlation matrices, outlier detection, distribution analysis</p>
            </div>
            <div class="feature-card">
                <h4>🤖 AI-Powered Insights</h4>
                <p>GPT-powered business intelligence and recommendations</p>
            </div>
            <div class="feature-card">
                <h4>📈 Interactive Charts</h4>
                <p>Professional visualizations with hover effects and zoom</p>
            </div>
            <div class="feature-card">
                <h4>💾 Multiple Export Formats</h4>
                <p>HTML, Markdown with embedded charts</p>
            </div>
            <div class="feature-card">
                <h4>🔄 Batch Processing</h4>
                <p>Analyze multiple files simultaneously for comparison</p>
            </div>
        </div>
    </div>
    """)
    
    # Collapsed-by-default help section; purely static Markdown.
    with gr.Accordion("💡 Pro Tips", open=False):
        gr.Markdown("""
        ### 🎯 Data Preparation:
        - ✅ Use descriptive column names (e.g., "Monthly_Revenue" instead of "Col1")
        - ✅ Ensure consistent date formats (YYYY-MM-DD recommended)
        - ✅ Remove completely empty rows/columns before upload
        - ✅ For large files (>10MB), consider using sample size option

        ### 🔍 Analysis Optimization:
        - **Comprehensive**: Full statistical analysis with AI insights (recommended for business reports)
        - **Quick**: Fast overview for initial data exploration
        - **Statistical**: Focus on mathematical relationships and patterns

        ### 📊 Question Examples for Better AI Responses:
        - "What factors most strongly correlate with customer churn?"
        - "Which time periods show the highest sales performance?"
        - "Are there any data quality issues I should address?"
        - "What are the key business opportunities in this dataset?"

        ### 📥 Export Recommendations:
        - **HTML**: Best for sharing interactive reports with stakeholders
        - **Markdown**: Great for technical documentation and version control

        ### ⚡ Performance Notes:
        - Files under 5MB: Instant processing
        - Files 5-20MB: ~5-10 seconds
        - Files 20MB+: Consider sampling for faster results

        ### 🔧 Supported Formats & Limits:
        - **CSV/TSV**: Up to 100MB
        - **Excel (XLSX/XLS)**: Up to 100MB  
        - **JSON**: Flat or nested structures
        - **Parquet**: High-performance columnar format

        ### 📞 Support & Contact:
        - 📱 WhatsApp: +8801719296601
        - 📧 Email: https://tinyurl.com/email-for-contact
        - 🕒 Response Time: Within 24 hours
        """)

if __name__ == "__main__":
    # Queue configuration, kept in one place so the limits are easy to tune.
    queue_settings = {
        "max_size": 20,
        "default_concurrency_limit": 5,
        "api_open": False,
    }
    app.queue(**queue_settings)

    # share=True asks Gradio for a public share link in addition to the
    # local server.
    app.launch(share=True)