# Spaces: shukdevdattaEX / Data-Summarizer-Excel-CSV
# Status: Running
# File size: 9,425 Bytes
# d513747

import gradio as gr
import pandas as pd
import aiohttp
import asyncio
import json
import io
import os
from typing import Optional, Tuple

class DataAnalyzer:
    """Summarize tabular files and request an AI analysis of the summary.

    The only instance state is ``api_base_url``, the chat-completions
    endpoint every analysis request is posted to.
    """

    def __init__(self):
        self.api_base_url = "https://llm.chutes.ai/v1/chat/completions"

    @staticmethod
    def _extract_delta_text(payload: str) -> str:
        """Return the text carried by one streamed JSON chunk, or ``""``.

        Malformed JSON, chunks without choices, and chunks whose ``delta`` or
        ``content`` is missing/null all yield an empty string so a single odd
        chunk never aborts the whole stream.
        """
        try:
            chunk = json.loads(payload)
        except json.JSONDecodeError:
            return ""
        if not isinstance(chunk, dict):
            return ""
        choices = chunk.get("choices") or []
        if not choices:
            return ""
        # "delta" may be present but null; treat that like an empty delta.
        delta = choices[0].get("delta") or {}
        content = delta.get("content")
        return content if isinstance(content, str) else ""

    async def analyze_with_chutes(self, api_token: str, data_summary: str, user_question: Optional[str] = None) -> str:
        """Send the dataset summary to the Chutes API and return the reply text.

        Args:
            api_token: Bearer token placed in the ``Authorization`` header.
            data_summary: Text summary of the dataset, embedded in the prompt.
            user_question: Optional follow-up question. When empty or ``None``
                a general five-point analysis is requested instead.

        Returns:
            The concatenated streamed answer; a fallback sentence when the
            stream carried no text; or an ``"Error ..."`` string when the
            request fails (exceptions derived from ``Exception`` are reported
            in the return value rather than raised).
        """
        headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json"
        }

        # Create the prompt based on whether it's initial analysis or follow-up question
        if user_question:
            prompt = f"""Based on this dataset summary:
{data_summary}
User question: {user_question}
Please provide a detailed answer based on the data."""
        else:
            prompt = f"""Analyze the following dataset and provide comprehensive insights:
{data_summary}
Please provide:
1. Key statistical insights
2. Notable patterns or trends
3. Data quality observations
4. Business recommendations
5. Potential areas for further analysis
Keep the analysis clear, actionable, and data-driven."""

        body = {
            "model": "openai/gpt-oss-20b",
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "stream": True,
            "max_tokens": 2048,
            "temperature": 0.3  # Lower temperature for more consistent analysis
        }

        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(self.api_base_url, headers=headers, json=body) as response:
                    if response.status != 200:
                        return f"Error: API request failed with status {response.status}"

                    # Collect pieces and join once instead of repeated "+=".
                    pieces = []
                    async for raw_line in response.content:
                        line = raw_line.decode("utf-8").strip()
                        # Server-sent events allow "data:" with or without a
                        # following space, so accept both forms.
                        if not line.startswith("data:"):
                            continue
                        payload = line[len("data:"):].strip()
                        if payload == "[DONE]":
                            break
                        text = self._extract_delta_text(payload)
                        if text:
                            pieces.append(text)

                    full_response = "".join(pieces)
                    return full_response if full_response else "No response received from the model."

        except Exception as e:
            return f"Error connecting to Chutes API: {str(e)}"

    def process_file(self, file_path: str) -> Tuple[pd.DataFrame, str]:
        """Load a CSV or Excel file and build its text summary.

        Args:
            file_path: Path of the file; the extension (case-insensitive)
                selects the reader: ``.csv``, ``.xlsx`` or ``.xls``.

        Returns:
            ``(df, summary)`` where ``summary`` comes from
            ``generate_data_summary``.

        Raises:
            Exception: For an unsupported extension or any read/summary
                failure. The message is prefixed with
                ``"Error processing file: "`` and the original error is
                attached as ``__cause__``.
        """
        try:
            file_extension = os.path.splitext(file_path)[1].lower()

            if file_extension == '.csv':
                df = pd.read_csv(file_path)
            elif file_extension in ('.xlsx', '.xls'):
                df = pd.read_excel(file_path)
            else:
                raise ValueError("Unsupported file format. Please upload CSV or Excel files.")

            # Generate comprehensive data summary
            summary = self.generate_data_summary(df)
            return df, summary

        except Exception as e:
            # Chain the cause so the original traceback stays available.
            raise Exception(f"Error processing file: {str(e)}") from e

    def generate_data_summary(self, df: pd.DataFrame) -> str:
        """Build a plain-text summary of ``df``.

        The summary lists the shape, per-column dtype and null counts,
        mean/std/range for numeric columns, unique counts and the most common
        value for object/category columns, and a preview of the first 5 rows.

        Args:
            df: The dataset to describe; may have zero rows.

        Returns:
            The summary sections joined with newlines.
        """
        summary = []
        row_count = len(df)

        # Basic info
        summary.append("Dataset Overview:")
        summary.append(f"- Shape: {df.shape[0]} rows × {df.shape[1]} columns")
        summary.append(f"- Total cells: {df.shape[0] * df.shape[1]:,}")

        # Column information
        summary.append("\nColumn Information:")
        for col, dtype in df.dtypes.items():
            null_count = df[col].isnull().sum()
            # Guard the division: a zero-row frame would otherwise print "nan%".
            null_pct = (null_count / row_count) * 100 if row_count else 0.0
            summary.append(f"- {col} ({dtype}): {null_count} nulls ({null_pct:.1f}%)")

        # Numerical columns statistics
        numeric_cols = df.select_dtypes(include=['number']).columns
        if len(numeric_cols) > 0:
            summary.append("\nNumerical Columns Summary:")
            for col in numeric_cols:
                stats = df[col].describe()
                summary.append(f"- {col}: Mean={stats['mean']:.2f}, Std={stats['std']:.2f}, Range=[{stats['min']:.2f}, {stats['max']:.2f}]")

        # Categorical columns
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns
        if len(categorical_cols) > 0:
            summary.append("\nCategorical Columns Summary:")
            for col in categorical_cols:
                unique_count = df[col].nunique()
                # Compute the mode once; it is empty when the column has no values.
                modes = df[col].mode()
                most_common = modes.iloc[0] if len(modes) > 0 else "N/A"
                summary.append(f"- {col}: {unique_count} unique values, Most common: '{most_common}'")

        # Sample data
        summary.append("\nFirst 5 rows preview:")
        summary.append(df.head().to_string())

        return "\n".join(summary)

# Module-level DataAnalyzer instance; analyze_data calls its process_file and
# analyze_with_chutes methods for every request.
analyzer = DataAnalyzer()

async def analyze_data(file, api_key, user_question=""):
    """Summarize an uploaded file and fetch the AI analysis for it.

    Args:
        file: The uploaded file, given either as a path (``str`` or
            ``os.PathLike``) or as an object whose ``name`` attribute holds
            the path.
        api_key: Chutes API token; surrounding whitespace is ignored.
        user_question: Optional question about the data. Blank or
            whitespace-only text requests the general analysis instead.

    Returns:
        A 3-tuple ``(markdown_report, data_summary, preview_html)`` where
        ``preview_html`` renders the first 10 rows. When validation fails or
        an error occurs, the first element is the message and the other two
        are empty strings.
    """
    if not file:
        return "Please upload a CSV or Excel file.", "", ""

    # A pasted token often carries stray whitespace; a whitespace-only value
    # is treated the same as a missing key.
    api_key = (api_key or "").strip()
    if not api_key:
        return "Please enter your Chutes API key.", "", ""

    user_question = (user_question or "").strip()

    try:
        # Accept both plain paths and upload objects exposing ``.name``.
        if isinstance(file, (str, os.PathLike)):
            file_path = os.fspath(file)
        else:
            file_path = file.name

        # Process the uploaded file
        df, data_summary = analyzer.process_file(file_path)

        # Get AI analysis
        ai_analysis = await analyzer.analyze_with_chutes(api_key, data_summary, user_question)

        # Format the complete response
        response = f"""## 📊 Data Analysis Complete!
### 📈 Dataset Overview:
{data_summary}
### 🤖 AI Insights & Recommendations:
{ai_analysis}
"""

        return response, data_summary, df.head(10).to_html()

    except Exception as e:
        # UI boundary: surface any failure as text instead of raising.
        return f"Error: {str(e)}", "", ""

def sync_analyze_data(file, api_key, user_question=""):
    """Blocking entry point: run ``analyze_data`` to completion and return its result."""
    pending = analyze_data(file, api_key, user_question)
    return asyncio.run(pending)

# Create the Gradio interface
# Layout: one row with an input column (scale=1) and a results column
# (scale=2), followed by two accordions that start closed (open=False) and a
# static tips section. Components are created inside nested context managers,
# so the statement order below is the on-screen order.
with gr.Blocks(title="📊 Smart Data Analyzer", theme=gr.themes.Ocean()) as app:
    gr.Markdown("""
    # 📊 Smart Data Analyzer
    ### Upload your CSV/Excel file and get instant AI-powered insights using OpenAI's gpt-oss-20b model via Chutes!
    """)
    
    with gr.Row():
        with gr.Column(scale=1):
            # File upload (single file, restricted to the listed extensions)
            file_input = gr.File(
                label="📁 Upload CSV or Excel File",
                file_types=[".csv", ".xlsx", ".xls"],
                file_count="single"
            )
            
            # API key input (type="password" textbox)
            api_key_input = gr.Textbox(
                label="🔑 Chutes API Key",
                placeholder="Enter your Chutes API token here...",
                type="password",
                lines=1
            )
            
            # Optional question input
            question_input = gr.Textbox(
                label="❓ Ask a Specific Question (Optional)",
                placeholder="e.g., What are the sales trends? Which region performs best?",
                lines=2
            )
            
            # Analyze button
            analyze_btn = gr.Button("🚀 Analyze Data", variant="primary", size="lg")
        
        with gr.Column(scale=2):
            # Results display; shows the placeholder text until the first run
            analysis_output = gr.Markdown(
                label="📋 Analysis Results",
                value="Upload a file and click 'Analyze Data' to see insights..."
            )
    
    # Additional outputs (accordions start closed)
    with gr.Accordion("📊 Data Preview", open=False):
        data_preview = gr.HTML(label="First 10 Rows")
    
    with gr.Accordion("🔍 Raw Data Summary", open=False):
        raw_summary = gr.Textbox(label="Dataset Summary", lines=10)
    
    # Event handlers
    # The outputs list follows the order of the tuple returned by
    # analyze_data: (markdown report, raw summary text, preview HTML).
    analyze_btn.click(
        fn=sync_analyze_data,
        inputs=[file_input, api_key_input, question_input],
        outputs=[analysis_output, raw_summary, data_preview]
    )
    
    # Example section (static usage tips and sample questions)
    gr.Markdown("""
    ### 💡 Tips for Best Results:
    - **File Size**: Keep files under 10MB for fastest processing
    - **API Key**: Get your free Chutes API key from [chutes.ai](https://chutes.ai)
    - **Questions**: Be specific! Ask about trends, patterns, outliers, or recommendations
    - **Formats**: Supports CSV, XLSX, and XLS files
    
    ### 🎯 Example Questions to Ask:
    - "What are the key trends in this sales data?"
    - "Which products are underperforming?"
    - "Are there any seasonal patterns?"
    - "What recommendations do you have based on this data?"
    """)

# Start the Gradio app only when this file is executed as a script
if __name__ == "__main__":
    app.launch(share=True)