from flask import Flask, render_template, request, flash, redirect, url_for
import matplotlib
matplotlib.use('Agg')  # headless backend: must be set before pyplot is imported
import matplotlib.pyplot as plt
import pandas as pd
import google.generativeai as genai
import os
import logging
from docx import Document
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from werkzeug.utils import secure_filename
import re
import ast
import json
from datetime import datetime

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__)
# SECURITY: read the session secret from the environment in production; the
# fallback literal is a development placeholder only.
app.secret_key = os.environ.get('SECRET_KEY', 'your-secret-key-here')
app.config['UPLOAD_FOLDER'] = 'uploads'
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16MB max file size

# Configure Gemini API.
# SECURITY FIX: the previous code embedded a real-looking API key as the
# env-var fallback. Secrets must never live in source control — the key is
# now read exclusively from the environment (rotate any key that was leaked).
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
if GOOGLE_API_KEY and GOOGLE_API_KEY != 'your-api-key-here':
    try:
        genai.configure(api_key=GOOGLE_API_KEY)
        model = genai.GenerativeModel('gemini-2.0-flash-exp')
        logger.info("Gemini API configured successfully")
    except Exception as e:
        logger.error(f"Failed to configure Gemini API: {e}")
        model = None
else:
    logger.warning("Gemini API key not configured")
    model = None


def ensure_upload_folder():
    """Create the upload folder if it doesn't exist.

    Raises:
        OSError: if the directory cannot be created.
    """
    try:
        # exist_ok avoids the check-then-create race of the original version.
        os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
    except Exception as e:
        logger.error(f"Failed to create upload folder: {e}")
        raise


def extract_text_from_docx(file_path):
    """Extract all text from a DOCX file (paragraphs and table cells).

    Args:
        file_path: path to the .docx file on disk.

    Returns:
        The extracted text as a single newline-joined string.

    Raises:
        Exception: re-raised from python-docx if the file cannot be parsed.
    """
    try:
        doc = Document(file_path)
        full_text = []
        for paragraph in doc.paragraphs:
            if paragraph.text.strip():  # skip empty paragraphs
                full_text.append(paragraph.text)
        # Event statistics are frequently laid out in tables, so harvest
        # table cell text as well.
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    if cell.text.strip():
                        full_text.append(cell.text)
        text = '\n'.join(full_text)
        logger.info(f"Extracted {len(text)} characters from document")
        return text
    except Exception as e:
        logger.error(f"Error extracting text from DOCX: {e}")
        raise


def extract_data_using_gemini(text):
    """Extract per-year event counts from document text using Gemini.

    Asks the model to return a Python-dict literal keyed by academic year
    ('2018-2019' … '2022-2023') with integer counts for five categories,
    then parses and validates the response so every expected year and
    category is present with an integer value.

    Args:
        text: raw text extracted from the uploaded document.

    Returns:
        The validated dict, or None when the model is not configured or the
        response cannot be parsed.
    """
    if not model:
        logger.error("Gemini model not configured")
        return None

    prompt = """
    Extract the event counts from the following text.
    Look for data organized by academic years from 2018-2019 to 2022-2023.

    Find numbers for these categories:
    - Cultural competitions/events
    - Sports competitions/events
    - Technical fest/Academic fest
    - Social activities/events
    - Any other events through Active clubs and forums

    Return ONLY a Python dictionary in this exact format:
    {
        '2022-2023': {'Cultural': X, 'Sports': Y, 'Technical': Z, 'Social': A, 'Other': B},
        '2021-2022': {'Cultural': X, 'Sports': Y, 'Technical': Z, 'Social': A, 'Other': B},
        '2020-2021': {'Cultural': X, 'Sports': Y, 'Technical': Z, 'Social': A, 'Other': B},
        '2019-2020': {'Cultural': X, 'Sports': Y, 'Technical': Z, 'Social': A, 'Other': B},
        '2018-2019': {'Cultural': X, 'Sports': Y, 'Technical': Z, 'Social': A, 'Other': B}
    }

    Replace X, Y, Z, A, B with the actual numbers from the text. If a number is not found, use 0.
    """

    try:
        # Debug: confirm the document actually mentions academic-year ranges.
        years = re.findall(r'(20\d{2}-20\d{2})', text)
        logger.info(f"Found years in text: {years}")

        # Generate response using Gemini
        response = model.generate_content(f"{text}\n\n{prompt}")
        response_text = response.text.strip()
        logger.info(f"Gemini response length: {len(response_text)}")

        # Strip a Markdown fence if the model wrapped the dict in one.
        if '```' in response_text:
            code_blocks = re.findall(r'```(?:python)?\s*(.*?)\s*```', response_text, re.DOTALL)
            if code_blocks:
                response_text = code_blocks[0].strip()

        # Remove trailing comments the model may have added.
        response_text = re.sub(r'#.*$', '', response_text, flags=re.MULTILINE)
        response_text = response_text.strip()
        logger.info(f"Cleaned response: {response_text[:200]}...")

        # Parse as a Python literal first; fall back to JSON (the naive quote
        # swap is best-effort and only correct when values contain no quotes).
        try:
            data = ast.literal_eval(response_text)
        except (ValueError, SyntaxError):
            response_text = response_text.replace("'", '"')
            data = json.loads(response_text)

        if not isinstance(data, dict):
            raise ValueError("Response is not a dictionary")

        # Backfill any missing year with zeros so downstream code can rely
        # on a fixed shape.
        expected_years = ['2022-2023', '2021-2022', '2020-2021', '2019-2020', '2018-2019']
        for year in expected_years:
            if year not in data:
                logger.warning(f"Missing year {year}, adding with zeros")
                data[year] = {'Cultural': 0, 'Sports': 0, 'Technical': 0, 'Social': 0, 'Other': 0}

        # Backfill missing categories and coerce every value to int.
        required_categories = ['Cultural', 'Sports', 'Technical', 'Social', 'Other']
        for year in data:
            for cat in required_categories:
                if cat not in data[year]:
                    logger.warning(f"Missing category {cat} in year {year}, setting to 0")
                    data[year][cat] = 0
                try:
                    data[year][cat] = int(data[year][cat])
                except (ValueError, TypeError):
                    data[year][cat] = 0

        logger.info(f"Successfully extracted data: {data}")
        return data
    except Exception as e:
        logger.error(f"Error processing with Gemini: {e}")
        return None


def get_graph_insights(data, plot_type):
    """Generate insights and SWOT analysis for different plot types.

    Args:
        data: dict mapping academic year -> {category: count}.
        plot_type: one of 'bar', 'pie', 'line', 'area'.

    Returns:
        dict with 'main_insight' (str), 'swot' (strengths / weaknesses /
        opportunities / threats lists) and 'recommendations' (list); a safe
        empty-structure fallback on any error.
    """
    try:
        # BUG FIX: sort_index() puts the years in chronological order. The
        # incoming dict is newest-first, so without sorting df.iloc[0] was the
        # *latest* year and df.iloc[-1] the *earliest*, which inverted the
        # trend direction and growth rate computed for the line chart.
        df = pd.DataFrame(data).T.sort_index()
        insights = {
            'main_insight': "",
            'swot': {
                'strengths': [],
                'weaknesses': [],
                'opportunities': [],
                'threats': []
            },
            'recommendations': []
        }

        if plot_type == 'bar':
            total_by_category = df.sum()
            max_category = total_by_category.idxmax()
            min_category = total_by_category.idxmin()
            avg_events = total_by_category.mean()

            insights['main_insight'] = f"The most active category is {max_category} with {int(total_by_category[max_category])} total events, while {min_category} has the least with {int(total_by_category[min_category])} events."
            insights['swot']['strengths'] = [
                f"Strong performance in {max_category} events ({int(total_by_category[max_category])} total)",
                f"Diverse event portfolio across {len(total_by_category)} categories",
                f"Average of {avg_events:.1f} events per category shows balanced approach"
            ]
            insights['swot']['weaknesses'] = [
                f"Underperformance in {min_category} category",
                f"Significant gap between highest and lowest performing categories",
                "Potential resource allocation imbalances"
            ]
            insights['swot']['opportunities'] = [
                f"Growth potential in {min_category} category",
                "Cross-category collaboration possibilities",
                "Opportunity to standardize event quality"
            ]
            insights['swot']['threats'] = [
                "Over-reliance on dominant categories",
                "Resource competition between categories",
                "Sustainability challenges for high-volume categories"
            ]
            insights['recommendations'] = [
                f"Increase focus on {min_category} events",
                "Implement balanced resource allocation strategy",
                "Develop cross-category event initiatives"
            ]

        elif plot_type == 'pie':
            latest_year = '2022-2023'
            year_data = data[latest_year]
            total = sum(year_data.values())
            max_cat = max(year_data.items(), key=lambda x: x[1])
            min_cat = min(year_data.items(), key=lambda x: x[1])

            if total > 0:
                percentage = (max_cat[1] / total) * 100
                insights['main_insight'] = f"In {latest_year}, {max_cat[0]} events dominated with {max_cat[1]} events ({percentage:.1f}% of total)."
            else:
                insights['main_insight'] = f"No events recorded for {latest_year}."

        elif plot_type == 'line':
            if len(df) > 1:
                # With the sorted index, iloc[0] is the earliest year and
                # iloc[-1] the latest, so the comparison now reads correctly.
                trend_direction = "increasing" if df.iloc[-1].mean() > df.iloc[0].mean() else "decreasing"
                growth_rate = ((df.iloc[-1].mean() - df.iloc[0].mean()) / df.iloc[0].mean() * 100) if df.iloc[0].mean() > 0 else 0
                insights['main_insight'] = f"Overall trend shows {trend_direction} pattern with {growth_rate:.1f}% change in average events."

        elif plot_type == 'area':
            # Previously there was no 'area' branch although create_plots()
            # requests one, so the area chart always rendered an empty insight.
            yearly_totals = df.sum(axis=1)
            if not yearly_totals.empty:
                peak_year = yearly_totals.idxmax()
                insights['main_insight'] = f"Cumulative activity peaked in {peak_year} with {int(yearly_totals[peak_year])} events across all categories."

        return insights
    except Exception as e:
        logger.error(f"Error generating insights: {e}")
        return {
            'main_insight': "Unable to generate insights for this visualization.",
            'swot': {'strengths': [], 'weaknesses': [], 'opportunities': [], 'threats': []},
            'recommendations': []
        }


def create_plots(data):
    """Create Plotly visualizations and summary statistics from the data.

    Args:
        data: dict mapping academic year -> {category: count}.

    Returns:
        dict with keys 'bar', 'pie', 'line', 'area' (each holding an embedded
        HTML 'plot' and its 'insight') plus 'stats'; None on failure.
    """
    plots = {}
    try:
        # Sort so the x-axis runs earliest -> latest year (the raw dict is
        # newest-first, which previously drew the charts in reverse order).
        df = pd.DataFrame(data).T.sort_index()

        # Color scheme for consistency across all charts.
        colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']

        # 1. Bar Chart - Events by Category and Year
        fig1 = px.bar(
            df,
            barmode='group',
            title='Event Distribution Across Years and Categories',
            labels={'index': 'Year', 'value': 'Number of Events', 'variable': 'Category'},
            color_discrete_sequence=colors
        )
        fig1.update_layout(
            xaxis_title="Academic Year",
            yaxis_title="Number of Events",
            legend_title="Event Category",
            template="plotly_white"
        )
        plots['bar'] = {
            'plot': pio.to_html(fig1, full_html=False, div_id="bar-chart"),
            'insight': get_graph_insights(data, 'bar')
        }

        # 2. Pie Chart - Latest Year Distribution
        latest_year = '2022-2023'
        if latest_year in data:
            fig2 = px.pie(
                names=list(data[latest_year].keys()),
                values=list(data[latest_year].values()),
                title=f'Event Distribution for {latest_year}',
                color_discrete_sequence=colors
            )
            fig2.update_traces(textposition='inside', textinfo='percent+label')
            plots['pie'] = {
                'plot': pio.to_html(fig2, full_html=False, div_id="pie-chart"),
                'insight': get_graph_insights(data, 'pie')
            }

        # 3. Line Chart - Trends Over Time
        fig3 = px.line(
            df,
            markers=True,
            title='Event Trends Over Years',
            labels={'index': 'Year', 'value': 'Number of Events', 'variable': 'Category'},
            color_discrete_sequence=colors
        )
        fig3.update_layout(
            xaxis_title="Academic Year",
            yaxis_title="Number of Events",
            legend_title="Event Category",
            template="plotly_white"
        )
        plots['line'] = {
            'plot': pio.to_html(fig3, full_html=False, div_id="line-chart"),
            'insight': get_graph_insights(data, 'line')
        }

        # 4. Stacked Area Chart
        fig4 = px.area(
            df,
            title='Cumulative Event Distribution Over Years',
            labels={'index': 'Year', 'value': 'Number of Events', 'variable': 'Category'},
            color_discrete_sequence=colors
        )
        fig4.update_layout(
            xaxis_title="Academic Year",
            yaxis_title="Number of Events",
            legend_title="Event Category",
            template="plotly_white"
        )
        plots['area'] = {
            'plot': pio.to_html(fig4, full_html=False, div_id="area-chart"),
            'insight': get_graph_insights(data, 'area')
        }

        # 5. Statistical Summary
        total_events = df.sum().sum()
        avg_events_per_year = df.sum(axis=1).mean()
        most_active_year = df.sum(axis=1).idxmax()
        most_common_category = df.sum().idxmax()
        stats = {
            'total_events': int(total_events),
            'avg_events_per_year': round(avg_events_per_year, 1),
            'most_active_year': most_active_year,
            'most_common_category': most_common_category,
            'category_totals': df.sum().to_dict(),
            'yearly_totals': df.sum(axis=1).to_dict()
        }
        plots['stats'] = stats

        logger.info("Successfully created all plots")
        return plots
    except Exception as e:
        logger.error(f"Error creating plots: {e}")
        return None


def allowed_file(filename):
    """Return True when the filename has an allowed (.docx) extension."""
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ['docx']


@app.route('/', methods=['GET', 'POST'])
def index():
    """Main route: GET renders the upload form; POST processes a DOCX upload."""
    plots = None
    if request.method == 'POST':
        # Validate the upload before touching the filesystem.
        if 'document' not in request.files:
            flash('No file uploaded. Please select a DOCX file.', 'error')
            return redirect(request.url)

        file = request.files['document']
        if file.filename == '':
            flash('No file selected. Please choose a DOCX file.', 'error')
            return redirect(request.url)

        if not allowed_file(file.filename):
            flash('Invalid file type. Please upload a DOCX file.', 'error')
            return redirect(request.url)

        if file:
            file_path = None
            try:
                ensure_upload_folder()

                # BUG FIX: the sanitized original filename is kept in the
                # saved name. The previous code computed secure_filename()
                # and then discarded it, saving every upload under the same
                # literal name without its .docx extension.
                filename = secure_filename(file.filename)
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                filename = f"{timestamp}_{filename}"
                file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)

                file.save(file_path)
                logger.info(f"File saved: {file_path}")

                text = extract_text_from_docx(file_path)
                if not text.strip():
                    flash('The uploaded document appears to be empty. Please check the file.', 'error')
                    return redirect(request.url)

                data = extract_data_using_gemini(text)
                if data:
                    plots = create_plots(data)
                    if plots:
                        flash('Document processed successfully! 🎉', 'success')
                    else:
                        flash('Error creating visualizations. Please try again.', 'error')
                else:
                    flash('Could not extract event data from the document. Please ensure the document contains event statistics in the expected format.', 'error')
            except Exception as e:
                logger.error(f"Error processing document: {e}")
                flash(f'Error processing document: {str(e)}', 'error')
            finally:
                # Always remove the uploaded file — the original only cleaned
                # up on the success path, leaking files on every failure.
                if file_path and os.path.exists(file_path):
                    try:
                        os.remove(file_path)
                        logger.info(f"Cleaned up file: {file_path}")
                    except OSError as e:
                        logger.warning(f"Could not remove file {file_path}: {e}")

    return render_template('index.html', plots=plots)


@app.errorhandler(413)
def too_large(e):
    """Handle file too large error."""
    flash("File too large. Please upload a file smaller than 16MB.", 'error')
    return redirect(request.url)


@app.errorhandler(404)
def not_found(e):
    """Handle 404 errors."""
    return render_template('404.html'), 404


@app.errorhandler(500)
def internal_error(e):
    """Handle internal server errors."""
    logger.error(f"Internal server error: {e}")
    flash('An internal error occurred. Please try again.', 'error')
    return redirect(url_for('index'))


if __name__ == '__main__':
    print("🚀 Starting Event Analytics Application...")
    print("📊 Upload a DOCX file to analyze event data")
    print("🔗 Access the application at: http://localhost:5001")
    if not model:
        print("⚠️ Warning: Gemini API not configured. Please set GOOGLE_API_KEY environment variable.")
    # NOTE(review): debug=True with host='0.0.0.0' exposes the Werkzeug
    # debugger to the network — disable debug before deploying.
    app.run(debug=True, port=5001, host='0.0.0.0')