# Provenance: uploaded by pranit144 ("Upload 2 files", commit 3cc6b13, verified).
from flask import Flask, render_template, request, flash, redirect, url_for
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import pandas as pd
import google.generativeai as genai
import os
import logging
from docx import Document
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from werkzeug.utils import secure_filename
import re
import ast
import json
from datetime import datetime
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__)

# Flask signs the session cookie (which carries flash messages) with this key,
# so it must never be a value that is published in the source. Prefer an
# operator-supplied key; otherwise generate a random one for this process.
# With the random fallback, flashed messages do not survive a restart and are
# not shared between multiple worker processes.
_configured_secret = os.environ.get('FLASK_SECRET_KEY')
if not _configured_secret:
    logger.warning("FLASK_SECRET_KEY not set; using a random per-process secret key")
app.secret_key = _configured_secret or os.urandom(32).hex()

app.config['UPLOAD_FOLDER'] = 'uploads'
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16MB max file size

# Configure Gemini API. The key is read from the environment only: a credential
# must not be committed as a fallback default.
# NOTE(review): any key that was previously embedded in this file should be
# treated as leaked and revoked/rotated.
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
if GOOGLE_API_KEY and GOOGLE_API_KEY != 'your-api-key-here':
    try:
        genai.configure(api_key=GOOGLE_API_KEY)
        model = genai.GenerativeModel('gemini-2.0-flash-exp')
        logger.info("Gemini API configured successfully")
    except Exception as e:
        # Keep the app bootable without the model; callers check `model`.
        logger.error(f"Failed to configure Gemini API: {e}")
        model = None
else:
    logger.warning("Gemini API key not configured")
    model = None
def ensure_upload_folder():
    """Create the configured upload folder if it does not already exist.

    Reads the target path from ``app.config['UPLOAD_FOLDER']``.

    Raises:
        Exception: whatever the lookup or ``os.makedirs`` raised, re-raised
            after being logged (for example when the path exists but is not
            a directory).
    """
    try:
        folder = app.config['UPLOAD_FOLDER']
        already_present = os.path.isdir(folder)
        # exist_ok=True keeps this safe when another request creates the
        # folder between the check above and the creation below.
        os.makedirs(folder, exist_ok=True)
        if not already_present:
            logger.info(f"Created upload folder: {folder}")
    except Exception as e:
        logger.error(f"Failed to create upload folder: {e}")
        raise
def extract_text_from_docx(file_path):
    """Return the non-blank text of a DOCX file as one newline-joined string.

    Paragraph texts are collected first, followed by the text of every cell
    reached by walking ``doc.tables`` -> ``rows`` -> ``cells``. Entries whose
    text is empty or whitespace-only are skipped.

    Raises:
        Exception: any error raised while opening or reading the document is
            logged and re-raised.
    """
    try:
        document = Document(file_path)

        # Body paragraphs first ...
        pieces = [para.text for para in document.paragraphs if para.text.strip()]

        # ... then the contents of every table cell.
        pieces.extend(
            cell.text
            for table in document.tables
            for row in table.rows
            for cell in row.cells
            if cell.text.strip()
        )

        text = '\n'.join(pieces)
        logger.info(f"Extracted {len(text)} characters from document")
        return text
    except Exception as e:
        logger.error(f"Error extracting text from DOCX: {e}")
        raise
# Academic years and categories every result is guaranteed to contain.
_EXPECTED_YEARS = ('2022-2023', '2021-2022', '2020-2021', '2019-2020', '2018-2019')
_EVENT_CATEGORIES = ('Cultural', 'Sports', 'Technical', 'Social', 'Other')

# Instruction appended after the document text when querying the model.
_EXTRACTION_PROMPT = """
Extract the event counts from the following text. Look for data organized by academic years from 2018-2019 to 2022-2023.
Find numbers for these categories:
- Cultural competitions/events
- Sports competitions/events
- Technical fest/Academic fest
- Social activities/events
- Any other events through Active clubs and forums
Return ONLY a Python dictionary in this exact format:
{
'2022-2023': {'Cultural': X, 'Sports': Y, 'Technical': Z, 'Social': A, 'Other': B},
'2021-2022': {'Cultural': X, 'Sports': Y, 'Technical': Z, 'Social': A, 'Other': B},
'2020-2021': {'Cultural': X, 'Sports': Y, 'Technical': Z, 'Social': A, 'Other': B},
'2019-2020': {'Cultural': X, 'Sports': Y, 'Technical': Z, 'Social': A, 'Other': B},
'2018-2019': {'Cultural': X, 'Sports': Y, 'Technical': Z, 'Social': A, 'Other': B}
}
Replace X, Y, Z, A, B with the actual numbers from the text. If a number is not found, use 0.
"""


def _parse_model_reply(reply_text):
    """Convert the model's raw reply into a Python object.

    Strips an optional Markdown code fence (with any or no language tag),
    ``#`` comments and any prose around the outermost ``{...}``, then parses
    the remainder as a Python literal, falling back to JSON.

    Raises:
        ValueError, SyntaxError: when the text cannot be parsed either way
            (``json.JSONDecodeError`` is a ``ValueError``).
    """
    cleaned = reply_text.strip()

    if '```' in cleaned:
        # The opening fence may carry any language tag (python, json, ...);
        # matching only "python" would leave e.g. "json" inside the payload.
        fenced = re.findall(r'```[A-Za-z0-9_+-]*\s*(.*?)\s*```', cleaned, re.DOTALL)
        if fenced:
            cleaned = fenced[0].strip()

    # Remove trailing "# ..." comments the model may have added.
    cleaned = re.sub(r'#.*$', '', cleaned, flags=re.MULTILINE).strip()

    # Drop explanatory prose before/after the dictionary, if any.
    start, end = cleaned.find('{'), cleaned.rfind('}')
    if 0 <= start < end:
        cleaned = cleaned[start:end + 1]

    logger.info(f"Cleaned response: {cleaned[:200]}...")

    try:
        # literal_eval only evaluates literals, so it is safe on model output.
        return ast.literal_eval(cleaned)
    except (ValueError, SyntaxError):
        # Fallback to JSON parsing (after converting single quotes).
        return json.loads(cleaned.replace("'", '"'))


def _normalise_event_counts(data):
    """Fill in missing years/categories with 0 and coerce counts to int, in place."""
    for year in _EXPECTED_YEARS:
        if year not in data:
            logger.warning(f"Missing year {year}, adding with zeros")
            data[year] = dict.fromkeys(_EVENT_CATEGORIES, 0)

    for year in list(data):
        counts = data[year]
        if not isinstance(counts, dict):
            # A malformed entry for one year should not discard the others.
            logger.warning(f"Entry for year {year} is not a dictionary, resetting to zeros")
            counts = data[year] = {}
        for cat in _EVENT_CATEGORIES:
            if cat not in counts:
                logger.warning(f"Missing category {cat} in year {year}, setting to 0")
                counts[cat] = 0
            try:
                counts[cat] = int(counts[cat])
            except (ValueError, TypeError, OverflowError):
                counts[cat] = 0
    return data


def extract_data_using_gemini(text):
    """Extract per-year event counts from document text using the Gemini model.

    Args:
        text: Plain text of the uploaded document.

    Returns:
        A dict mapping academic-year labels to ``{category: int}`` dicts. The
        five expected years and five categories are always present (missing
        values become 0). Returns ``None`` when the model is not configured
        or when the request/parsing fails; the failure is logged.
    """
    if not model:
        logger.error("Gemini model not configured")
        return None

    try:
        # Debug aid: which academic-year labels does the document mention?
        years = re.findall(r'(20\d{2}-20\d{2})', text)
        logger.info(f"Found years in text: {years}")

        response = model.generate_content(f"{text}\n\n{_EXTRACTION_PROMPT}")
        response_text = response.text.strip()
        logger.info(f"Gemini response length: {len(response_text)}")

        data = _parse_model_reply(response_text)
        if not isinstance(data, dict):
            raise ValueError("Response is not a dictionary")

        data = _normalise_event_counts(data)
        logger.info(f"Successfully extracted data: {data}")
        return data
    except Exception as e:
        logger.error(f"Error processing with Gemini: {e}")
        return None
def get_graph_insights(data, plot_type):
    """Generate a textual insight and SWOT analysis for one plot type.

    Args:
        data: Mapping of academic-year label -> ``{category: count}``.
        plot_type: ``'bar'``, ``'pie'`` or ``'line'``. Any other value (for
            example ``'area'``) yields the empty insight skeleton.

    Returns:
        dict with keys ``'main_insight'`` (str), ``'swot'`` (dict of four
        lists: strengths, weaknesses, opportunities, threats) and
        ``'recommendations'`` (list). Only the ``'bar'`` branch fills the
        SWOT lists and recommendations. On any error a fallback dict with a
        generic message is returned and the error is logged.
    """
    try:
        # Rows are years, columns are categories. Sort the year labels so
        # that the first row is the earliest year and the last row the
        # latest, whatever order the input dict happens to be in; the trend
        # calculation below depends on that ordering.
        df = pd.DataFrame(data).T.sort_index()

        insights = {
            'main_insight': "",
            'swot': {
                'strengths': [],
                'weaknesses': [],
                'opportunities': [],
                'threats': []
            },
            'recommendations': []
        }

        if plot_type == 'bar':
            total_by_category = df.sum()
            max_category = total_by_category.idxmax()
            min_category = total_by_category.idxmin()
            avg_events = total_by_category.mean()
            max_total = int(total_by_category[max_category])
            min_total = int(total_by_category[min_category])

            insights['main_insight'] = (
                f"The most active category is {max_category} with {max_total} total events, "
                f"while {min_category} has the least with {min_total} events."
            )
            insights['swot']['strengths'] = [
                f"Strong performance in {max_category} events ({max_total} total)",
                f"Diverse event portfolio across {len(total_by_category)} categories",
                f"Average of {avg_events:.1f} events per category shows balanced approach"
            ]
            insights['swot']['weaknesses'] = [
                f"Underperformance in {min_category} category",
                "Significant gap between highest and lowest performing categories",
                "Potential resource allocation imbalances"
            ]
            insights['swot']['opportunities'] = [
                f"Growth potential in {min_category} category",
                "Cross-category collaboration possibilities",
                "Opportunity to standardize event quality"
            ]
            insights['swot']['threats'] = [
                "Over-reliance on dominant categories",
                "Resource competition between categories",
                "Sustainability challenges for high-volume categories"
            ]
            insights['recommendations'] = [
                f"Increase focus on {min_category} events",
                "Implement balanced resource allocation strategy",
                "Develop cross-category event initiatives"
            ]

        elif plot_type == 'pie':
            latest_year = '2022-2023'
            year_data = data[latest_year]
            total = sum(year_data.values())
            max_cat = max(year_data.items(), key=lambda x: x[1])
            if total > 0:
                percentage = (max_cat[1] / total) * 100
                insights['main_insight'] = (
                    f"In {latest_year}, {max_cat[0]} events dominated with {max_cat[1]} events "
                    f"({percentage:.1f}% of total)."
                )
            else:
                insights['main_insight'] = f"No events recorded for {latest_year}."

        elif plot_type == 'line':
            if len(df) > 1:
                # Mean across categories for the earliest and latest year.
                first_mean = df.iloc[0].mean()
                last_mean = df.iloc[-1].mean()
                trend_direction = "increasing" if last_mean > first_mean else "decreasing"
                # Guard against division by zero when the first year is empty.
                growth_rate = ((last_mean - first_mean) / first_mean * 100) if first_mean > 0 else 0
                insights['main_insight'] = (
                    f"Overall trend shows {trend_direction} pattern with {growth_rate:.1f}% "
                    f"change in average events."
                )

        return insights
    except Exception as e:
        logger.error(f"Error generating insights: {e}")
        return {
            'main_insight': "Unable to generate insights for this visualization.",
            'swot': {'strengths': [], 'weaknesses': [], 'opportunities': [], 'threats': []},
            'recommendations': []
        }
def create_plots(data):
    """Create the Plotly charts and summary statistics for the event data.

    Args:
        data: Mapping of academic-year label -> ``{category: count}``.

    Returns:
        dict with entries ``'bar'``, ``'line'`` and ``'area'`` (plus ``'pie'``
        when ``'2022-2023'`` is present in ``data``), each of the form
        ``{'plot': <html fragment>, 'insight': <get_graph_insights result>}``,
        and a ``'stats'`` dict of totals. Returns ``None`` if anything fails;
        the error is logged.
    """
    plots = {}
    try:
        # Rows are years, columns are categories. Sort the year labels so the
        # x-axis of every chart runs from the earliest to the latest year
        # regardless of the order of the input dict.
        df = pd.DataFrame(data).T.sort_index()

        # Color scheme for consistency
        colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
        # Shared by the wide-form charts (bar, line, area).
        wide_labels = {'index': 'Year', 'value': 'Number of Events', 'variable': 'Category'}
        axis_layout = {
            'xaxis_title': "Academic Year",
            'yaxis_title': "Number of Events",
            'legend_title': "Event Category",
            'template': "plotly_white",
        }

        # 1. Bar Chart - Events by Category and Year
        fig1 = px.bar(
            df,
            barmode='group',
            title='Event Distribution Across Years and Categories',
            labels=wide_labels,
            color_discrete_sequence=colors
        )
        fig1.update_layout(**axis_layout)
        plots['bar'] = {
            'plot': pio.to_html(fig1, full_html=False, div_id="bar-chart"),
            'insight': get_graph_insights(data, 'bar')
        }

        # 2. Pie Chart - Latest Year Distribution
        latest_year = '2022-2023'
        if latest_year in data:
            fig2 = px.pie(
                names=list(data[latest_year].keys()),
                values=list(data[latest_year].values()),
                title=f'Event Distribution for {latest_year}',
                color_discrete_sequence=colors
            )
            fig2.update_traces(textposition='inside', textinfo='percent+label')
            plots['pie'] = {
                'plot': pio.to_html(fig2, full_html=False, div_id="pie-chart"),
                'insight': get_graph_insights(data, 'pie')
            }

        # 3. Line Chart - Trends Over Time
        fig3 = px.line(
            df,
            markers=True,
            title='Event Trends Over Years',
            labels=wide_labels,
            color_discrete_sequence=colors
        )
        fig3.update_layout(**axis_layout)
        plots['line'] = {
            'plot': pio.to_html(fig3, full_html=False, div_id="line-chart"),
            'insight': get_graph_insights(data, 'line')
        }

        # 4. Stacked Area Chart
        fig4 = px.area(
            df,
            title='Cumulative Event Distribution Over Years',
            labels=wide_labels,
            color_discrete_sequence=colors
        )
        fig4.update_layout(**axis_layout)
        plots['area'] = {
            'plot': pio.to_html(fig4, full_html=False, div_id="area-chart"),
            'insight': get_graph_insights(data, 'area')
        }

        # 5. Statistical Summary (each aggregate computed once and reused)
        category_totals = df.sum()
        yearly_totals = df.sum(axis=1)
        plots['stats'] = {
            'total_events': int(category_totals.sum()),
            'avg_events_per_year': round(yearly_totals.mean(), 1),
            'most_active_year': yearly_totals.idxmax(),
            'most_common_category': category_totals.idxmax(),
            'category_totals': category_totals.to_dict(),
            'yearly_totals': yearly_totals.to_dict()
        }

        logger.info("Successfully created all plots")
        return plots
    except Exception as e:
        logger.error(f"Error creating plots: {e}")
        return None
def allowed_file(filename):
    """Return True when the text after the last dot of ``filename`` is ``docx``.

    The comparison ignores case; names without any dot are rejected.
    """
    _, dot, extension = filename.rpartition('.')
    if dot != '.':
        return False
    return extension.lower() in ('docx',)
@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the upload page and, on POST, analyse the uploaded DOCX file.

    On POST the uploaded file is validated, saved to the upload folder,
    converted to text, sent to the Gemini extraction step and turned into
    plots. Outcomes are reported through flashed messages. The temporary
    upload is removed on every exit path once it has been given a path on
    disk.

    Returns:
        A redirect back to the request URL when the upload is missing,
        unnamed, of the wrong type or empty; otherwise the rendered
        ``index.html`` with ``plots`` (``None`` when nothing was produced).
    """
    plots = None
    if request.method != 'POST':
        return render_template('index.html', plots=plots)

    # Check if file is uploaded
    if 'document' not in request.files:
        flash('No file uploaded. Please select a DOCX file.', 'error')
        return redirect(request.url)

    file = request.files['document']
    if file.filename == '':
        flash('No file selected. Please choose a DOCX file.', 'error')
        return redirect(request.url)
    if not allowed_file(file.filename):
        flash('Invalid file type. Please upload a DOCX file.', 'error')
        return redirect(request.url)

    file_path = None
    try:
        ensure_upload_folder()

        # Secure the filename. The microsecond component keeps two uploads of
        # the same name within one second from overwriting each other.
        filename = secure_filename(file.filename)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        filename = f"{timestamp}_{filename}"
        file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)

        # Save the file
        file.save(file_path)
        logger.info(f"File saved: {file_path}")

        # Extract text
        text = extract_text_from_docx(file_path)
        if not text.strip():
            flash('The uploaded document appears to be empty. Please check the file.', 'error')
            return redirect(request.url)

        # Extract data using Gemini
        data = extract_data_using_gemini(text)
        if data:
            # Create plots
            plots = create_plots(data)
            if plots:
                flash('Document processed successfully! 🎉', 'success')
            else:
                flash('Error creating visualizations. Please try again.', 'error')
        else:
            flash(
                'Could not extract event data from the document. Please ensure the document contains event statistics in the expected format.',
                'error')
    except Exception as e:
        logger.error(f"Error processing document: {e}")
        flash(f'Error processing document: {str(e)}', 'error')
    finally:
        # Clean up the uploaded file on every path (success, early return for
        # an empty document, or failure) so uploads do not accumulate.
        if file_path is not None:
            try:
                os.remove(file_path)
                logger.info(f"Cleaned up file: {file_path}")
            except FileNotFoundError:
                # Saving failed before the file was written; nothing to remove.
                pass
            except Exception as e:
                logger.warning(f"Could not remove file {file_path}: {e}")

    return render_template('index.html', plots=plots)
@app.errorhandler(413)
def too_large(e):
    """Respond to HTTP 413 by flashing a size-limit message and redirecting.

    The redirect target is the URL of the request that triggered the error.
    """
    message = "File too large. Please upload a file smaller than 16MB."
    flash(message, 'error')
    target = request.url
    return redirect(target)
@app.errorhandler(404)
def not_found(e):
    """Respond to HTTP 404 with the rendered ``404.html`` template."""
    page = render_template('404.html')
    return page, 404
@app.errorhandler(500)
def internal_error(e):
    """Respond to HTTP 500: log the error, flash a notice, go to the index page."""
    logger.error(f"Internal server error: {e}")
    flash('An internal error occurred. Please try again.', 'error')
    home_url = url_for('index')
    return redirect(home_url)
if __name__ == '__main__':
    print("🚀 Starting Event Analytics Application...")
    print("📊 Upload a DOCX file to analyze event data")
    print("🔗 Access the application at: http://localhost:5001")
    if not model:
        print("⚠️ Warning: Gemini API not configured. Please set GOOGLE_API_KEY environment variable.")

    # The server listens on all interfaces, so the interactive debugger must
    # not be on by default: it allows arbitrary code execution for anyone who
    # can reach the port. Opt in explicitly with FLASK_DEBUG=1 for local work.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(debug=debug_enabled, port=5001, host='0.0.0.0')