Spaces:

pranit144
/

Institute_activity_anaylisis

Sleeping

App Files Files Community

Institute_activity_anaylisis / app.py

pranit144

Upload 2 files

3cc6b13 verified 3 months ago

raw

history blame contribute delete

17.9 kB

	from flask import Flask, render_template, request, flash, redirect, url_for
	import matplotlib

	matplotlib.use('Agg')
	import matplotlib.pyplot as plt
	import pandas as pd
	import google.generativeai as genai
	import os
	import logging
	from docx import Document
	import plotly.express as px
	import plotly.graph_objects as go
	import plotly.io as pio
	from werkzeug.utils import secure_filename
	import re
	import ast
	import json
	from datetime import datetime

	# Configure logging
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	app = Flask(__name__)
	# Flask signs the session cookie (which carries the flash() messages used
	# below) with this key, so a well-known placeholder value would let anyone
	# forge sessions. Read it from the environment instead. The random fallback
	# keeps local runs working, but it changes on every restart and differs
	# between worker processes, so set SECRET_KEY in any real deployment.
	app.secret_key = os.environ.get('SECRET_KEY') or os.urandom(32).hex()
	app.config['UPLOAD_FOLDER'] = 'uploads'
	app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16MB max file size

	# Configure Gemini API.
	# The key is read from the environment only: a credential committed to the
	# source is readable by everyone who can see the file.
	# NOTE(review): a key was previously embedded here as the fallback default;
	# it should be treated as leaked and revoked.
	GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
	if GOOGLE_API_KEY and GOOGLE_API_KEY != 'your-api-key-here':
	    try:
	        genai.configure(api_key=GOOGLE_API_KEY)
	        model = genai.GenerativeModel('gemini-2.0-flash-exp')
	        logger.info("Gemini API configured successfully")
	    except Exception as e:
	        # Keep the app importable without a working model; callers check
	        # `model` for None before using it.
	        logger.error(f"Failed to configure Gemini API: {e}")
	        model = None
	else:
	    logger.warning("Gemini API key not configured")
	    model = None


	def ensure_upload_folder():
	    """Create the configured upload folder if it doesn't exist.

	    Raises:
	        OSError: if the folder cannot be created (logged, then re-raised).
	    """
	    folder = app.config['UPLOAD_FOLDER']
	    # Only used to decide whether to log; the creation itself does not rely
	    # on this check.
	    already_present = os.path.isdir(folder)
	    try:
	        # exist_ok avoids the check-then-create race: two requests arriving
	        # together could both see "missing" and the slower one would fail
	        # with FileExistsError.
	        os.makedirs(folder, exist_ok=True)
	    except OSError as e:
	        logger.error(f"Failed to create upload folder: {e}")
	        raise
	    if not already_present:
	        logger.info(f"Created upload folder: {folder}")


	def extract_text_from_docx(file_path):
	    """Return the text of a DOCX file as one newline-joined string.

	    Body paragraphs come first, followed by the text of every table cell
	    (table by table, row by row). Entries that are empty or whitespace-only
	    are skipped. Any failure is logged and re-raised to the caller.
	    """

	    def table_cell_texts(document):
	        # Walk tables -> rows -> cells in document order.
	        for tbl in document.tables:
	            for tbl_row in tbl.rows:
	                for tbl_cell in tbl_row.cells:
	                    if tbl_cell.text.strip():
	                        yield tbl_cell.text

	    try:
	        document = Document(file_path)

	        pieces = [para.text for para in document.paragraphs if para.text.strip()]
	        pieces.extend(table_cell_texts(document))

	        text = '\n'.join(pieces)
	        logger.info(f"Extracted {len(text)} characters from document")
	        return text
	    except Exception as e:
	        logger.error(f"Error extracting text from DOCX: {e}")
	        raise


	_EXPECTED_YEARS = ('2022-2023', '2021-2022', '2020-2021', '2019-2020', '2018-2019')
	_REQUIRED_CATEGORIES = ('Cultural', 'Sports', 'Technical', 'Social', 'Other')


	def _parse_event_response(response_text):
	    """Turn the raw model reply into a Python object (a dict is expected)."""
	    # Take the contents of the first fenced code block, if there is one.
	    # The capture group must be lazy-but-unbounded (.*?) so it spans the
	    # whole literal; a single-character group never matches a real reply.
	    fenced = re.search(r'```(?:python|py|json)?\s*(.*?)\s*```', response_text, re.DOTALL)
	    if fenced:
	        response_text = fenced.group(1)

	    # Remove trailing comments the model may have added, then whitespace.
	    response_text = re.sub(r'#.*$', '', response_text, flags=re.MULTILINE).strip()
	    logger.info(f"Cleaned response: {response_text[:200]}...")

	    try:
	        # literal_eval only accepts Python literals, so it is safe on
	        # untrusted model output (unlike eval).
	        return ast.literal_eval(response_text)
	    except (ValueError, SyntaxError):
	        # Fallback to JSON parsing
	        return json.loads(response_text.replace("'", '"'))


	def _normalize_event_data(data):
	    """Return a copy of `data` with every expected year/category as an int."""
	    if not isinstance(data, dict):
	        raise ValueError("Response is not a dictionary")

	    normalized = {}
	    for year, counts in data.items():
	        if isinstance(counts, dict):
	            normalized[year] = dict(counts)
	        else:
	            # A malformed entry for one year should not discard the rest.
	            logger.warning(f"Entry for year {year} is not a dictionary, using zeros")
	            normalized[year] = {}

	    for year in _EXPECTED_YEARS:
	        if year not in normalized:
	            logger.warning(f"Missing year {year}, adding with zeros")
	            normalized[year] = {}

	    for year, counts in normalized.items():
	        for cat in _REQUIRED_CATEGORIES:
	            if cat not in counts:
	                logger.warning(f"Missing category {cat} in year {year}, setting to 0")
	                counts[cat] = 0
	            # Ensure values are integers
	            try:
	                counts[cat] = int(counts[cat])
	            except (ValueError, TypeError):
	                counts[cat] = 0

	    return normalized


	def extract_data_using_gemini(text):
	    """Extract per-year event counts from document text using Gemini.

	    Args:
	        text: plain text of the uploaded document.

	    Returns:
	        A dict mapping academic year ('2018-2019' ... '2022-2023', plus any
	        other year the model returned) to a dict with integer counts for
	        'Cultural', 'Sports', 'Technical', 'Social' and 'Other'. Missing
	        years and categories are filled with 0. Returns None when the model
	        is not configured or when the reply cannot be obtained or parsed.
	    """
	    if not model:
	        logger.error("Gemini model not configured")
	        return None

	    prompt = """
	Extract the event counts from the following text. Look for data organized by academic years from 2018-2019 to 2022-2023.

	Find numbers for these categories:
	- Cultural competitions/events
	- Sports competitions/events
	- Technical fest/Academic fest
	- Social activities/events
	- Any other events through Active clubs and forums

	Return ONLY a Python dictionary in this exact format:
	{
	'2022-2023': {'Cultural': X, 'Sports': Y, 'Technical': Z, 'Social': A, 'Other': B},
	'2021-2022': {'Cultural': X, 'Sports': Y, 'Technical': Z, 'Social': A, 'Other': B},
	'2020-2021': {'Cultural': X, 'Sports': Y, 'Technical': Z, 'Social': A, 'Other': B},
	'2019-2020': {'Cultural': X, 'Sports': Y, 'Technical': Z, 'Social': A, 'Other': B},
	'2018-2019': {'Cultural': X, 'Sports': Y, 'Technical': Z, 'Social': A, 'Other': B}
	}

	Replace X, Y, Z, A, B with the actual numbers from the text. If a number is not found, use 0.
	"""

	    try:
	        # Debug: Look for patterns in text
	        years = re.findall(r'(20\d{2}-20\d{2})', text)
	        logger.info(f"Found years in text: {years}")

	        # Generate response using Gemini
	        response = model.generate_content(f"{text}\n\n{prompt}")
	        response_text = response.text.strip()
	        logger.info(f"Gemini response length: {len(response_text)}")

	        data = _normalize_event_data(_parse_event_response(response_text))

	        logger.info(f"Successfully extracted data: {data}")
	        return data

	    except Exception as e:
	        # Boundary with an external service and free-form model output:
	        # callers rely on None (not an exception) to report failure.
	        logger.error(f"Error processing with Gemini: {e}")
	        return None


	def get_graph_insights(data, plot_type):
	    """Generate insights and SWOT analysis for different plot types.

	    Args:
	        data: dict mapping academic year (e.g. '2022-2023') to a dict of
	            category -> event count.
	        plot_type: 'bar', 'pie' or 'line'. Any other value (e.g. 'area')
	            yields the empty insight skeleton.

	    Returns:
	        A dict with keys 'main_insight' (str), 'swot' (dict of four lists)
	        and 'recommendations' (list). SWOT and recommendations are only
	        filled in for 'bar'. If anything fails, a fallback dict with a
	        generic 'main_insight' is returned instead of raising.
	    """
	    try:
	        df = pd.DataFrame(data).T

	        insights = {
	            'main_insight': "",
	            'swot': {
	                'strengths': [],
	                'weaknesses': [],
	                'opportunities': [],
	                'threats': [],
	            },
	            'recommendations': [],
	        }

	        if plot_type == 'bar':
	            total_by_category = df.sum()
	            max_category = total_by_category.idxmax()
	            min_category = total_by_category.idxmin()
	            max_total = int(total_by_category[max_category])
	            min_total = int(total_by_category[min_category])
	            avg_events = total_by_category.mean()

	            insights['main_insight'] = (
	                f"The most active category is {max_category} with {max_total} total events, "
	                f"while {min_category} has the least with {min_total} events."
	            )

	            insights['swot']['strengths'] = [
	                f"Strong performance in {max_category} events ({max_total} total)",
	                f"Diverse event portfolio across {len(total_by_category)} categories",
	                f"Average of {avg_events:.1f} events per category shows balanced approach",
	            ]

	            insights['swot']['weaknesses'] = [
	                f"Underperformance in {min_category} category",
	                "Significant gap between highest and lowest performing categories",
	                "Potential resource allocation imbalances",
	            ]

	            insights['swot']['opportunities'] = [
	                f"Growth potential in {min_category} category",
	                "Cross-category collaboration possibilities",
	                "Opportunity to standardize event quality",
	            ]

	            insights['swot']['threats'] = [
	                "Over-reliance on dominant categories",
	                "Resource competition between categories",
	                "Sustainability challenges for high-volume categories",
	            ]

	            insights['recommendations'] = [
	                f"Increase focus on {min_category} events",
	                "Implement balanced resource allocation strategy",
	                "Develop cross-category event initiatives",
	            ]

	        elif plot_type == 'pie':
	            # Same fixed year the pie chart itself is drawn for.
	            latest_year = '2022-2023'
	            year_data = data[latest_year]
	            total = sum(year_data.values())
	            max_cat = max(year_data.items(), key=lambda x: x[1])

	            if total > 0:
	                percentage = (max_cat[1] / total) * 100
	                insights['main_insight'] = (
	                    f"In {latest_year}, {max_cat[0]} events dominated with "
	                    f"{max_cat[1]} events ({percentage:.1f}% of total)."
	                )
	            else:
	                insights['main_insight'] = f"No events recorded for {latest_year}."

	        elif plot_type == 'line':
	            if len(df) > 1:
	                # Rows follow the dict's insertion order, which is usually
	                # newest-first. Sort so "first" really is the earliest year
	                # and "last" the latest ('YYYY-YYYY' labels sort
	                # chronologically as strings); otherwise the reported
	                # direction and growth are inverted.
	                yearly_avg = df.sort_index().mean(axis=1)
	                first_avg = yearly_avg.iloc[0]
	                last_avg = yearly_avg.iloc[-1]

	                trend_direction = "increasing" if last_avg > first_avg else "decreasing"
	                # Guard against division by zero when the first year is empty.
	                growth_rate = ((last_avg - first_avg) / first_avg * 100) if first_avg > 0 else 0
	                insights['main_insight'] = (
	                    f"Overall trend shows {trend_direction} pattern with "
	                    f"{growth_rate:.1f}% change in average events."
	                )

	        return insights

	    except Exception as e:
	        # Insights are decorative; never let them break plot generation.
	        logger.error(f"Error generating insights: {e}")
	        return {
	            'main_insight': "Unable to generate insights for this visualization.",
	            'swot': {'strengths': [], 'weaknesses': [], 'opportunities': [], 'threats': []},
	            'recommendations': []
	        }


	def create_plots(data):
	    """Create various plots and analyses from the data.

	    Args:
	        data: dict mapping academic year to a dict of category -> count.

	    Returns:
	        A dict with entries 'bar', 'line', 'area' (and 'pie' when
	        '2022-2023' is present in `data`), each holding the chart as an HTML
	        fragment under 'plot' and its insights under 'insight', plus a
	        'stats' entry with summary figures. Returns None if anything fails.
	    """
	    plots = {}

	    try:
	        # Rows follow the dict's insertion order, which is usually
	        # newest-first. Sort so the time axis of every chart runs from the
	        # earliest year to the latest.
	        df = pd.DataFrame(data).T.sort_index()

	        # Color scheme for consistency
	        colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
	        axis_labels = {'index': 'Year', 'value': 'Number of Events', 'variable': 'Category'}

	        def publish(plot_type, fig, time_axis=True):
	            # Apply the shared layout (year-based charts only), serialise the
	            # figure and attach the matching insights.
	            if time_axis:
	                fig.update_layout(
	                    xaxis_title="Academic Year",
	                    yaxis_title="Number of Events",
	                    legend_title="Event Category",
	                    template="plotly_white"
	                )
	            # NOTE(review): to_html is called with its default plotly.js
	            # handling for every chart; confirm against the template whether
	            # the library needs to be bundled with each fragment.
	            plots[plot_type] = {
	                'plot': pio.to_html(fig, full_html=False, div_id=f"{plot_type}-chart"),
	                'insight': get_graph_insights(data, plot_type)
	            }

	        # 1. Bar Chart - Events by Category and Year
	        publish('bar', px.bar(
	            df,
	            barmode='group',
	            title='Event Distribution Across Years and Categories',
	            labels=axis_labels,
	            color_discrete_sequence=colors
	        ))

	        # 2. Pie Chart - Latest Year Distribution
	        latest_year = '2022-2023'
	        if latest_year in data:
	            pie_fig = px.pie(
	                names=list(data[latest_year].keys()),
	                values=list(data[latest_year].values()),
	                title=f'Event Distribution for {latest_year}',
	                color_discrete_sequence=colors
	            )
	            pie_fig.update_traces(textposition='inside', textinfo='percent+label')
	            publish('pie', pie_fig, time_axis=False)

	        # 3. Line Chart - Trends Over Time
	        publish('line', px.line(
	            df,
	            markers=True,
	            title='Event Trends Over Years',
	            labels=axis_labels,
	            color_discrete_sequence=colors
	        ))

	        # 4. Stacked Area Chart
	        publish('area', px.area(
	            df,
	            title='Cumulative Event Distribution Over Years',
	            labels=axis_labels,
	            color_discrete_sequence=colors
	        ))

	        # 5. Statistical Summary
	        category_totals = df.sum()
	        yearly_totals = df.sum(axis=1)

	        plots['stats'] = {
	            'total_events': int(category_totals.sum()),
	            'avg_events_per_year': round(yearly_totals.mean(), 1),
	            'most_active_year': yearly_totals.idxmax(),
	            'most_common_category': category_totals.idxmax(),
	            'category_totals': category_totals.to_dict(),
	            'yearly_totals': yearly_totals.to_dict()
	        }

	        logger.info("Successfully created all plots")
	        return plots

	    except Exception as e:
	        # The route treats None as "could not build visualizations".
	        logger.error(f"Error creating plots: {e}")
	        return None


	def allowed_file(filename):
	    """Tell whether `filename` has a dot and ends in the .docx extension (any case)."""
	    _stem, dot, extension = filename.rpartition('.')
	    if not dot:
	        # No dot at all, so there is no extension to inspect.
	        return False
	    return extension.lower() == 'docx'


	@app.route('/', methods=['GET', 'POST'])
	def index():
	    """Main route for the application.

	    GET renders the upload page. POST expects a DOCX upload in the
	    'document' form field, extracts its text, asks Gemini for the event
	    counts and renders the page with the resulting plots. Validation
	    problems are reported with flash() followed by a redirect back to this
	    URL; processing errors are flashed and the page is rendered without
	    plots.
	    """
	    plots = None

	    if request.method != 'POST':
	        return render_template('index.html', plots=plots)

	    # Check if file is uploaded
	    if 'document' not in request.files:
	        flash('No file uploaded. Please select a DOCX file.', 'error')
	        return redirect(request.url)

	    file = request.files['document']

	    if file.filename == '':
	        flash('No file selected. Please choose a DOCX file.', 'error')
	        return redirect(request.url)

	    if not allowed_file(file.filename):
	        flash('Invalid file type. Please upload a DOCX file.', 'error')
	        return redirect(request.url)

	    # Stays None until a path has actually been chosen, so the cleanup in
	    # `finally` knows whether there is anything to remove.
	    file_path = None
	    try:
	        ensure_upload_folder()

	        # Secure the filename
	        filename = secure_filename(file.filename)
	        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
	        filename = f"{timestamp}_{filename}"
	        file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)

	        # Save the file
	        file.save(file_path)
	        logger.info(f"File saved: {file_path}")

	        # Extract text
	        text = extract_text_from_docx(file_path)

	        if not text.strip():
	            flash('The uploaded document appears to be empty. Please check the file.', 'error')
	            return redirect(request.url)

	        # Extract data using Gemini
	        data = extract_data_using_gemini(text)

	        if data:
	            # Create plots
	            plots = create_plots(data)
	            if plots:
	                flash('Document processed successfully! 🎉', 'success')
	            else:
	                flash('Error creating visualizations. Please try again.', 'error')
	        else:
	            flash(
	                'Could not extract event data from the document. Please ensure the document contains event statistics in the expected format.',
	                'error')

	    except Exception as e:
	        logger.error(f"Error processing document: {e}")
	        flash(f'Error processing document: {str(e)}', 'error')

	    finally:
	        # Clean up the uploaded file on every exit path, including the
	        # empty-document redirect and processing failures, so uploads do not
	        # accumulate on disk.
	        if file_path and os.path.exists(file_path):
	            try:
	                os.remove(file_path)
	                logger.info(f"Cleaned up file: {file_path}")
	            except OSError as e:
	                logger.warning(f"Could not remove file {file_path}: {e}")

	    return render_template('index.html', plots=plots)


	@app.errorhandler(413)
	def too_large(e):
	    """Report an oversized upload and send the client back to the URL it requested."""
	    message = "File too large. Please upload a file smaller than 16MB."
	    flash(message, 'error')
	    target = request.url
	    return redirect(target)


	@app.errorhandler(404)
	def not_found(e):
	    """Render the 404.html template with a 404 status code."""
	    page = render_template('404.html')
	    status = 404
	    return page, status


	@app.errorhandler(500)
	def internal_error(e):
	    """Handle internal server errors.

	    Logs the error, flashes a generic message and renders the main page
	    with a 500 status.
	    """
	    logger.error(f"Internal server error: {e}")
	    flash('An internal error occurred. Please try again.', 'error')
	    # Render directly instead of redirecting to `index`: a redirect hides
	    # the 500 status from the client, and if `index` is what fails the
	    # browser would be bounced between the route and this handler forever.
	    return render_template('index.html', plots=None), 500


	if __name__ == '__main__':
	    # Port and debug mode come from the environment. Debug stays off unless
	    # explicitly requested: the server listens on all interfaces, and the
	    # interactive debugger must not be reachable from the network.
	    port = int(os.environ.get('PORT', '5001'))
	    debug = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')

	    print("🚀 Starting Event Analytics Application...")
	    print("📊 Upload a DOCX file to analyze event data")
	    print(f"🔗 Access the application at: http://localhost:{port}")

	    if not model:
	        print("⚠️ Warning: Gemini API not configured. Please set GOOGLE_API_KEY environment variable.")

	    app.run(debug=debug, port=port, host='0.0.0.0')