Spaces:
Sleeping
Sleeping
import ast
import base64
import io
import json
import os
import re
import uuid

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import pandas as pd
import google.generativeai as genai
import plotly.express as px
import plotly.io as pio
from docx import Document
from flask import Flask, render_template, request
# Flask application and upload settings.
app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = 'uploads'
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16MB max file size

# Configure Gemini API.
# The key is read from the GOOGLE_API_KEY environment variable instead of
# being committed to the repository. Any key that was ever stored in source
# control must be treated as leaked and rotated.
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel('gemini-2.0-flash')
def ensure_upload_folder():
    """Create the configured upload directory if it does not exist yet.

    ``exist_ok=True`` makes the call idempotent and removes the gap between an
    existence check and the creation, in which a concurrent request could
    create the directory first and make ``os.makedirs`` raise
    ``FileExistsError``.
    """
    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
def extract_text_from_docx(file_path):
    """Return the text of a .docx file, including text stored in tables.

    ``Document.paragraphs`` only yields body-level paragraphs, so text that
    lives inside table cells would otherwise be missing from the result. The
    table rows are therefore appended after the body paragraphs, one line per
    row with the cell texts separated by tabs.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        A single newline-joined string: all body paragraphs first, then all
        table rows. The original interleaving of paragraphs and tables is not
        preserved.
    """
    doc = Document(file_path)
    lines = [paragraph.text for paragraph in doc.paragraphs]

    for table in doc.tables:
        for row in table.rows:
            # NOTE(review): horizontally merged cells may be reported once per
            # spanned column, so a merged header can repeat in the output.
            lines.append('\t'.join(cell.text.strip() for cell in row.cells))

    return '\n'.join(lines)
# Instructions appended to the document text before it is sent to the model.
_EXTRACTION_PROMPT = """
Extract the event counts from the following table format in the text:
2022-2023
Cultural competitions/events: NUMBER
Sports competitions/events: NUMBER
Technical fest/Academic fest: NUMBER
Social activities/events: NUMBER
Any other events through Active clubs and forums: NUMBER
2021-2022
Cultural competitions/events: NUMBER
Sports competitions/events: NUMBER
Technical fest/Academic fest: NUMBER
Social activities/events: NUMBER
Any other events through Active clubs and forums: NUMBER
2020-2021
Cultural competitions/events: NUMBER
Sports competitions/events: NUMBER
Technical fest/Academic fest: NUMBER
Social activities/events: NUMBER
Any other events through Active clubs and forums: NUMBER
2019-2020
Cultural competitions/events: NUMBER
Sports competitions/events: NUMBER
Technical fest/Academic fest: NUMBER
Social activities/events: NUMBER
Any other events through Active clubs and forums: NUMBER
2018-2019
Cultural competitions/events: NUMBER
Sports competitions/events: NUMBER
Technical fest/Academic fest: NUMBER
Social activities/events: NUMBER
Any other events through Active clubs and forums: NUMBER
Look for these exact numbers in the text. The data appears in a table with years and categories.
For each year, find:
- Number of Cultural competitions/events
- Number of Sports competitions/events
- Number of Technical fest/Academic fest events
- Number of Social activities/events
- Number of "Any other events through Active clubs and forums"
Return the data in this exact Python dictionary format:
{
'2022-2023': {'Cultural': 11, 'Sports': 10, 'Technical': 29, 'Social': 15, 'Other': 20},
'2021-2022': {'Cultural': 7, 'Sports': 8, 'Technical': 13, 'Social': 12, 'Other': 15},
'2020-2021': {'Cultural': 7, 'Sports': 9, 'Technical': 15, 'Social': 10, 'Other': 17},
'2019-2020': {'Cultural': 12, 'Sports': 17, 'Technical': 21, 'Social': 14, 'Other': 11},
'2018-2019': {'Cultural': 8, 'Sports': 17, 'Technical': 15, 'Social': 11, 'Other': 9}
}
Important:
- Use the EXACT numbers from the document
- Include ALL years from 2018-2019 to 2022-2023
- Make sure to find the correct table in the document that has these numbers
- Return only the Python dictionary, no other text
"""

# Years and categories the parsed model answer must contain.
_EXPECTED_YEARS = ('2022-2023', '2021-2022', '2020-2021', '2019-2020', '2018-2019')
_REQUIRED_CATEGORIES = ('Cultural', 'Sports', 'Technical', 'Social', 'Other')

# Marker that opens and closes a markdown code block in the model answer.
_CODE_FENCE = '`' * 3


def _print_regex_scan(text):
    """Print what a plain regex scan finds in *text* (debugging aid only)."""
    print("\nSearching in text:")
    print("=" * 50)
    years = re.findall(r'(20\d{2}-20\d{2})', text)
    print(f"Found years: {years}")
    cultural = re.findall(r'Cultural competitions/events\s*(\d+)', text)
    sports = re.findall(r'Sports competitions/events\s*(\d+)', text)
    technical = re.findall(r'Technical fest/Academic fest\s*(\d+)', text)
    other = re.findall(r'Any other events.*?(\d+)', text)
    social = re.findall(r'Social activities/events\s*(\d+)', text)
    print(f"Found cultural numbers: {cultural}")
    print(f"Found sports numbers: {sports}")
    print(f"Found technical numbers: {technical}")
    print(f"Found other numbers: {other}")
    print(f"Found social numbers: {social}")
    print("=" * 50)


def _strip_code_fence(response_text):
    """Return *response_text* without a surrounding markdown code fence."""
    if _CODE_FENCE in response_text:
        # Text between the first pair of fences is the payload.
        response_text = response_text.split(_CODE_FENCE)[1]
        first_line, newline, remainder = response_text.partition('\n')
        # An opening fence may carry a language tag on its own line.
        if newline and first_line.strip().lower() in ('python', 'py', 'json'):
            response_text = remainder
    return response_text.strip()


def _parse_event_dict(response_text):
    """Parse the model answer as a Python literal, falling back to JSON."""
    try:
        return ast.literal_eval(response_text)
    except (ValueError, SyntaxError, TypeError):
        # Fallback to JSON parsing if the text is not a valid Python literal.
        return json.loads(response_text.replace("'", '"'))


def _validate_event_data(data):
    """Raise ValueError unless *data* maps years to numeric category counts."""
    if not isinstance(data, dict):
        raise ValueError("Response is not a dictionary")
    if not all(year in data for year in _EXPECTED_YEARS):
        raise ValueError("Missing some years in the data")
    for year, counts in data.items():
        if not isinstance(counts, dict):
            raise ValueError(f"Entry for year {year} is not a dictionary")
        if not all(cat in counts for cat in _REQUIRED_CATEGORIES):
            raise ValueError(f"Missing categories in year {year}")
        for cat in _REQUIRED_CATEGORIES:
            value = counts[cat]
            # bool is a subclass of int but is never a valid event count.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                raise ValueError(f"Non-numeric count for {cat} in year {year}")


def extract_data_using_gemini(text):
    """Ask the Gemini model for the per-year event counts found in *text*.

    The document text followed by the extraction prompt is sent to the
    module-level ``model``. The answer is stripped of markdown code fences,
    parsed as a Python literal (JSON as a fallback) and validated.

    Args:
        text: Plain text extracted from the uploaded document.

    Returns:
        A dict mapping each year label (e.g. ``'2022-2023'``) to a dict with
        the numeric keys ``Cultural``, ``Sports``, ``Technical``, ``Social``
        and ``Other``; or ``None`` if the request, parsing or validation
        failed (the reason is printed).
    """
    response_text = None
    try:
        _print_regex_scan(text)

        response = model.generate_content(text + "\n" + _EXTRACTION_PROMPT)
        response_text = response.text.strip()
        print("Raw response:", response_text)

        response_text = _strip_code_fence(response_text)
        print("Cleaned response:", response_text)

        data = _parse_event_dict(response_text)
        _validate_event_data(data)
        return data
    except Exception as e:
        # Deliberately broad: this is the boundary to the remote API, and the
        # caller treats None as "could not extract".
        print(f"Error processing with Gemini: {str(e)}")
        shown = response_text if response_text is not None else 'No response text'
        print(f"Response text was: {shown}")
        return None
def _build_insight(main_insight, strengths, weaknesses, opportunities,
                   threats, recommendations):
    """Assemble the insight dict shape shared by every plot type."""
    return {
        'main_insight': main_insight,
        'swot': {
            'strengths': strengths,
            'weaknesses': weaknesses,
            'opportunities': opportunities,
            'threats': threats,
        },
        'recommendations': recommendations,
    }


def _bar_insights(df):
    """Insights on category totals summed over all years."""
    total_by_category = df.sum()
    max_category = total_by_category.idxmax()
    min_category = total_by_category.idxmin()
    avg_events = total_by_category.mean()
    return _build_insight(
        f"The most frequent event category overall is {max_category} with {int(total_by_category[max_category])} events, while {min_category} has the least with {int(total_by_category[min_category])} events.",
        strengths=[
            f"Strong performance in {max_category} events",
            "Diverse range of events across categories",
            f"Average of {avg_events:.1f} events per category",
        ],
        weaknesses=[
            f"Low participation in {min_category} events",
            "Uneven distribution across categories",
            "Potential resource allocation issues",
        ],
        opportunities=[
            f"Room for growth in {min_category} category",
            "Potential for cross-category events",
            "Scope for balanced development",
        ],
        threats=[
            "Risk of over-dependence on dominant category",
            "Resource strain in peak periods",
            "Sustainability challenges",
        ],
        recommendations=[
            f"Consider boosting {min_category} events",
            "Implement balanced resource allocation",
            "Develop cross-category initiatives",
        ],
    )


def _pie_insights(data):
    """Insights on the category split of the latest year."""
    # Keep the year the pie chart is drawn for; fall back to the most recent
    # label only when that year is absent (would otherwise be a KeyError).
    latest_year = '2022-2023' if '2022-2023' in data else max(data)
    year_data = data[latest_year]
    total = sum(year_data.values())
    max_cat = max(year_data.items(), key=lambda item: item[1])
    min_cat = min(year_data.items(), key=lambda item: item[1])
    # A year without any events has no meaningful share; avoid dividing by 0.
    percentage = (max_cat[1] / total) * 100 if total else 0.0
    return _build_insight(
        f"In {latest_year}, {max_cat[0]} events dominated with {max_cat[1]} events ({percentage:.1f}% of total events).",
        strengths=[
            f"Strong presence in {max_cat[0]} category",
            "Clear category leadership",
            "Established event structure",
        ],
        weaknesses=[
            f"Under-representation in {min_cat[0]} category",
            "Imbalanced distribution",
            "Resource concentration risks",
        ],
        opportunities=[
            "Potential for category diversification",
            "Growth in underserved categories",
            "New event type development",
        ],
        threats=[
            "Category saturation risk",
            "Resource allocation challenges",
            "Sustainability concerns",
        ],
        recommendations=[
            "Diversify event portfolio",
            f"Strengthen {min_cat[0]} category",
            "Implement balanced growth strategy",
        ],
    )


def _line_insights(df):
    """Insights on the overall trend from the first to the last year."""
    first_mean = df.iloc[0].mean()
    last_mean = df.iloc[-1].mean()
    trend = "increasing" if last_mean > first_mean else "decreasing"
    growth_rate = (last_mean - first_mean) / first_mean * 100
    return _build_insight(
        f"The overall trend shows a {trend} pattern with a {growth_rate:.1f}% change in event frequency over the years.",
        strengths=[
            f"Consistent {trend} trend",
            "Clear growth trajectory",
            "Established pattern",
        ],
        weaknesses=[
            "Fluctuations in growth rate",
            "Periodic inconsistencies",
            "Resource scaling challenges",
        ],
        opportunities=[
            "Growth optimization potential",
            "Pattern regularization",
            "Strategic planning possibilities",
        ],
        threats=[
            "Sustainability of growth rate",
            "Resource management challenges",
            "Market saturation risks",
        ],
        recommendations=[
            "Develop sustainable growth plan",
            "Implement resource scaling strategy",
            "Monitor growth patterns",
        ],
    )


def _growth_insights(df):
    """Insights on year-over-year percentage changes per category."""
    growth_rates = df.pct_change() * 100
    # A zero count followed by a non-zero one yields an infinite rate that
    # would swamp the mean/max/min below; treat it as "not available".
    growth_rates = growth_rates.replace(
        [float('inf'), float('-inf')], float('nan')
    )
    avg_growth = growth_rates.mean().mean()
    max_growth = growth_rates.max().max()
    min_growth = growth_rates.min().min()
    return _build_insight(
        f"The average year-over-year growth rate is {avg_growth:.1f}%, with peaks of {max_growth:.1f}% and lows of {min_growth:.1f}%.",
        strengths=[
            "Positive average growth rate",
            "Strong peak performance periods",
            "Growth momentum",
        ],
        weaknesses=[
            "Growth rate volatility",
            "Negative growth periods",
            "Inconsistent patterns",
        ],
        opportunities=[
            "Growth stabilization potential",
            "Performance optimization",
            "Strategic growth planning",
        ],
        threats=[
            "Growth sustainability",
            "Resource scaling challenges",
            "Market fluctuations",
        ],
        recommendations=[
            "Stabilize growth patterns",
            "Develop contingency plans",
            "Implement growth monitoring",
        ],
    )


def _area_insights(df):
    """Insights on the change in total events from first to last year."""
    first_total = df.iloc[0].sum()
    last_total = df.iloc[-1].sum()
    total_growth = (last_total - first_total) / first_total * 100
    # With a single year there are no intervals; avoid dividing by zero.
    avg_yearly_growth = total_growth / max(len(df) - 1, 1)
    return _build_insight(
        f"The cumulative events show a {total_growth:.1f}% total change, averaging {avg_yearly_growth:.1f}% yearly growth.",
        strengths=[
            "Consistent cumulative growth",
            "Strong overall trajectory",
            "Clear progress pattern",
        ],
        weaknesses=[
            "Growth rate variations",
            "Resource scaling challenges",
            "Potential sustainability issues",
        ],
        opportunities=[
            "Long-term growth potential",
            "Pattern optimization",
            "Strategic expansion",
        ],
        threats=[
            "Scaling challenges",
            "Resource constraints",
            "Market saturation",
        ],
        recommendations=[
            "Develop long-term growth strategy",
            "Implement resource planning",
            "Monitor cumulative trends",
        ],
    )


def get_graph_insights(data, plot_type):
    """Generate detailed insights including SWOT analysis for a plot type.

    Args:
        data: Dict mapping year labels (``'YYYY-YYYY'``) to dicts of
            category -> event count.
        plot_type: One of ``'bar'``, ``'pie'``, ``'line'``, ``'growth'`` or
            ``'area'``. Any other value yields a placeholder result.

    Returns:
        A dict with the keys ``main_insight`` (str), ``swot`` (dict with the
        lists ``strengths``, ``weaknesses``, ``opportunities``, ``threats``)
        and ``recommendations`` (list of str).
    """
    # Rows follow the insertion order of ``data``, which may be newest-first.
    # Sorting the year labels puts them in chronological order so that
    # "first" and "last" row, and pct_change, run forward in time.
    df = pd.DataFrame(data).T.sort_index()

    if plot_type == 'bar':
        return _bar_insights(df)
    if plot_type == 'pie':
        return _pie_insights(data)
    if plot_type == 'line':
        return _line_insights(df)
    if plot_type == 'growth':
        return _growth_insights(df)
    if plot_type == 'area':
        return _area_insights(df)

    return _build_insight(
        "No specific insights available for this visualization.",
        strengths=[],
        weaknesses=[],
        opportunities=[],
        threats=[],
        recommendations=[],
    )
def create_plots(data):
    """Build the HTML plots, their insights and summary statistics.

    Args:
        data: Dict mapping year labels (``'YYYY-YYYY'``) to dicts of
            category -> event count.

    Returns:
        A dict with one entry per chart (``bar``, ``pie``, ``line``,
        ``growth``, ``area``), each holding the chart as an HTML fragment
        under ``plot`` and the result of ``get_graph_insights`` under
        ``insight``, plus a ``stats`` entry with aggregate figures.
    """
    # Rows follow the insertion order of ``data``, which may be newest-first.
    # Sorting the year labels makes the x axes, pct_change and the
    # first/last-row growth figures run forward in time.
    df = pd.DataFrame(data).T.sort_index()

    def render(fig, plot_type):
        """Pair a figure's HTML fragment with the insights for its type."""
        return {
            'plot': pio.to_html(fig, full_html=False),
            'insight': get_graph_insights(data, plot_type),
        }

    plots = {}

    # Bar Chart
    bar_fig = px.bar(df, barmode='group', title='Events Distribution Across Years')
    plots['bar'] = render(bar_fig, 'bar')

    # Pie Chart
    # Fall back to the most recent label only when the usual year is absent
    # (indexing it directly would otherwise raise KeyError).
    latest_year = '2022-2023' if '2022-2023' in data else max(data)
    latest_counts = data[latest_year]
    pie_fig = px.pie(
        names=list(latest_counts.keys()),
        values=list(latest_counts.values()),
        title=f'Event Distribution for {latest_year}',
    )
    plots['pie'] = render(pie_fig, 'pie')

    # Line Chart
    line_fig = px.line(df, markers=True, title='Event Trends Over Years')
    plots['line'] = render(line_fig, 'line')

    # Growth Rate Chart
    growth_rates = df.pct_change() * 100
    growth_fig = px.bar(growth_rates, title='Year-over-Year Growth Rate by Category')
    plots['growth'] = render(growth_fig, 'growth')

    # Area Chart
    area_fig = px.area(df, title='Cumulative Events Distribution')
    plots['area'] = render(area_fig, 'area')

    # Statistical Analysis
    events_per_year = df.sum(axis=1)
    events_per_category = df.sum()
    first_year = df.iloc[0]
    last_year = df.iloc[-1]
    plots['stats'] = {
        'total_events': events_per_category.sum(),
        'avg_events_per_year': events_per_year.mean().round(2),
        'most_active_year': events_per_year.idxmax(),
        'most_common_category': events_per_category.idxmax(),
        'growth_analysis': {
            'total_growth': ((last_year.sum() - first_year.sum()) / first_year.sum() * 100).round(2),
            'category_growth': ((last_year - first_year) / first_year * 100).round(2).to_dict(),
        },
    }
    return plots
@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the upload page and, on POST, analyse the uploaded .docx file.

    GET renders the empty page. POST expects a ``document`` file field with a
    ``.docx`` file. The file is saved temporarily, its text is extracted and
    sent to the model, and the resulting plots are rendered. The temporary
    file is always removed again.

    Returns:
        The rendered ``index.html`` template with ``plots`` and/or ``error``.
    """
    plots = None
    error_message = None

    if request.method != 'POST':
        return render_template('index.html', plots=plots, error=error_message)

    if 'document' not in request.files:
        return render_template('index.html', error='No file uploaded')

    file = request.files['document']
    if file.filename == '':
        return render_template('index.html', error='No file selected')

    # Accept ".DOCX" etc. as well; the extension check is case-insensitive.
    if not file.filename.lower().endswith('.docx'):
        error_message = 'Please upload a .docx file'
        return render_template('index.html', plots=plots, error=error_message)

    ensure_upload_folder()
    # The client-supplied filename is untrusted (it may contain path
    # components) and may collide between concurrent uploads, so the file is
    # stored under a generated name instead.
    file_path = os.path.join(app.config['UPLOAD_FOLDER'], f'{uuid.uuid4().hex}.docx')
    file.save(file_path)
    try:
        text = extract_text_from_docx(file_path)
        data = extract_data_using_gemini(text)
        print("Extracted data:", data)
        if data:
            plots = create_plots(data)
        else:
            error_message = 'Could not extract data from document. Please check the document format.'
    except Exception as e:
        # Top-level request boundary: report the failure on the page.
        error_message = f'Error processing document: {str(e)}'
        print(f"Full error: {str(e)}")
    finally:
        # Remove the upload on failure as well as on success.
        os.remove(file_path)

    return render_template('index.html', plots=plots, error=error_message)
if __name__ == '__main__':
    # Debug mode enables the interactive debugger, which must not be exposed
    # on a reachable server. It is therefore opt-in via FLASK_DEBUG instead
    # of always on.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
    app.run(debug=debug_enabled, port=5001)