# Helper functions

import ast
from typing import Dict, Optional, List
import pandas as pd

# Ordered (category, keywords) rules: the first rule with any keyword found as a
# substring of the lower-cased description wins, so the order is significant.
# NOTE(review): because matching is by substring and 'University Account' and
# 'Student' are tested first, 'marketing dept' is only reachable through the
# first rule and 'future student' can never select 'Potential Student' (it
# already contains 'student'). The order is kept as-is to preserve existing
# results; confirm the intended precedence before changing it.
_DESCRIPTION_CATEGORY_RULES = (
	('University Account', ('university', 'college', 'admissions', 'communications', 'marketing dept')),
	('Student', ('student', 'studying', 'undergraduate', 'postgraduate')),
	('Potential Student', ('applicant', 'prospective', 'future student', 'candidate')),
	('Marketing', ('marketing', 'digital', 'brand', 'social media')),
)


def categorize_description(desc: str) -> str:
	"""Classify a profile description into a coarse audience category.

	Args:
		desc: Free-text description. Anything that is not a non-blank string
			(e.g. ``None`` or NaN) is treated as having no description.

	Returns:
		One of 'University Account', 'Student', 'Potential Student',
		'Marketing' or 'Other'. 'Other' is returned for missing/blank input
		and when no keyword matches.
	"""
	if not isinstance(desc, str) or not desc.strip():
		return 'Other'
	lowered = desc.lower()
	for category, keywords in _DESCRIPTION_CATEGORY_RULES:
		if any(keyword in lowered for keyword in keywords):
			return category
	return 'Other'

def _parse_verified_flag(value):
	"""Interpret one cell of the 'verified' column.

	Booleans are returned unchanged. Strings are parsed as Python literals
	(so 'True' / 'False' become booleans); strings that are not valid
	literals, and all other value types, yield False.
	"""
	if isinstance(value, bool):
		return value
	if isinstance(value, str):
		try:
			# NOTE(review): literal_eval returns whatever literal the text
			# spells, so a string such as '1' or 'None' produces a non-bool
			# value here — confirm the column only carries 'True'/'False'.
			return ast.literal_eval(value)
		except Exception:
			# Deliberate best-effort: unparseable text counts as unverified.
			return False
	return False


def _extract_country(location: Optional[str]) -> str:
	"""Return the text after the last comma of a location, or 'Unknown'.

	A location without a comma is returned whole (stripped). Missing,
	non-string or blank locations map to 'Unknown'.
	"""
	if not isinstance(location, str) or not location.strip():
		return 'Unknown'
	return location.split(',')[-1].strip()


def derive_author_metrics(df: pd.DataFrame) -> pd.DataFrame:
	"""Add author-level derived columns to a copy of ``df``.

	Reads the 'verified', 'profile_image_url', 'location' and 'description'
	columns and adds:

	* 'is_verified' - parsed form of 'verified'.
	* 'has_profile_image' - True when 'profile_image_url' is a non-blank string.
	* 'country' - last comma-separated part of 'location', or 'Unknown'.
	* 'description_category' - result of ``categorize_description``.

	Args:
		df: Input frame; it is not modified.

	Returns:
		A new DataFrame with the four columns above added.

	Raises:
		KeyError: If one of the required input columns is missing.
	"""
	df = df.copy()
	df['is_verified'] = df['verified'].apply(_parse_verified_flag)
	df['has_profile_image'] = df['profile_image_url'].apply(
		lambda url: isinstance(url, str) and url.strip() != ''
	)
	df['country'] = df['location'].apply(_extract_country)
	df['description_category'] = df['description'].apply(categorize_description)
	return df

def derive_tweet_metrics(df: pd.DataFrame) -> pd.DataFrame:
	"""Add tweet-level flag and date columns to a copy of ``df``.

	Reads the 'attachments', 'referenced_tweets', 'geo', 'possibly_sensitive',
	'in_reply_to_user_id' and 'tweet_created_at' columns and adds:

	* 'has_attachments', 'has_referenced_tweet', 'has_geo', 'is_reply' -
	  True where the corresponding source cell is not missing.
	* 'is_sensitive' - 'possibly_sensitive' with missing values set to False,
	  cast to bool.
	* 'tweet_date' - the calendar date of 'tweet_created_at'.

	'tweet_created_at' itself is converted to datetime when it is not already
	a datetime column; values that cannot be parsed become NaT.

	Args:
		df: Input frame; it is not modified.

	Returns:
		A new DataFrame with the derived columns added.

	Raises:
		KeyError: If one of the required input columns is missing.
	"""
	df = df.copy()
	# Series.notna() tests each cell as a whole. The scalar pd.isna() returns
	# an array for list-like cells, which makes `not pd.isna(cell)` raise for
	# lists holding more than one element.
	df['has_attachments'] = df['attachments'].notna()
	df['has_referenced_tweet'] = df['referenced_tweets'].notna()
	df['has_geo'] = df['geo'].notna()
	df['is_sensitive'] = df['possibly_sensitive'].fillna(False).astype(bool)
	df['is_reply'] = df['in_reply_to_user_id'].notna()
	# Covers naive and tz-aware datetime columns of any resolution, rather
	# than comparing against two specific dtype strings.
	if not pd.api.types.is_datetime64_any_dtype(df['tweet_created_at']):
		df['tweet_created_at'] = pd.to_datetime(df['tweet_created_at'], errors='coerce')
	df['tweet_date'] = df['tweet_created_at'].dt.date
	return df

def _kpi_value_counts(series: pd.Series, key_name: str, count_name: str = 'tweet_count',
					  dropna: bool = True) -> pd.DataFrame:
	"""Return ``series.value_counts()`` as a two-column frame.

	Columns are assigned by position instead of being renamed, because the
	labels produced by ``value_counts().reset_index()`` differ between pandas
	versions ('index'/<name> before 2.0, <name>/'count' from 2.0 on).
	"""
	table = series.value_counts(dropna=dropna).reset_index()
	table.columns = [key_name, count_name]
	return table


def _kpi_unique_authors_by_country(df: pd.DataFrame, columns: str) -> pd.DataFrame:
	"""Pivot distinct 'author_id' counts by 'country' (rows) and ``columns``."""
	return df.pivot_table(
		index='country',
		columns=columns,
		values='author_id',
		aggfunc='nunique',
		fill_value=0
	)


def compute_kpi_tables(df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
	"""Build the KPI summary tables from an enriched frame.

	``df`` must already carry the derived columns used below ('country',
	'is_verified', 'has_profile_image', 'description_category',
	'has_referenced_tweet', 'has_geo', 'has_attachments', 'is_sensitive',
	'is_reply', 'tweet_date') together with 'author_id', 'lang' and the six
	public-metric count columns. 'reply_settings' is optional.

	Args:
		df: Input frame; it is not modified.

	Returns:
		Dict mapping table name to DataFrame:

		* 'verified_by_country', 'profile_image_by_country',
		  'description_category_by_country' - distinct authors per country,
		  one column per value of the respective flag/category.
		* 'public_metrics_summary' - ``describe()`` of the public metrics,
		  one row per metric.
		* 'authors_by_country' - row counts per country
		  (columns 'country', 'author_count').
		* '*_counts' tables - value counts with a key column and 'tweet_count'.
		* 'daily_tweet_counts' - counts per 'tweet_date', sorted by date
		  (columns 'date', 'tweet_count').

	Raises:
		KeyError: If a required column is missing.
	"""
	kpis: Dict[str, pd.DataFrame] = {}

	kpis['verified_by_country'] = _kpi_unique_authors_by_country(df, 'is_verified').rename(
		columns={False: 'unverified_users', True: 'verified_users'}
	)

	public_metric_cols = ['followers_count', 'following_count', 'like_count',
						  'listed_count', 'media_count', 'tweet_count']
	kpis['public_metrics_summary'] = df[public_metric_cols].describe().transpose()

	kpis['profile_image_by_country'] = _kpi_unique_authors_by_country(df, 'has_profile_image').rename(
		columns={False: 'no_image', True: 'has_image'}
	)

	# NOTE(review): this counts rows per country, not distinct author_id
	# values — confirm that matches the intended meaning of 'author_count'.
	kpis['authors_by_country'] = _kpi_value_counts(df['country'], 'country', 'author_count')

	kpis['description_category_by_country'] = _kpi_unique_authors_by_country(df, 'description_category')

	kpis['language_counts'] = _kpi_value_counts(df['lang'], 'language')
	if 'reply_settings' in df.columns:
		# Missing reply settings are kept as their own row.
		kpis['reply_settings_counts'] = _kpi_value_counts(
			df['reply_settings'], 'reply_setting', dropna=False
		)
	kpis['referenced_tweets_counts'] = _kpi_value_counts(df['has_referenced_tweet'], 'has_referenced')
	kpis['geo_present_counts'] = _kpi_value_counts(df['has_geo'], 'has_geo')
	kpis['attachments_counts'] = _kpi_value_counts(df['has_attachments'], 'has_attachments')
	kpis['sensitive_counts'] = _kpi_value_counts(df['is_sensitive'], 'is_sensitive')
	kpis['reply_counts'] = _kpi_value_counts(df['is_reply'], 'is_reply')

	kpis['daily_tweet_counts'] = _kpi_value_counts(df['tweet_date'], 'date').sort_values(by='date')
	return kpis

def _follower_bucket(count: float) -> str:
	"""Map a follower count to its size-bucket label.

	Missing counts are reported as 'Unknown'; without this check a NaN fails
	every ``<`` comparison and would be labelled 'Very Large (>100k)'.
	"""
	if pd.isna(count):
		return 'Unknown'
	if count < 1_000:
		return 'Small (<1k)'
	if count < 10_000:
		return 'Medium (1k–10k)'
	if count < 100_000:
		return 'Large (10k–100k)'
	return 'Very Large (>100k)'


def _segment_sizes(df: pd.DataFrame, by) -> pd.DataFrame:
	"""Count rows per group of ``by``, largest group first ('tweet_count')."""
	return (
		df.groupby(by)
		.size()
		.reset_index(name='tweet_count')
		.sort_values('tweet_count', ascending=False)
	)


def compute_segments(df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
	"""Build row-count tables for each segmentation of the data.

	Every table has the grouping column(s) plus 'tweet_count' and is sorted
	by 'tweet_count' in descending order.

	``df`` must carry 'country', 'followers_count', 'is_verified',
	'description_category', 'lang', 'has_geo', 'has_attachments',
	'is_sensitive' and 'is_reply'. 'reply_settings' is optional; when present
	its missing values are grouped under 'Unknown' and the key column is
	named 'reply_setting'.

	Args:
		df: Input frame; it is not modified (the follower bucket column is
			added to an internal copy).

	Returns:
		Dict mapping segment name to its count table.

	Raises:
		KeyError: If a required column is missing.
	"""
	segments: Dict[str, pd.DataFrame] = {}
	segments['location_segment'] = _segment_sizes(df, 'country')

	# Work on a copy so the caller's frame does not gain 'follower_segment'.
	df = df.copy()
	df['follower_segment'] = df['followers_count'].apply(_follower_bucket)
	segments['follower_segment'] = _segment_sizes(df, 'follower_segment')

	segments['verification_location_segment'] = _segment_sizes(df, ['country', 'is_verified'])
	segments['description_segment'] = _segment_sizes(df, 'description_category')
	segments['language_segment'] = _segment_sizes(df, 'lang')
	if 'reply_settings' in df.columns:
		segments['reply_setting_segment'] = _segment_sizes(
			df, df['reply_settings'].fillna('Unknown')
		).rename(columns={'reply_settings': 'reply_setting'})
	segments['geo_segment'] = _segment_sizes(df, 'has_geo')
	segments['attachment_segment'] = _segment_sizes(df, 'has_attachments')
	segments['sensitive_segment'] = _segment_sizes(df, 'is_sensitive')
	segments['reply_segment'] = _segment_sizes(df, 'is_reply')
	return segments

# EDA, metrics, KPIs, time series, clustering