# Helper functions
import ast
from typing import Dict, Optional, List

import pandas as pd


# Ordered (category, keywords) rules: the first rule with a keyword contained
# in the lower-cased description wins.
# NOTE(review): 'future student' can never fire, because any text containing
# it also contains 'student' and is classified as 'Student' first. The order
# is kept as-is to preserve existing results -- confirm the intended priority.
_DESCRIPTION_RULES = (
    ('University Account',
     ('university', 'college', 'admissions', 'communications', 'marketing dept')),
    ('Student',
     ('student', 'studying', 'undergraduate', 'postgraduate')),
    ('Potential Student',
     ('applicant', 'prospective', 'future student', 'candidate')),
    ('Marketing',
     ('marketing', 'digital', 'brand', 'social media')),
)


def categorize_description(desc: str) -> str:
    """Map a free-text profile description to a coarse category.

    Args:
        desc: Profile description; non-string or blank values are accepted.

    Returns:
        One of 'University Account', 'Student', 'Potential Student',
        'Marketing' or 'Other'. Matching is a case-insensitive substring
        test, evaluated in the order of ``_DESCRIPTION_RULES``.
    """
    if not isinstance(desc, str) or desc.strip() == "":
        return 'Other'
    lowered = desc.lower()
    for category, keywords in _DESCRIPTION_RULES:
        if any(keyword in lowered for keyword in keywords):
            return category
    return 'Other'


def _parse_verified(value) -> bool:
    """Coerce a raw ``verified`` cell to a real ``bool``.

    Booleans pass through. Strings are parsed: 'true'/'false' in any case,
    otherwise as a Python literal whose truthiness is used. Anything else,
    including unparseable strings, is False.
    """
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        text = value.strip()
        lowered = text.lower()
        if lowered in ('true', 'false'):
            return lowered == 'true'
        try:
            # literal_eval may yield any literal (int, list, None, ...), so
            # the result is reduced to its truthiness.
            return bool(ast.literal_eval(text))
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return False
    return False


def _has_text(value) -> bool:
    """Return True when *value* is a string with non-whitespace content."""
    return isinstance(value, str) and value.strip() != ''


def _extract_country(location: Optional[str]) -> str:
    """Return the last non-empty comma-separated part of *location*.

    Non-string, blank, or comma-only values give 'Unknown'.
    """
    if not isinstance(location, str):
        return 'Unknown'
    parts = [part.strip() for part in location.split(',')]
    non_empty = [part for part in parts if part]
    return non_empty[-1] if non_empty else 'Unknown'


def derive_author_metrics(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with derived author-level columns.

    Reads the columns 'verified', 'profile_image_url', 'location' and
    'description'; adds 'is_verified', 'has_profile_image', 'country' and
    'description_category'. The input frame is not modified.
    """
    df = df.copy()
    df['is_verified'] = df['verified'].apply(_parse_verified)
    df['has_profile_image'] = df['profile_image_url'].apply(_has_text)
    df['country'] = df['location'].apply(_extract_country)
    df['description_category'] = df['description'].apply(categorize_description)
    return df


def derive_tweet_metrics(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with derived tweet-level flag columns.

    Reads 'attachments', 'referenced_tweets', 'geo', 'possibly_sensitive',
    'in_reply_to_user_id' and 'tweet_created_at'; adds 'has_attachments',
    'has_referenced_tweet', 'has_geo', 'is_sensitive', 'is_reply' and
    'tweet_date'. The input frame is not modified.
    """
    df = df.copy()
    # Series.notna() is used rather than a per-cell pd.isna() so that
    # list-like cells count as present instead of raising.
    df['has_attachments'] = df['attachments'].notna()
    df['has_referenced_tweet'] = df['referenced_tweets'].notna()
    df['has_geo'] = df['geo'].notna()
    df['is_sensitive'] = df['possibly_sensitive'].fillna(False).astype(bool)
    df['is_reply'] = df['in_reply_to_user_id'].notna()

    # NOTE(review): the original text from the dtype check on
    # 'tweet_created_at' to the end of this function was lost in the source.
    # The conversion and the 'tweet_date' column below are reconstructed from
    # what the KPI code reads -- confirm against the original file, which may
    # have derived further columns here.
    if not pd.api.types.is_datetime64_any_dtype(df['tweet_created_at']):
        df['tweet_created_at'] = pd.to_datetime(df['tweet_created_at'], utc=True)
    df['tweet_date'] = df['tweet_created_at'].dt.date
    return df


def _value_counts_frame(series: pd.Series, label: str,
                        count_label: str = 'tweet_count', *,
                        dropna: bool = True) -> pd.DataFrame:
    """Return value counts of *series* as a two-column frame.

    The columns are always ``[label, count_label]``. Naming the index and
    the values explicitly avoids depending on the default column names of
    ``value_counts().reset_index()``, which changed in pandas 2.0.
    """
    counts = series.value_counts(dropna=dropna)
    return counts.rename_axis(label).reset_index(name=count_label)


def _unique_authors_by_country(df: pd.DataFrame, columns: str) -> pd.DataFrame:
    """Pivot the number of distinct 'author_id' per country and *columns*."""
    return df.pivot_table(
        index='country',
        columns=columns,
        values='author_id',
        aggfunc='nunique',
        fill_value=0,
    )


# NOTE(review): the original `def` line of this function was lost in the
# source (only its return annotation survived). The name is reconstructed by
# analogy with compute_segments -- confirm against callers.
def compute_kpis(df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
    """Build the KPI tables from the enriched tweet/author frame.

    Expects the derived columns 'country', 'is_verified',
    'has_profile_image', 'description_category', 'has_referenced_tweet',
    'has_geo', 'has_attachments', 'is_sensitive', 'is_reply' and
    'tweet_date', plus 'author_id', 'lang' and the public metric columns.
    'reply_settings' is optional.

    Returns:
        Dict mapping KPI name to a DataFrame.
    """
    kpis: Dict[str, pd.DataFrame] = {}

    kpis['verified_by_country'] = _unique_authors_by_country(
        df, 'is_verified'
    ).rename(columns={False: 'unverified_users', True: 'verified_users'})

    public_metric_cols = ['followers_count', 'following_count', 'like_count',
                          'listed_count', 'media_count', 'tweet_count']
    kpis['public_metrics_summary'] = df[public_metric_cols].describe().transpose()

    kpis['profile_image_by_country'] = _unique_authors_by_country(
        df, 'has_profile_image'
    ).rename(columns={False: 'no_image', True: 'has_image'})

    kpis['authors_by_country'] = _value_counts_frame(
        df['country'], 'country', 'author_count')

    kpis['description_category_by_country'] = _unique_authors_by_country(
        df, 'description_category')

    kpis['language_counts'] = _value_counts_frame(df['lang'], 'language')

    if 'reply_settings' in df.columns:
        # Missing reply settings are counted as their own (NaN) row.
        kpis['reply_settings_counts'] = _value_counts_frame(
            df['reply_settings'], 'reply_setting', dropna=False)

    kpis['referenced_tweets_counts'] = _value_counts_frame(
        df['has_referenced_tweet'], 'has_referenced')
    kpis['geo_present_counts'] = _value_counts_frame(df['has_geo'], 'has_geo')
    kpis['attachments_counts'] = _value_counts_frame(
        df['has_attachments'], 'has_attachments')
    kpis['sensitive_counts'] = _value_counts_frame(
        df['is_sensitive'], 'is_sensitive')
    kpis['reply_counts'] = _value_counts_frame(df['is_reply'], 'is_reply')

    kpis['daily_tweet_counts'] = _value_counts_frame(
        df['tweet_date'], 'date').sort_values('date')

    return kpis


def _follower_bucket(followers: float) -> str:
    """Label a follower count with its size bucket.

    Missing counts give 'Unknown'; without this guard NaN fails every ``<``
    comparison and would be reported as 'Very Large (>100k)'.
    """
    if pd.isna(followers):
        return 'Unknown'
    if followers < 1_000:
        return 'Small (<1k)'
    if followers < 10_000:
        return 'Medium (1k–10k)'
    if followers < 100_000:
        return 'Large (10k–100k)'
    return 'Very Large (>100k)'


def _segment_counts(df: pd.DataFrame, by) -> pd.DataFrame:
    """Count rows per group of *by*, largest group first.

    The count column is named 'tweet_count'.
    """
    return (
        df.groupby(by)
        .size()
        .reset_index(name='tweet_count')
        .sort_values('tweet_count', ascending=False)
    )


def compute_segments(df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
    """Build per-segment row counts from the enriched tweet/author frame.

    Expects 'country', 'followers_count', 'is_verified',
    'description_category', 'lang', 'has_geo', 'has_attachments',
    'is_sensitive' and 'is_reply'; 'reply_settings' is optional. The input
    frame is not modified.

    Returns:
        Dict mapping segment name to a DataFrame of group keys plus a
        'tweet_count' column, sorted by count in descending order.
    """
    df = df.copy()
    df['follower_segment'] = df['followers_count'].apply(_follower_bucket)

    segments: Dict[str, pd.DataFrame] = {}
    segments['location_segment'] = _segment_counts(df, 'country')
    segments['follower_segment'] = _segment_counts(df, 'follower_segment')
    segments['verification_location_segment'] = _segment_counts(
        df, ['country', 'is_verified'])
    segments['description_segment'] = _segment_counts(df, 'description_category')
    segments['language_segment'] = _segment_counts(df, 'lang')

    if 'reply_settings' in df.columns:
        # Missing reply settings are grouped under an explicit 'Unknown' key.
        segments['reply_setting_segment'] = _segment_counts(
            df, df['reply_settings'].fillna('Unknown')
        ).rename(columns={'reply_settings': 'reply_setting'})

    segments['geo_segment'] = _segment_counts(df, 'has_geo')
    segments['attachment_segment'] = _segment_counts(df, 'has_attachments')
    segments['sensitive_segment'] = _segment_counts(df, 'is_sensitive')
    segments['reply_segment'] = _segment_counts(df, 'is_reply')
    return segments


# EDA, metrics, KPIs, time series, clustering