In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
from pathlib import Path

# Resolve the parent of the current working directory, switch into it, and add
# it to the import path.
# NOTE: the target is computed from the *current* cwd, so re-running this cell
# without restarting the kernel moves one more level up (and appends another
# sys.path entry) each time.
workding_dir = str(Path.cwd().parent)
os.chdir(workding_dir)
sys.path.append(workding_dir)
print("workding dir:", workding_dir)

from dotenv import find_dotenv, load_dotenv

# Prefer a ".env" file; fall back to ".env.example" when none is found
# (find_dotenv returns an empty string in that case).
found_dotenv = find_dotenv(".env") or find_dotenv(".env.example")
print(f"loading env vars from: {found_dotenv}")
load_dotenv(found_dotenv, override=True)

workding dir: /Users/inflaton/code/engd/papers/maritime/global-incidents
loading env vars from: /Users/inflaton/code/engd/papers/maritime/global-incidents/.env


True

In [2]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

import gensim
from gensim import corpora
from gensim import similarities
from gensim import models
from gensim.models import CoherenceModel

# from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import re
import os
import datetime

import warnings

warnings.filterwarnings("ignore")

from pprint import pprint
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [3]:
print(datetime.datetime.now())

2024-06-30 15:39:16.255404


# Import the data with full news content

In [4]:
df = pd.read_parquet("data/processed_data2.parquet")

In [5]:
df.head()

Unnamed: 0,id,Headline,Details,Severity,Category,Region,Datetime,Year,lat,lon,...,if_labeled,Month,Week,Headline_Details,url,title,content,cleaned_content,binary_content,word_count
0,1,Grasberg Mine- Grasberg mine workers extend st...,Media sources indicate that workers at the Gra...,Moderate,Mine Workers Strike,Indonesia,28/5/17 17:08,2017.0,-4.05608,137.11302,...,False,5.0,21.0,Grasberg Mine- Grasberg mine workers extend st...,https://news.google.com/rss/articles/CBMiZ2h0d...,Freeport Indonesia mine workers extend strike ...,Trucks are seen on a road in the Grasberg copp...,"[truck, be, see, on, road, in, grasberg, coppe...","[adkerson_jakarta_try, agreement_freeport_indo...",53
1,3,Shanghai port congestion impacts terminals in ...,The persisting port congestion at Shanghai’s Y...,Minor,Port Congestion,China,27/4/17 9:16,2017.0,29.52,121.3319,...,False,4.0,17.0,Shanghai port congestion impacts terminals in ...,https://news.google.com/rss/articles/CBMiVWh0d...,Typhoon Muifa to shut China ports for second t...,By Sam Whelan 13/09/2022\n\nAnother typhoon ha...,"[by, sam, whelan, typhoon, have, prompt, port,...","[additional_ripple_effect, avoid_path_typhoon,...",44
2,5,UPDATE - Indonesia: Police confirm two explosi...,"According to local police in Jakarta, two expl...",Extreme,"Bombing, Police Operations",Indonesia,24/5/17 16:20,2017.0,,,...,True,5.0,21.0,UPDATE - Indonesia: Police confirm two explosi...,https://news.google.com/rss/articles/CBMiZWh0d...,Jakarta Police Receive 2 More Reports on Coldp...,"TEMPO.CO, Jakarta - South Jakarta Metro Police...","[jakarta, south, jakarta, metro, police, recei...","[actress_accord, available_day_concert, click_...",24
3,6,UPDATE - Indonesia: Severe winds damage infras...,Severe winds have downed billboards and trees ...,Moderate,"Roadway Closure / Disruption, Flooding, Severe...",Indonesia,19/4/17 9:10,2017.0,-6.91264,107.657,...,True,4.0,16.0,UPDATE - Indonesia: Severe winds damage infras...,https://news.google.com/rss/articles/CBMiSWh0d...,Indonesia hit by some of strongest winds recorded,A man stands near damaged houses following a t...,"[man, stand, near, damage, house, follow, torn...","[bbc_indonesia, climatologist_government_resea...",28
4,14,2 miles E of Chesterfield - A tornado has touc...,Government sources are reporting a tornado has...,Minor,Tornado,United States,17/9/18 19:55,2018.0,37.51,-77.61,...,True,9.0,38.0,2 miles E of Chesterfield - A tornado has touc...,https://news.google.com/rss/articles/CBMigAFod...,UPDATE: Number of homes without power down to ...,"More than 90,000 homes and businesses across t...","[more, than, home, business, across, richmond,...","[advise_seek_alternate, affect_richmond, alter...",134


In [6]:
df.shape

(3681, 23)

# Vectorisation

NLP vectorization refers to the process of converting text data into numerical vectors that machine learning algorithms can understand and process. 

Bag-of-Words (BoW) is used here: it represents text as a collection of unique words along with their frequencies. Each word is assigned an index, and the vector contains the count of each word present in the document.

In [7]:
df_copy = df.copy()

In [8]:
# Choose only the extreme and severe cases for modelling.
# The filter and the index reset are chained so that `cleaned` is a fresh frame
# with a 0..n-1 index, instead of mutating a filtered slice in place.
cleaned = df_copy[df_copy["Severity"].isin(["Extreme", "Severe"])].reset_index(
    drop=True
)

In [9]:
# NOTE: despite the name, `headline` holds the `binary_content` column (one
# array of underscore-joined phrase tokens per article), not the headlines.
headline = cleaned.binary_content

In [10]:
headline[5]

array(['number_container', 'accord_detective_llamas', 'anyone_talk_crime',
       'arizmendi_girlfriend_become', 'auto_theft_robbery',
       'clothing_makeup_shoe', 'clue_loot', 'decode_container_stack',
       'detective_chavez', 'detective_put', 'electric_bicycle',
       'empire_farther_south', 'freight_train_repeat',
       'google_placard_lock', 'hard_drive_tablet', 'homicide_drug_gang',
       'inside_container_secure', 'llama_straight_tell',
       'llama_work_connie', 'metal_lock_size', 'mile_east',
       'motel_room_storage', 'plentiful_tv_beer',
       'southern_california_couple', 'succumb_bolt_cutter', 'sure_sign',
       'upgraded_lock'], dtype=object)

In [11]:
# Build the token dictionary from the tokenised documents, then encode every
# document as a bag-of-words vector.
doc_dict = gensim.corpora.Dictionary(headline)
docs_vecs = list(map(doc_dict.doc2bow, headline))

In [12]:
print("Number of unique tokens: %d" % len(doc_dict))
print("Number of articles: %d" % len(docs_vecs))

Number of unique tokens: 30464
Number of articles: 300


In [13]:
# Map every token to its collection frequency (total occurrences in the corpus)
word_frequencies = {}
for token_id, count in doc_dict.cfs.items():
    word_frequencies[doc_dict[token_id]] = count

# Most frequent tokens first
sorted_words = sorted(
    word_frequencies.items(), key=lambda pair: pair[1], reverse=True
)

pprint(sorted_words[:100])

[('heavy_rain', 15),
 ('global_supply_chain', 15),
 ('national_hurricane_center', 13),
 ('heavy_rainfall', 12),
 ('port_los', 12),
 ('hong_kong', 12),
 ('united_state', 11),
 ('critical_destination_port', 11),
 ('global_port_tracker', 11),
 ('global_shipping_disruption', 11),
 ('sign_confidence_consumer', 11),
 ('upgrade_import_forecast', 11),
 ('national_weather_service', 10),
 ('social_medium', 10),
 ('moment_exception_request', 10),
 ('sorry_site', 10),
 ('technical_difficulty_please', 10),
 ('trade_statement', 9),
 ('tropical_storm', 9),
 ('help_business', 9),
 ('meet_firm', 9),
 ('website_see_service', 9),
 ('supply_chain', 8),
 ('strong_wind', 8),
 ('coastal_area', 7),
 ('geological_survey', 7),
 ('asian_country', 7),
 ('day_trade_asia', 7),
 ('global_demand', 7),
 ('global_economy', 7),
 ('high_yard_density', 7),
 ('inch_rain', 7),
 ('coast_port', 7),
 ('empty_container', 7),
 ('union_worker', 7),
 ('many_area', 6),
 ('customer_demand', 6),
 ('economic_growth', 6),
 ('free_day',

# LDA Modelling

We initially selected a fixed topic number for model pipeline development and benchmark model setup. Then we used the full dataset for fine-tuning and evaluation.

## Benchmark Model

In [14]:
# Build LDA benchmark model
# A fixed 4-topic model with a fixed seed; it serves as the benchmark for the
# hyper-parameter tuning further down.
lda_model = gensim.models.LdaMulticore(
    corpus=docs_vecs,  # bag-of-words vectors built from `headline`
    id2word=doc_dict,  # id -> token mapping used to label topic keywords
    num_topics=4,
    random_state=42,  # fixed seed
    chunksize=100,
    passes=10,
    per_word_topics=True,
)

In [15]:
from pprint import pprint

# Print the top keywords of each topic in the benchmark model (4 topics)
pprint(lda_model.print_topics())
# Apply the benchmark model to the whole corpus
doc_lda = lda_model[docs_vecs]

[(0,
  '0.001*"technical_difficulty_please" + 0.001*"moment_exception_request" + '
  '0.001*"sorry_site" + 0.001*"heavy_rain" + 0.000*"heavy_rainfall" + '
  '0.000*"port_los" + 0.000*"national_weather_service" + 0.000*"coast_port" + '
  '0.000*"united_state" + 0.000*"empty_container"'),
 (1,
  '0.001*"upgrade_import_forecast" + 0.001*"sign_confidence_consumer" + '
  '0.001*"global_shipping_disruption" + 0.001*"global_port_tracker" + '
  '0.001*"meet_firm" + 0.001*"website_see_service" + 0.001*"help_business" + '
  '0.000*"national_hurricane_center" + 0.000*"passenger_service" + '
  '0.000*"hong_kong"'),
 (2,
  '0.001*"global_supply_chain" + 0.000*"negative_impact" + '
  '0.000*"critical_destination_port" + 0.000*"trade_statement" + '
  '0.000*"warm_winter_china" + 0.000*"import_volume" + 0.000*"global_demand" + '
  '0.000*"day_trade_asia" + 0.000*"slow_react" + 0.000*"full_network"'),
 (3,
  '0.000*"critical_destination_port" + 0.000*"social_medium" + '
  '0.000*"meteorological_agency"

In [17]:
%%time

# Compute Benchmark Coherence Score
# c_v coherence of the benchmark topics, evaluated against the tokenised
# documents in `headline` with the same dictionary used for training.
coherence_model_lda = CoherenceModel(
    model=lda_model, texts=headline, dictionary=doc_dict, coherence="c_v"
)
coherence_lda = coherence_model_lda.get_coherence()
print("\nCoherence Score LDAModel: ", coherence_lda)


Coherence Score LDAModel:  0.37181231277776183
CPU times: user 216 ms, sys: 147 ms, total: 364 ms
Wall time: 8min 58s


In [18]:
# Compute Benchmark Perplexity
perplex = lda_model.log_perplexity(docs_vecs, total_docs=None)  # For LDAModel
# NOTE: gensim's log_perplexity returns the per-word likelihood bound, not the
# perplexity itself (perplexity = 2 ** -bound). A value closer to zero therefore
# means a lower perplexity, i.e. a better fit on the evaluated documents.

print("\nPerplexity for LDAModel: ", perplex)


Perplexity for LDAModel:  -10.57378514568444


In [19]:
from pprint import pprint
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Build the pyLDAvis visualisation for the benchmark model:
# feed the LDA model, the bag-of-words corpus and the dictionary into pyLDAvis
pyLDAvis.enable_notebook()
visual = gensimvis.prepare(lda_model, docs_vecs, doc_dict)

# Save the output to the html file (path is relative to the working directory)
pyLDAvis.save_html(visual, "data/topic_viz_benchmark_severe.html")

  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()


In [20]:
pd.set_option("max_colwidth", 200)
# Get the topics and their top keywords into a dataframe
topics = lda_model.show_topics(num_words=6)

# show_topics yields (topic_id, keyword_string) pairs. Build the frame in one
# go, indexed by topic id, instead of enlarging an empty frame cell by cell.
topic_ids = [topic_id for topic_id, _ in topics]
topic_keywords = pd.DataFrame(
    {"Topic Keywords": [keywords for _, keywords in topics]},
    index=topic_ids,
)

topic_keywords["Topic ID"] = topic_keywords.index
topic_keywords

Unnamed: 0,Topic Keywords,Topic ID
0,"0.001*""technical_difficulty_please"" + 0.001*""moment_exception_request"" + 0.001*""sorry_site"" + 0.001*""heavy_rain"" + 0.000*""heavy_rainfall"" + 0.000*""port_los""",0
1,"0.001*""upgrade_import_forecast"" + 0.001*""sign_confidence_consumer"" + 0.001*""global_shipping_disruption"" + 0.001*""global_port_tracker"" + 0.001*""meet_firm"" + 0.001*""website_see_service""",1
2,"0.001*""global_supply_chain"" + 0.000*""negative_impact"" + 0.000*""critical_destination_port"" + 0.000*""trade_statement"" + 0.000*""warm_winter_china"" + 0.000*""import_volume""",2
3,"0.000*""critical_destination_port"" + 0.000*""social_medium"" + 0.000*""meteorological_agency"" + 0.000*""hong_kong"" + 0.000*""many_area"" + 0.000*""trade_statement""",3


In [21]:
# break

# Hyperparameter Tuning and Evaluation

Run the cells below only for re-modelling with new datasets; the whole tuning and evaluation process may take hours to run.

In [22]:
# hyper-parameter tuning (alpha and beta)
def compute_coherence_values(corpus, dictionary, k, a, b, texts=None):
    """Train one LDA model and score it.

    Args:
        corpus: Bag-of-words corpus used both for training and for the
            perplexity bound.
        dictionary: gensim dictionary matching ``corpus``.
        k: Number of topics.
        a: Value passed as the model's ``alpha``.
        b: Value passed as the model's ``eta``.
        texts: Tokenised documents used for the c_v coherence. Defaults to the
            notebook-level ``headline`` series, which is what the original
            implementation always used.

    Returns:
        Tuple ``(coherence, perplex)``: the c_v coherence score and the value
        returned by ``log_perplexity`` on ``corpus``.
    """
    if texts is None:
        texts = headline

    model = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=42,
        chunksize=100,
        passes=10,
        alpha=a,
        eta=b,
    )

    # Score with the dictionary and corpus that were passed in. The previous
    # version silently used the globals `doc_dict` / `docs_vecs` here, so the
    # parameters were ignored for evaluation.
    coherence_model = CoherenceModel(
        model=model, texts=texts, dictionary=dictionary, coherence="c_v"
    )
    coherence = coherence_model.get_coherence()
    perplex = model.log_perplexity(corpus, total_docs=None)

    return coherence, perplex

In [23]:
# setup
import numpy as np

from gensim.models import CoherenceModel

# Result accumulators: the grid-search loop appends one entry per trained model
model_list, coherence_values, perplexity_values = [], [], []
model_topics, alpha_result, beta_result = [], [], []

# Candidate topic counts: 4 to 12 inclusive
num_topics = range(4, 13)

# Alpha candidates: a numeric grid plus the two named presets
alpha = [*np.arange(0.31, 1, 0.3), "symmetric", "asymmetric"]

# Beta (eta) candidates: the same numeric grid plus the symmetric preset
beta = [*np.arange(0.31, 1, 0.3), "symmetric"]

Rationale behind the alpha and eta: https://stats.stackexchange.com/questions/37405/natural-interpretation-for-lda-hyperparameters

In [24]:
print("Topic range: ", num_topics)
print("Alpha: ", alpha)
print("Beta: ", beta)

Topic range:  range(4, 13)
Alpha:  [0.31, 0.61, 0.9099999999999999, 'symmetric', 'asymmetric']
Beta:  [0.31, 0.61, 0.9099999999999999, 'symmetric']


In [26]:
%%time

import datetime
import numpy as np
from gensim.models import CoherenceModel

# Timestamp the start of the grid search
print(datetime.datetime.now())

# Exhaustive grid search: one LDA model is trained and scored for every
# (alpha, beta, number-of-topics) combination. Results are appended to the
# module-level lists, so interrupting the loop leaves partial results in them
# and re-running the cell without resetting the lists appends duplicates.
for a in alpha:
    for b in beta:
        for num in num_topics:
            cv, pv = compute_coherence_values(
                corpus=docs_vecs, dictionary=doc_dict, k=num, a=a, b=b
            )

            # The five lists are kept index-aligned: entry i of each list
            # describes the same trained model.
            model_topics.append(num)
            coherence_values.append(cv)
            perplexity_values.append(pv)
            alpha_result.append(a)
            beta_result.append(b)
            # Progress line for the model that was just scored
            print(
                "#Topics: "
                + str(num)
                + ", CV Score: "
                + str(coherence_values[-1])
                + ", PV Score: "
                + str(perplexity_values[-1])
                + ", Alpha: "
                + str(alpha_result[-1])
                + ", Beta: "
                + str(beta_result[-1])
            )

# Timestamp the end of the grid search
print(datetime.datetime.now())

2024-06-30 15:56:56.953954


#Topics: 4, CV Score: 0.3720156705867761, PV Score: -10.531015192970104, Alpha: 0.31, Beta: 0.31
#Topics: 5, CV Score: 0.5104309491692648, PV Score: -10.501295581191243, Alpha: 0.31, Beta: 0.31
#Topics: 6, CV Score: 0.4577416605401658, PV Score: -10.460472706228693, Alpha: 0.31, Beta: 0.31
#Topics: 7, CV Score: 0.47905186758060786, PV Score: -10.45017220690116, Alpha: 0.31, Beta: 0.31
#Topics: 8, CV Score: 0.5170931021465908, PV Score: -10.435198097218018, Alpha: 0.31, Beta: 0.31
#Topics: 9, CV Score: 0.5418406648591022, PV Score: -10.41813577221336, Alpha: 0.31, Beta: 0.31
#Topics: 10, CV Score: 0.6289688892634311, PV Score: -10.40650838076674, Alpha: 0.31, Beta: 0.31
#Topics: 11, CV Score: 0.615861099169618, PV Score: -10.400571085548444, Alpha: 0.31, Beta: 0.31


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/multiprocessing/spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/multiprocessing/spawn.py", line 132, in _main
    self = reduction.pickle.load(from_parent)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
_pickle.UnpicklingError: pickle data was truncated


#Topics: 12, CV Score: 0.5821904373179804, PV Score: -10.405657307982493, Alpha: 0.31, Beta: 0.31
#Topics: 4, CV Score: 0.39197924000152073, PV Score: -10.431108130993747, Alpha: 0.31, Beta: 0.61
#Topics: 5, CV Score: 0.5158552240339984, PV Score: -10.415847323918724, Alpha: 0.31, Beta: 0.61
#Topics: 6, CV Score: 0.4398095546006567, PV Score: -10.395709424729047, Alpha: 0.31, Beta: 0.61
#Topics: 7, CV Score: 0.4759542844436549, PV Score: -10.390971943426882, Alpha: 0.31, Beta: 0.61
#Topics: 8, CV Score: 0.5228046057671669, PV Score: -10.390099173623508, Alpha: 0.31, Beta: 0.61
#Topics: 9, CV Score: 0.534380007483663, PV Score: -10.383173065174224, Alpha: 0.31, Beta: 0.61
#Topics: 10, CV Score: 0.601346262577239, PV Score: -10.38283548973593, Alpha: 0.31, Beta: 0.61
#Topics: 11, CV Score: 0.6182871521157967, PV Score: -10.381370037404881, Alpha: 0.31, Beta: 0.61
#Topics: 12, CV Score: 0.6301666636692548, PV Score: -10.388156880830003, Alpha: 0.31, Beta: 0.61
#Topics: 4, CV Score: 0.3906

KeyboardInterrupt: 

The table below shows the top 20 fine-tuned models with the best combinations of coherence score and perplexity score. It is sorted by the coherence score in descending order, as a higher coherence score indicates a better model, and by the perplexity score in ascending order, as a lower perplexity score indicates a better model. Note that the perplexity score recorded here is the value returned by gensim's `log_perplexity`, i.e. a per-word likelihood bound (perplexity = 2^(-bound)), so values closer to zero correspond to a lower perplexity. While the coherence score evaluates the quality of the topics, the perplexity score evaluates the overall performance of the model in predicting new documents. Usually, the coherence score is the better metric if the goal is to obtain topics that are semantically coherent and interpretable. The perplexity score, on the other hand, is the better metric if the goal is to build a model that generalises well to new data, in other words, how confident the model is in predicting new data (Sánchez-Aguayo, et al., 2022). Ultimately, we aim to strike a balance between the perplexity value and the coherence score when determining our final model.

In [None]:
# Find the top 20 combinations based on Coherence Score and Perplexity Score
result = pd.DataFrame(
    {
        "Topics": model_topics,
        "Coherence Score": coherence_values,
        "Perplexity Score": perplexity_values,
        "Alpha": alpha_result,
        "Beta": beta_result,
    }
)
# Higher coherence is better. "Perplexity Score" holds the output of gensim's
# log_perplexity, i.e. the per-word likelihood bound (perplexity = 2 ** -bound),
# so a higher (less negative) value means a lower perplexity. Both keys are
# therefore sorted in descending order to rank the best models first.
result.sort_values(
    by=["Coherence Score", "Perplexity Score"], ascending=[False, False]
).head(20)

: 

In [None]:
result.to_csv("data/lda_fine_tuning_result_severe.csv")

: 

In [None]:
# Show graph Topics vs Coherence Score
# One figure is drawn per distinct Alpha value; each figure contains the rows
# for every Beta value that was tried with that Alpha.
result.groupby("Alpha").plot(x="Topics", y="Coherence Score", legend=True)

: 

In [None]:
# Show graph Topics vs Coherence Score

plt.plot(model_topics, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence Score")
# The labels argument must be a sequence of strings: a bare string is iterated
# character by character, which would label the line "C".
plt.legend(["Coherence Score"], loc="best")
plt.show()

: 

In [None]:
# Show graph Topics vs Perplexity Score

plt.plot(model_topics, perplexity_values)
plt.xlabel("Num Topics")
plt.ylabel("Perplexity score")
# The labels argument must be a sequence of strings: a bare string is iterated
# character by character, which would label the line "p".
plt.legend(["perplexity_values"], loc="best")
plt.show()

: 

## Final Model

Models with 4, 8, 9 and 10 topics were selected for further evaluation using the visual graphs, considering that the best-scoring combination does not always yield the best result: a model with a higher number of topics tends to have better measured scores but may not fit the data best.

However, note that even though random_state was preset and all other parameters were fixed, some randomness remains, so the model may produce inconsistent output from run to run.

Unfortunately, altering the number of topics does not have much effect on the results, and the news articles are not clustered into relevant topics properly. Also, most topics are stacked together, indicating high similarity and ambiguity among them due to the multi-aspect nature of the news content. As a result, LDA may not be a suitable solution for this kind of news content. The same result holds for the moderate and minor severity levels.

In [None]:
# realised that there may be some overlaps for 8 topics, thus 4-6 topics are optimal
# NOTE(review): the remark above recommends 4-6 topics, yet k is set to 8 below;
# confirm which value is intended for the final model.
k = 8
a = "asymmetric"
# Alternative numeric priors, kept for reference:
# a = 0.91
# b = 0.61
b = "symmetric"


# Train the final model with the chosen topic count and priors; all other
# settings match the ones used in compute_coherence_values.
final_model = gensim.models.LdaMulticore(
    corpus=docs_vecs,
    id2word=doc_dict,
    num_topics=k,
    random_state=42,
    chunksize=100,
    passes=10,
    alpha=a,
    eta=b,
)

: 

In [None]:
compute_coherence_values(corpus=docs_vecs, dictionary=doc_dict, k=k, a=a, b=b)

: 

In [None]:
# Set up the environment to display the graphical outputs
# feed the LDA model into the pyLDAvis instance
# (this rebinds `visual`, which previously held the benchmark visualisation)
pyLDAvis.enable_notebook()
visual = gensimvis.prepare(final_model, docs_vecs, doc_dict)

# Save the output to the html file
pyLDAvis.save_html(visual, "data/topic_viz8_severe_training.html")

: 

In [None]:
final_model.print_topics(num_words=30)

: 

This allows easy access to the trained model for future prediction work.

In [None]:
# Save a model to disk, or reload a pre-trained model
# naming convention: final_model_topic_alpha_eta
final_model.save("final_model_8_asym_sym")

: 

Get the dominant topic and its percentage contribution for each document.
This makes use of gensim LDA's own function: https://radimrehurek.com/gensim/models/ldamodel.html

In [None]:
import warnings

warnings.filterwarnings("ignore")


def format_topics_sentences(ldamodel, corpus, data):
    """Summarise the dominant topic of every document in ``corpus``.

    Args:
        ldamodel: Topic model. ``ldamodel[corpus]`` must yield, per document,
            a list of ``(topic_id, probability)`` pairs, or a tuple whose
            first element is that list (what gensim returns for models
            trained with ``per_word_topics=True``).
        corpus: Bag-of-words documents; the result has one row per document.
        data: Texts to attach as the ``Text`` column, in corpus order.

    Returns:
        DataFrame with the columns ``Dominant_Topic``, ``Perc_Contribution``
        (rounded to 4 decimals), ``Topic_Distribution`` (all pairs, most
        probable first) and ``Text``. Documents without any topic assignment
        keep the defaults ``0``, ``0.0`` and ``()``.
    """
    # Preallocate one slot per document
    num_docs = len(corpus)
    sent_topics = {
        "Dominant_Topic": [0] * num_docs,
        "Perc_Contribution": [0.0] * num_docs,
        "Topic_Distribution": [()] * num_docs,
    }

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        # With per_word_topics=True each row is a tuple
        # (document_topics, word_topics, phi_values); keep the document topics.
        if isinstance(row, tuple):
            row = row[0]
        row = sorted(row, key=lambda x: x[1], reverse=True)
        if row:
            # Most probable topic comes first after sorting
            dominant_topic, perc_contribution = row[0]
            sent_topics["Dominant_Topic"][i] = int(dominant_topic)
            sent_topics["Perc_Contribution"][i] = round(perc_contribution, 4)
            sent_topics["Topic_Distribution"][i] = row

    # Create the DataFrame
    sent_topics_df = pd.DataFrame(sent_topics)

    # Attach the texts by position. A Series is re-indexed to 0..n-1 first,
    # otherwise pandas would align on the Series' own index and produce
    # NaN / shifted texts whenever that index is not the default one.
    if isinstance(data, pd.Series):
        data = data.reset_index(drop=True)
    sent_topics_df["Text"] = data

    return sent_topics_df

: 

In [None]:
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=final_model, corpus=docs_vecs, data=cleaned.Headline_Details
)

: 

In [None]:
# Turn the row index into a column and give all five columns their final names
dominant_topic_columns = [
    "Document_No",
    "Dominant_Topic",
    "Topic_Perc_Contrib",
    "Topic_Distribution",
    "Text",
]
df_dominant_topic = df_topic_sents_keywords.reset_index().set_axis(
    dominant_topic_columns, axis=1
)

# Preview the first rows
df_dominant_topic.head(10)

: 

# Result Analysis

In [None]:
df_dominant_topic["Dominant_Topic"].value_counts()

: 

In [None]:
import matplotlib.pyplot as plt

# Number of documents per dominant topic
topic_counts = df_dominant_topic["Dominant_Topic"].value_counts()

# Draw the bar chart on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(8, 6))
topic_counts.plot(kind="bar", color="skyblue", ax=ax)

# Write each count just above its bar
for position, count in enumerate(topic_counts):
    ax.text(position, count, str(count), ha="center", va="bottom")

# Axis labels and title
ax.set_xlabel("Topics")
ax.set_ylabel("Number of News")
ax.set_title("Topic Distribution")

# Rotate x-axis labels for better readability, then render
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

: 

In [None]:
df_dominant_topic.sort_values(by="Topic_Perc_Contrib", ascending=False).head(20)

: 

In [None]:
# Sample up to 100 rows; change random_state for a different sample.
# The size is capped at the frame length because sampling without replacement
# raises when asked for more rows than exist.
sample_size = min(100, len(df_dominant_topic))
sampled_data = df_dominant_topic.sample(n=sample_size, random_state=42)
# `sample` already returns a DataFrame, so no re-wrapping is needed; the
# original row labels are kept as an "index" column.
sampled_df = sampled_data.reset_index()
sampled_df.to_csv("data/sample_severe.csv")

: 

: 