In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
from pathlib import Path

workding_dir = str(Path.cwd().parent)
os.chdir(workding_dir)
sys.path.append(workding_dir)
print("workding dir:", workding_dir)

from dotenv import find_dotenv, load_dotenv

found_dotenv = find_dotenv(".env")

if len(found_dotenv) == 0:
    found_dotenv = find_dotenv(".env.example")
print(f"loading env vars from: {found_dotenv}")
load_dotenv(found_dotenv, override=True)

workding dir: /Users/inflaton/code/engd/papers/maritime/global-incidents
loading env vars from: /Users/inflaton/code/engd/papers/maritime/global-incidents/.env


True

In [2]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

import gensim
from gensim import corpora
from gensim import similarities
from gensim import models
from gensim.models import CoherenceModel

# from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import re
import os
import datetime

import warnings

warnings.filterwarnings("ignore")

from pprint import pprint
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Import Data with only the news headline and details

In [6]:
df = pd.read_parquet("data/processed_data.parquet")

In [7]:
df.head()

Unnamed: 0,id,Headline,Details,Severity,Category,Region,Datetime,Year,lat,lon,maritime_label,found_ports,contains_port_info,if_labeled,Month,Week,Headline_Details,cleaned_Headline_Details,binary_Headline_Details,word_count
0,1.0,Grasberg Mine- Grasberg mine workers extend st...,Media sources indicate that workers at the Gra...,Moderate,Mine Workers Strike,Indonesia,28/5/17 17:08,2017.0,-4.05608,137.11302,False,['freeport'],1.0,False,5.0,21.0,Grasberg Mine- Grasberg mine workers extend st...,"[grasberg, grasberg, mine, worker, extend, str...",[worker_grasberg_mine],1
1,2.0,Indonesia: Undersea internet cables damaged by...,News sources are stating that recent typhoons ...,Minor,Travel Warning,Indonesia,4/9/17 14:30,2017.0,,,False,['hong kong'],1.0,False,4.0,14.0,Indonesia: Undersea internet cables damaged by...,"[indonesia, undersea, internet, cable, damage,...",[undersea_internet_cable],1
2,3.0,Shanghai port congestion impacts terminals in ...,The persisting port congestion at Shanghai’s Y...,Minor,Port Congestion,China,27/4/17 9:16,2017.0,29.52,121.3319,True,"['ningbo', 'qingdao', 'shanghai']",1.0,False,4.0,17.0,Shanghai port congestion impacts terminals in ...,"[shanghai, port, congestion, impact, terminal,...",[],0
3,4.0,UPDATE - Indonesia: Explosion at KP Terminal i...,Updated local media sources from Jakarta indic...,Extreme,"Bombing, Police Operations",Indonesia,24/5/17 15:15,2017.0,-6.22465,106.867,True,['jakarta'],1.0,False,5.0,21.0,UPDATE - Indonesia: Explosion at KP Terminal i...,"[update, indonesia, explosion, at, kp, termina...",[],0
4,5.0,UPDATE - Indonesia: Police confirm two explosi...,"According to local police in Jakarta, two expl...",Extreme,"Bombing, Police Operations",Indonesia,24/5/17 16:20,2017.0,,,True,['jakarta'],1.0,True,5.0,21.0,UPDATE - Indonesia: Police confirm two explosi...,"[update, indonesia, police, confirm, two, expl...",[],0


In [8]:
df.shape

(5778, 20)

In [26]:
df2 = pd.read_parquet("IS424_Data_Mining/code/LDA/processed_data.parquet")
# Check if two DataFrames are exactly the same
are_identical = df.equals(df2)
print(f"DataFrames are identical: {are_identical}")

DataFrames are identical: True


# Vectorisation

NLP vectorization refers to the process of converting text data into numerical vectors that machine learning algorithms can understand and process. 

Bag-of-Words (BoW) is used here that represents text as a collection of unique words along with their frequencies. Each word is assigned an index, and the vector contains the count of each word present in the document.

In [14]:
cleaned = df.copy()

In [15]:
headline = cleaned.binary_Headline_Details

In [16]:
headline[0]

array(['worker_grasberg_mine'], dtype=object)

In [17]:
# vectorise the words
doc_dict = gensim.corpora.Dictionary(headline)
docs_vecs = [doc_dict.doc2bow(doc) for doc in headline]

In [18]:
print("Number of unique tokens: %d" % len(doc_dict))
print("Number of articles: %d" % len(docs_vecs))

Number of unique tokens: 5319
Number of articles: 5778


In [19]:
# Calculate word frequencies
word_frequencies = {doc_dict[word_id]: freq for word_id, freq in doc_dict.cfs.items()}
sorted_words = sorted(word_frequencies.items(), key=lambda x: x[1], reverse=True)

pprint(sorted_words[:100])

[('due_dense_fog', 20),
 ('strong_wind', 19),
 ('indicate_average', 19),
 ('day_port', 17),
 ('san_antonio', 17),
 ('vessel_port', 16),
 ('low_visibility', 15),
 ('average_wait', 13),
 ('port_qingdao', 12),
 ('east_coast_parkway', 12),
 ('port_shanghai', 11),
 ('port_hong_kong', 11),
 ('severe_wind', 11),
 ('blank_week_service', 11),
 ('congestion_port_manila', 10),
 ('wait_hour', 10),
 ('day_situation', 9),
 ('port_ningbo', 9),
 ('coastal_area', 9),
 ('strong_wind_forecast', 9),
 ('vessel_arrival', 8),
 ('high_wind', 8),
 ('pomeranian_voivodeship', 8),
 ('wait_day', 8),
 ('previous_week', 8),
 ('vessel_wait', 8),
 ('average_wait_port', 8),
 ('affect_operation_port', 7),
 ('berth_manila_south', 7),
 ('day_port_saigon', 7),
 ('vessel_port_hong', 7),
 ('vessel_port_shanghai', 7),
 ('high_risk_port', 7),
 ('wait_hour_port', 7),
 ('qianwan_container', 7),
 ('day_port_shanghai', 6),
 ('international_container', 6),
 ('tropical_storm', 6),
 ('arrive_window', 6),
 ('moderate_risk_port', 6),
 

# LDA Modelling

## Benchmark Model

In [20]:
# Build LDA benchmark model
lda_model = gensim.models.LdaMulticore(
    corpus=docs_vecs,
    id2word=doc_dict,
    num_topics=4,
    random_state=42,
    chunksize=100,
    passes=10,
    per_word_topics=True,
)

In [21]:
from pprint import pprint

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[docs_vecs]

[(0,
  '0.003*"indicate_average" + 0.002*"coastal_area" + 0.002*"vessel_port_hong" '
  '+ 0.002*"east_coast_parkway" + 0.002*"port_charleston" + '
  '0.002*"average_wait_port" + 0.002*"severe_wind" + '
  '0.002*"pomeranian_voivodeship" + 0.002*"engine_failure" + '
  '0.002*"due_dense_fog"'),
 (1,
  '0.005*"san_antonio" + 0.003*"blank_week_service" + 0.003*"wait_hour" + '
  '0.003*"day_port_saigon" + 0.002*"low_visibility" + 0.002*"high_wind" + '
  '0.002*"waterside_landside_operation" + 0.002*"tropical_storm" + '
  '0.002*"qianwan_container" + 0.002*"port_ningbo"'),
 (2,
  '0.004*"strong_wind" + 0.002*"port_shanghai" + 0.002*"port_qingdao" + '
  '0.002*"vessel_port_qingdao" + 0.002*"day_port_shanghai" + '
  '0.002*"indicate_average" + 0.002*"disrupt_operation_port" + '
  '0.002*"strong_wind_area" + 0.002*"port_ho_chi" + '
  '0.001*"operation_pier_port"'),
 (3,
  '0.004*"port_hong_kong" + 0.003*"vessel_port" + 0.003*"day_port" + '
  '0.003*"congestion_port_manila" + 0.002*"berth_manila_

In [22]:
# Compute Benchmark Coherence Score
coherence_model_lda = CoherenceModel(
    model=lda_model, texts=headline, dictionary=doc_dict, coherence="c_v"
)
coherence_lda = coherence_model_lda.get_coherence()
print("\nCoherence Score LDAModel: ", coherence_lda)


Coherence Score LDAModel:  0.7011993291597081


In [23]:
# Compute Benchmark Perplexity
perplex = lda_model.log_perplexity(docs_vecs, total_docs=None)  # For LDAModel
# a measure of how good the model is. lower the better.

print("\nPerplexity for LDAModel: ", perplex)


Perplexity for LDAModel:  -9.594271136114548


In [24]:
from pprint import pprint
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# feed the LDA model into the pyLDAvis instance
pyLDAvis.enable_notebook()
visual = gensimvis.prepare(lda_model, docs_vecs, doc_dict)

# Save the output to the html file
pyLDAvis.save_html(visual, "topic_viz_benchmark.html")

  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()


In [25]:
pd.set_option("max_colwidth", 200)
# Get the topics and their top keywords into a dataframe
topics = lda_model.show_topics(num_words=6)

topic_keywords = pd.DataFrame()
for topic_id, topic in topics:
    topic_keywords.at[topic_id, "Topic Keywords"] = topic

topic_keywords["Topic ID"] = topic_keywords.index
# topic_keywords['Topic Name'] = topic_mapping
topic_keywords

Unnamed: 0,Topic Keywords,Topic ID
0,"0.003*""indicate_average"" + 0.002*""coastal_area"" + 0.002*""vessel_port_hong"" + 0.002*""east_coast_parkway"" + 0.002*""port_charleston"" + 0.002*""average_wait_port""",0
1,"0.005*""san_antonio"" + 0.003*""blank_week_service"" + 0.003*""wait_hour"" + 0.003*""day_port_saigon"" + 0.002*""low_visibility"" + 0.002*""high_wind""",1
2,"0.004*""strong_wind"" + 0.002*""port_shanghai"" + 0.002*""port_qingdao"" + 0.002*""vessel_port_qingdao"" + 0.002*""day_port_shanghai"" + 0.002*""indicate_average""",2
3,"0.004*""port_hong_kong"" + 0.003*""vessel_port"" + 0.003*""day_port"" + 0.003*""congestion_port_manila"" + 0.002*""berth_manila_south"" + 0.002*""arrive_window""",3


## Conclusion

No significant insights gained from the model result as we cannot identify any topics, maybe more data is needed, will deploy web scraping for full news content after mid term.