In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
from pathlib import Path

workding_dir = str(Path.cwd().parent)
os.chdir(workding_dir)
sys.path.append(workding_dir)
print("workding dir:", workding_dir)

from dotenv import find_dotenv, load_dotenv

found_dotenv = find_dotenv(".env")

if len(found_dotenv) == 0:
    found_dotenv = find_dotenv(".env.example")
print(f"loading env vars from: {found_dotenv}")
load_dotenv(found_dotenv, override=True)

workding dir: /Users/inflaton/code/engd/papers/maritime/global-incidents
loading env vars from: /Users/inflaton/code/engd/papers/maritime/global-incidents/.env


True

## Import Statement

In [2]:
import pandas as pd

### read the data

In [3]:
df = pd.read_csv("data/all_port_labelled.csv")

In [4]:
df.head(2)

Unnamed: 0.2,Unnamed: 0,Index,Unnamed: 0.1,Headline,Details,Severity,Category,Region,Datetime,Year,...,IT,EP,NEW,CSD,RPE,MN,NM,if_labeled,Month,Week
0,0.0,8.0,34.0,Grasberg Mine- Grasberg mine workers extend st...,Media sources indicate that workers at the Gra...,Moderate,Mine Workers Strike,Indonesia,28/5/17 17:08,2017.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,False,5.0,21.0
1,1.0,10.0,63.0,Indonesia: Undersea internet cables damaged by...,News sources are stating that recent typhoons ...,Minor,Travel Warning,Indonesia,4/9/17 14:30,2017.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,False,4.0,14.0


### Clean empty data

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')


def clean_text(text):
    # Lowercase
    text = text.lower()
    # Tokenization
    tokens = word_tokenize(text)
    # Removing punctuation
    tokens = [word for word in tokens if word not in string.punctuation]
    # Removing stop words
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return " ".join(tokens)

In [6]:
import nltk

nltk.download("omw-1.4")

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/inflaton/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### The Details column has an issue

some of the data are of the type float and none of the text processing functions can be applied to it therefore we have to process it

In [7]:
text_df = df[["Details", "maritime_label"]]
text_df.info()
text_df["Details_cleaned"] = text_df["Details"].apply(
    lambda x: clean_text(x) if not isinstance(x, float) else None
)
# no_nan_df[no_nan_df["Details"].apply(lambda x: print(type(x)))]
# cleaned_df = text_df[text_df["Details"].apply(lambda x: clean_text(x))]
# cleaned_df = df['Details'][1:2]
# type(no_nan_df["Details"][0])
# print(clean_text(no_nan_df["Details"][0]))
text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Details         5781 non-null   object
 1   maritime_label  5781 non-null   object
dtypes: object(2)
memory usage: 90.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Details          5781 non-null   object
 1   maritime_label   5781 non-null   object
 2   Details_cleaned  5781 non-null   object
dtypes: object(3)
memory usage: 135.6+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_df['Details_cleaned'] = text_df['Details'].apply(lambda x: clean_text(x) if not isinstance(x, float) else None)


In [8]:
processed_data = text_df.dropna()
processed_data.head()

Unnamed: 0,Details,maritime_label,Details_cleaned
0,Media sources indicate that workers at the Gra...,False,medium source indicate worker grasberg mine ex...
1,News sources are stating that recent typhoons ...,False,news source stating recent typhoon impact hong...
2,The persisting port congestion at Shanghai’s Y...,True,persisting port congestion shanghai ’ yangshan...
3,Updated local media sources from Jakarta indic...,True,updated local medium source jakarta indicate e...
4,"According to local police in Jakarta, two expl...",True,according local police jakarta two explosion c...


## Naive Bayes Model

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [10]:
X = processed_data["Details_cleaned"]
y = processed_data["maritime_label"]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [12]:
# vectorizer = CountVectorizer()
# X_train_vec = vectorizer.fit_transform(X_train)
# X_test_vec = vectorizer.transform(X_test)

tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [13]:
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_tfidf, y_train)

In [14]:
predictions = naive_bayes.predict(X_test_tfidf)

In [15]:
accuracy = accuracy_score(y_test, predictions)
print("Accuracy of Naive Bayes model:", accuracy)
print(classification_report(y_test, predictions))

Accuracy of Naive Bayes model: 0.8582541054451167
              precision    recall  f1-score   support

       FALSE       0.88      0.94      0.91       847
        TRUE       0.79      0.65      0.71       310

    accuracy                           0.86      1157
   macro avg       0.83      0.79      0.81      1157
weighted avg       0.85      0.86      0.85      1157



## Logistic Regression model

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [18]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [19]:
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

In [20]:
y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of Logistic Regression Model:", accuracy)
print(classification_report(y_test, y_pred))

Accuracy of Logistic Regression Model: 0.9308556611927399
              precision    recall  f1-score   support

       FALSE       0.92      0.99      0.95       847
        TRUE       0.98      0.76      0.86       310

    accuracy                           0.93      1157
   macro avg       0.95      0.88      0.90      1157
weighted avg       0.93      0.93      0.93      1157



## Support Vector Machine (SVM) model

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [23]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [24]:
svm_model = SVC(kernel="linear")
svm_model.fit(X_train_tfidf, y_train)

In [25]:
y_pred = svm_model.predict(X_test_tfidf)

In [26]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of SVM model:", accuracy)
print(classification_report(y_test, y_pred))

Accuracy of SVM model: 0.9524632670700086
              precision    recall  f1-score   support

       FALSE       0.94      1.00      0.97       847
        TRUE       1.00      0.83      0.90       310

    accuracy                           0.95      1157
   macro avg       0.97      0.91      0.94      1157
weighted avg       0.96      0.95      0.95      1157



## Random Forest Model

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [28]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [29]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [30]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)

In [31]:
y_pred = rf_model.predict(X_test_tfidf)

In [32]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of Random Forest Model:", accuracy)
print(classification_report(y_test, y_pred))

Accuracy of Random Forest Model: 0.9628349178910977
              precision    recall  f1-score   support

       FALSE       0.96      1.00      0.98       847
        TRUE       0.99      0.87      0.93       310

    accuracy                           0.96      1157
   macro avg       0.97      0.93      0.95      1157
weighted avg       0.96      0.96      0.96      1157

