In [1]:
# !pip install contractions

In [2]:
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import contractions

import gensim
from gensim import corpora
from gensim import similarities
from gensim import models
from gensim.models import CoherenceModel

# from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import re
import os
import glob
import json

import psycopg2
import pickle
from datetime import datetime
import datetime

# Import Data

In [4]:
# Load the dataset from a CSV file in the working directory.
# No parse_dates/dtype arguments are passed, so column types are inferred by pandas.
df = pd.read_csv("cleaned_data.csv")

# EDA

In [None]:
df.head()

In [None]:
df.shape

In [None]:
# Work on a copy so the frame loaded from disk (`df`) stays untouched.
df_copy = df.copy()

In [None]:
df_copy.info()

In [None]:
df_copy.isnull().sum()

In [None]:
# Drop rows whose "Headline_Details" is missing; later cells call string methods
# on this column. inplace=True mutates df_copy directly.
df_copy.dropna(subset=["Headline_Details"], inplace=True)

In [None]:
# Report the smallest and largest values of the "Datetime" column.
# NOTE(review): the CSV is read without date parsing, so this may be a string
# comparison rather than a chronological one - confirm the column dtype.
published = df_copy["Datetime"]
print("Published Date Statistics:")
print("Min Date:", published.min())
print("Max Date:", published.max())

In [None]:
# The same story can be published several times by different publishers, so check
# whether any (Year, Headline_Details, Region) combination occurs more than once.
df_copy.duplicated(subset=["Year", "Headline_Details", "Region"]).any()

In [None]:
# Drop repeated stories, keeping the first occurrence of each
# (Year, Headline_Details, Region) combination.
duplicates = df_copy.duplicated(
    subset=["Year", "Headline_Details", "Region"], keep="first"
)
# .copy() makes df_uni an independent frame. Later cells add columns to it, and
# assigning into a boolean-mask selection of df_copy raises SettingWithCopyWarning.
df_uni = df_copy[~duplicates].copy()

In [None]:
df_uni.shape

# Text Preprocessing
contractions -> punctuation removal -> lowercase -> lemmatisation -> stop words removal + bigram

In [None]:
# Peek at one raw article.
# NOTE(review): [5] is a label lookup on the original index, not "the 6th row";
# it raises KeyError if the row labelled 5 was dropped above - .iloc[5] may be intended.
df_uni["Headline_Details"][5]

In [None]:
## Expand contractions token by token and rebuild each article as a single
## space-separated string so that tokenization can be done afterwards.
## Lowercasing and punctuation removal happen in lemmatize_words; stop words are
## removed in corpus2docs2.
# run time roughly 2 mins
df_uni["cleaned_Headline_Details"] = df_uni["Headline_Details"].apply(
    lambda text: " ".join(contractions.fix(token) for token in text.split())
)

In [None]:
df_uni["cleaned_Headline_Details"][5]

### Stemming / Lemmatization - To normalize text and prepare words.

https://towardsdatascience.com/stemming-vs-lemmatization-in-nlp-dea008600a0#:~:text=Stemming%20and%20Lemmatization%20are%20methods,be%20used%20in%20similar%20contexts.

Decided to use lemmatization because lemmatization provides better results by performing an analysis that depends on the word’s part-of-speech and producing real, dictionary words. As a result, lemmatization is harder to implement and slower compared to stemming.

To sum up, lemmatization is almost always a better choice from a qualitative point of view. With today’s computational resources, running lemmatization algorithms shouldn’t have a significant impact on the overall performance. However, if we are heavily optimizing for speed, a simpler stemming algorithm can be a possibility.

POS tagging is combined with lemmatization to improve lemmatization quality: the lemmatizer needs the correct POS tag to be accurate,
and with the default settings of `WordNetLemmatizer.lemmatize()` every word is treated as a noun.

https://github.com/nltk/nltk/blob/develop/nltk/stem/wordnet.py#L39 

In [None]:
# ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
# keep only ADJ, ADV, NOUN and VERB.

wnl = WordNetLemmatizer()


# Universal POS tags that are kept, mapped to the WordNet POS code passed to
# WordNetLemmatizer.lemmatize. Taking the first letter of the universal tag is not
# enough: "ADV" and "ADP" both start with "a" and "NUM" starts with "n".
_UNIVERSAL_TO_WORDNET = {"ADJ": "a", "ADV": "r", "NOUN": "n", "VERB": "v"}


def lemmatize_words(text):
    """Tokenize ``text`` and return the lemmas of its content words.

    The text is split into sentences and then word tokens. Tokens that are not
    purely ASCII letters are discarded and the rest are lowercased. Each
    sentence is POS-tagged with the universal tagset; only adjectives, adverbs,
    nouns and verbs are kept, and each is lemmatized with its matching WordNet
    POS code.

    Args:
        text: Raw document string.

    Returns:
        Flat list of lemma strings in document order.
    """
    lemmas = []
    for sentence in sent_tokenize(text):
        # Keep alphabetic tokens only (drops punctuation and numbers), lowercased.
        tokens = [
            token.lower()
            for token in word_tokenize(sentence)
            if re.search("^[a-zA-Z]+$", token)
        ]
        for word, tag in pos_tag(tokens, tagset="universal"):
            wordnet_pos = _UNIVERSAL_TO_WORDNET.get(tag)
            # Tags outside the map (ADP, NUM, DET, PRON, ...) are dropped.
            if wordnet_pos is not None:
                lemmas.append(wnl.lemmatize(word, wordnet_pos))
    return lemmas

In [None]:
print(datetime.datetime.now())

In [None]:
# Lemmatize every article. This overwrites the column of strings with lists of
# lemmas, so re-running this cell on its own would pass lists (not strings) to
# lemmatize_words.
df_uni["cleaned_Headline_Details"] = df_uni["cleaned_Headline_Details"].apply(
    lemmatize_words
)

In [None]:
print(datetime.datetime.now())

### N-gram + Stopword removal

In [None]:
# NLTK's English stop words followed by extra terms added by hand.
stop_list = [
    *nltk.corpus.stopwords.words("english"),
    "local",
    "time",
    "wednesday",
    "source",
    "certain",
    "report",
    "update",
]


def corpus2docs2(corpus, stop_words=None, tagger=None):
    """Turn tokenized documents into lists of underscore-joined phrases.

    For each document the stop words are removed, the remaining tokens are
    POS-tagged, and the tagged sequence is scanned left to right. At every
    token tagged ``JJ`` or ``NN`` the longest matching pattern is taken:

    * head + ``NN`` + ``NN`` -> three-word phrase
    * head + ``NN``          -> two-word phrase

    Matched tokens are consumed, so phrases never overlap. Tokens that do not
    start a phrase (including a lone ``JJ`` or ``NN``) are dropped.

    Args:
        corpus: Iterable of documents, each an iterable of token strings.
        stop_words: Tokens to discard before tagging. Defaults to the
            module-level ``stop_list``.
        tagger: Callable mapping a list of tokens to ``(token, tag)`` pairs.
            Defaults to ``nltk.pos_tag``.

    Returns:
        One list of phrase strings (for example ``"flood_warning"``) per
        document, in corpus order.
    """
    # Defaults are resolved at call time so existing one-argument calls behave
    # exactly as before and pick up the current notebook globals.
    if stop_words is None:
        stop_words = stop_list
    if tagger is None:
        tagger = nltk.pos_tag
    # Set membership is O(1); the stop words are probed once per token.
    excluded = frozenset(stop_words)

    docs = []
    for text in corpus:
        tagged = list(tagger([w for w in text if w not in excluded]))
        n_tokens = len(tagged)
        phrases = []
        i = 0
        while i < n_tokens:
            span = 1
            # JJ and NN heads follow the same rule: extend over up to two
            # directly following NN tokens.
            if tagged[i][1] in ("JJ", "NN"):
                if i + 1 < n_tokens and tagged[i + 1][1] == "NN":
                    span = 2
                    if i + 2 < n_tokens and tagged[i + 2][1] == "NN":
                        span = 3
            if span > 1:
                phrases.append("_".join(word for word, _ in tagged[i : i + span]))
            i += span
        docs.append(phrases)
    return docs

In [None]:
print(stop_list)

In [None]:
# Extract the phrases of every article into a new column (one list per row).
# Despite "binary" in the column name, corpus2docs2 emits two- and three-word phrases.
df_uni["binary_Headline_Details"] = corpus2docs2(df_uni["cleaned_Headline_Details"])

In [None]:
df_uni["binary_Headline_Details"][5]

In [None]:
# Bar chart of the (up to) 25 most frequent phrases in the document with index label 5.
# NOTE(review): the title says "Random", but the document is fixed; also
# zip(*fdist_doc) raises ValueError when that document has no phrases.
fdist_doc = nltk.FreqDist(df_uni["binary_Headline_Details"][5]).most_common(25)

# Split the (phrase, count) pairs into x labels and y heights.
x, y = zip(*fdist_doc)
plt.figure(figsize=(50, 30))
plt.margins(0.02)
plt.bar(x, y)
plt.xlabel("Words", fontsize=50)
plt.ylabel("Frequency of Words", fontsize=50)
plt.yticks(fontsize=40)
plt.xticks(rotation=60, fontsize=40)
plt.title("Frequency of 25 Most Common Words for One Random News", fontsize=60)
plt.show()

In [None]:
# Flatten every document's phrase list into one list, then count how often each
# phrase occurs across the whole dataset.
all_words = [word for sublist in df_uni["binary_Headline_Details"] for word in sublist]
fdist = FreqDist(all_words)

In [None]:
# Plot the frequency distribution of the most common phrases with FreqDist.plot.
# The count is defined once so the title cannot drift from what is plotted.
top_n_words = 30
plt.figure(figsize=(12, 6))
plt.title(f"Frequency of {top_n_words} Most Common Words of the Dataset", fontsize=12)
fdist.plot(top_n_words, cumulative=False)

# Wordcloud

In [None]:
# Distinct values of the "Severity" column; the word-cloud loops iterate over the
# first ten of them.
# NOTE(review): the name `com` suggests "company", but it holds severities.
com = df_uni["Severity"].unique()
com[:10]

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# One word cloud per Severity value (first ten values only), rendered with matplotlib.
# NOTE(review): the loop variable `company` holds a Severity value; check later
# cells for uses of this name before renaming it.
for company in com[:10]:
    in_group = df_uni["Severity"] == company
    group_docs = df_uni.loc[in_group, "binary_Headline_Details"]
    # Every row is a list of phrases: join within a row, then across rows.
    text = " ".join(" ".join(phrases) for phrases in group_docs)
    cloud = WordCloud(background_color="white").generate(text)
    plt.imshow(cloud, interpolation="bilinear")
    plt.title(f"Wordcloud for {company}")
    plt.axis("off")
    plt.margins(x=0, y=0)
    plt.show()

## TF-IDF Word Removal

Remove words that appear frequently but carry little meaning, such as say, will, year, use, etc.

In [None]:
# Build a gensim dictionary and TF-IDF model over the phrase lists.
# The column already holds one list of phrases per document, so it is used as is.
# Joining it into strings and splitting again would overwrite the column and make
# this cell fail on re-run (" ".join on a string spaces out its characters).
documents = df_uni["binary_Headline_Details"]
dictionary = corpora.Dictionary(documents)

tfidf = models.TfidfModel(dictionary=dictionary, normalize=True)
tfidf_corpus = [tfidf[dictionary.doc2bow(doc)] for doc in documents]
# tfidf.dfs maps term id -> number of documents containing the term, so despite
# its name this dict holds document frequencies, not TF-IDF scores.
term_frequencies = {
    dictionary[term_id]: doc_freq for term_id, doc_freq in tfidf.dfs.items()
}

In [None]:
# Same mapping as term_frequencies, re-inserted from the highest count to the
# lowest (dicts preserve insertion order), then displayed.
ranked_pairs = sorted(term_frequencies.items(), key=lambda pair: pair[1], reverse=True)
sorted_term_frequencies = {term: count for term, count in ranked_pairs}
sorted_term_frequencies

A threshold of 0.04 (±0.01) seems to be an appropriate cutoff for this dataset. Note: the cell below sets `threshold = 0.4`; confirm which value is intended.

In [None]:
# Minimum TF-IDF weight a term must reach within a document to be kept by
# filter_and_join. Customisable: the lower the threshold, the more words are retained.
# NOTE(review): the markdown cell above mentions 0.04 while this is 0.4 - confirm.
threshold = 0.4


def filter_and_join(tfidf_doc):
    """Return the terms of one TF-IDF document whose weight reaches ``threshold``.

    ``tfidf_doc`` is iterated as ``(term_id, weight)`` pairs. Ids are resolved
    to term strings through the module-level ``dictionary`` and the cutoff is
    the module-level ``threshold``. Despite the name nothing is joined: a list
    of term strings is returned, in the order the pairs are iterated.
    """
    kept_terms = []
    for term_id, weight in tfidf_doc:
        if weight >= threshold:
            kept_terms.append(dictionary[term_id])
    return kept_terms


# Replace each document's phrases with only those that pass the TF-IDF threshold.
# tfidf_corpus was built by iterating the documents in row order, and a plain list
# is assigned positionally, so the rows stay aligned.
df_uni["binary_Headline_Details"] = [filter_and_join(doc) for doc in tfidf_corpus]

In [None]:
# Bar chart of the (up to) 25 most frequent phrases left after TF-IDF filtering in
# the document with index label 0.
# NOTE(review): the title says "Random", but the document is fixed; [0] is a label
# lookup (KeyError if that row was dropped), and zip(*fdist_doc) raises ValueError
# when the document has no phrases left.
fdist_doc = nltk.FreqDist(df_uni["binary_Headline_Details"][0]).most_common(25)

# Split the (phrase, count) pairs into x labels and y heights.
x, y = zip(*fdist_doc)
plt.figure(figsize=(50, 30))
plt.margins(0.02)
plt.bar(x, y)
plt.xlabel("Words", fontsize=50)
plt.ylabel("Frequency of Words", fontsize=50)
plt.yticks(fontsize=40)
plt.xticks(rotation=60, fontsize=40)
plt.title("Frequency of 25 Most Common Words for One Random News", fontsize=60)
plt.show()

In [None]:
# Flatten the TF-IDF-filtered phrase lists into one list, then count how often
# each phrase occurs across the whole dataset.
all_words_filtered = [
    word for sublist in df_uni["binary_Headline_Details"] for word in sublist
]
fdist_filtered = FreqDist(all_words_filtered)

In [None]:
# Plot the frequency distribution of the most common phrases after TF-IDF
# filtering with FreqDist.plot; apparently, the dataset is much cleaner now.
# The count is defined once so the title cannot drift from what is plotted.
top_n_words = 30
plt.figure(figsize=(12, 6))
plt.title(f"Frequency of {top_n_words} Most Common Words of the Dataset", fontsize=12)
fdist_filtered.plot(top_n_words, cumulative=False)

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# One word cloud per Severity value (first ten values only), built from the
# TF-IDF-filtered phrases and rendered with matplotlib.
for severity in com[:10]:
    severity_docs = df_uni.loc[
        df_uni["Severity"] == severity, "binary_Headline_Details"
    ]
    # Every row is a list of phrases: join within a row, then across rows.
    text = " ".join(" ".join(item) for item in severity_docs)
    wordcloud = WordCloud(background_color="white").generate(text)
    plt.imshow(wordcloud, interpolation="bilinear")
    # Title uses the current loop value, not a variable left over from an earlier loop.
    plt.title(f"Wordcloud for {severity}")
    plt.axis("off")
    plt.margins(x=0, y=0)
    plt.show()

In [None]:
# Length of each row's phrase list, i.e. the number of phrases kept per document
# (a multi-word phrase counts once).
df_uni["word_count"] = df_uni["binary_Headline_Details"].apply(len)

In [None]:
df_uni[["word_count"]].describe().round()

In [None]:
# Number of rows with a non-null phrase list per Region, largest group first.
(
    df_uni.groupby("Region")[["binary_Headline_Details"]]
    .count()
    .sort_values(by="binary_Headline_Details", ascending=False)
)

In [None]:
# Number of rows with a non-null phrase list per Severity, largest group first.
(
    df_uni.groupby("Severity")[["binary_Headline_Details"]]
    .count()
    .sort_values(by="binary_Headline_Details", ascending=False)
)

# Save data to database for modelling

In [None]:
df_uni.head()

In [None]:
df_uni.columns.to_list()

In [None]:
# Export as a Parquet data file instead of CSV for easier list extraction
# (some columns hold Python lists). index=False leaves the row index out of the file.
df_uni.to_parquet("processed_data1.parquet", index=False)