keluhcerdas / helper.py
mahfud00's picture
Update helper.py
c8e5195 verified
import demoji
import re
import pandas as pd
import numpy as np
import nltk
import keras
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
from string import punctuation
from keybert import KeyBERT
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
import os
import warnings
warnings.filterwarnings("ignore")
# --- Configuration & Global Variables ---
# Sequence-length constant.
# NOTE(review): presumably the tokenizer padding/truncation length - confirm
# against the code that tokenizes.
MAX_LENGTH = 128
# Directories (relative to the working directory) holding the data files and
# the saved models.
base_path = os.path.join('data')
model_path = os.path.join('Model')
# --- Helper: Download NLTK data ---
# Download the stopwords corpus to NLTK's default data directory.
nltk.download('stopwords')
# Download the same corpus again into ./nltk_data and register that directory
# on NLTK's search path.
# NOTE(review): the corpus is fetched twice on every import; one of the two
# downloads looks redundant - confirm which target directory is writable in
# the deployment before removing either.
nltk.download('stopwords', download_dir='./nltk_data')
nltk.data.path.append('./nltk_data')
# --- Load Resources ---
# Slang dictionary: a header-less, latin-1 encoded CSV whose two columns are
# read as 'alay' (slang form) and 'normal' (replacement).
alay_dict = pd.read_csv(os.path.join(base_path, 'kamus_alay.csv'), names=['alay', 'normal'], encoding='latin-1')
# Plain dict view of the dictionary (slang form -> replacement) for per-word lookup.
alay_dict_map = dict(zip(alay_dict['alay'], alay_dict['normal']))
# Indonesian stop-word list from NLTK, as a set for fast membership tests.
stop_words = set(stopwords.words('indonesian'))
# IndoBERT tokenizer and TensorFlow encoder, both loaded from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-large-p1")
bert_model = TFBertModel.from_pretrained("indobenchmark/indobert-large-p1")
# Keras model loaded from the model directory.
# NOTE(review): none of the functions visible in this file reference
# lstm_model - confirm whether another module imports it.
lstm_model = keras.models.load_model(os.path.join(model_path, 'indobert_lstm_model.keras'))
# --- Preprocessing Functions ---
def process_text(text):
    """Normalize raw text before it is passed to the models.

    Steps, in order: convert to ``str`` and lower-case; turn literal
    backslash-n sequences into spaces; remove URLs; remove digits; replace
    punctuation with spaces; remove emoji; collapse whitespace; replace
    every word found in ``alay_dict_map`` with its mapped form.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        str: The cleaned text, words separated by single spaces.
    """
    text = str(text).lower()
    # Replace the two-character sequence backslash + "n" (not a real newline)
    # with a space. Runs of them become runs of spaces, which the final
    # whitespace split collapses. Done before URL removal so that such a
    # sequence cannot glue a URL to the following word.
    text = text.replace('\\n', ' ')
    # Remove links wherever they appear. The earlier pattern was anchored to
    # the start of a line and matched to the end of that line, so it erased
    # the text following a leading URL and ignored URLs in mid-sentence.
    text = re.sub(r'https?://\S+', ' ', text)
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(f"[{re.escape(punctuation)}]", " ", text)  # Remove punctuation
    text = demoji.replace(text, "")  # Remove emoji
    # split() with no argument drops extra spaces, tabs and newlines.
    words = text.split()
    # Substitute slang words; words without a dictionary entry are kept as-is.
    return ' '.join(alay_dict_map.get(word, word) for word in words)
# --- Emotion Prediction ---
def load_tflite_model(tflite_path="Model/indobert_lstm_model.tflite"):
    """Create a TFLite interpreter for a model file and allocate its tensors.

    Args:
        tflite_path: Path to the ``.tflite`` model file.

    Returns:
        The ``tf.lite.Interpreter`` instance, with tensors already allocated.
    """
    tflite_interpreter = tf.lite.Interpreter(model_path=tflite_path)
    tflite_interpreter.allocate_tensors()
    return tflite_interpreter
def predict_emotion(text, interpreter):
    """Predict the emotion label for a piece of text.

    The text is cleaned with ``process_text``, tokenized (padded/truncated to
    ``MAX_LENGTH`` tokens), encoded with ``bert_model``, and the resulting
    token embeddings are run through the given TFLite interpreter.

    Args:
        text: Raw input text.
        interpreter: A TFLite interpreter whose tensors are already
            allocated, e.g. the object returned by ``load_tflite_model``.

    Returns:
        str: ``'anger'``, ``'fear'`` or ``'sadness'``.
    """
    cleaned = process_text(text)
    # Use the module-wide sequence length rather than a repeated literal.
    encoded = tokenizer(
        cleaned,
        return_tensors="tf",
        padding='max_length',
        truncation=True,
        max_length=MAX_LENGTH,
    )
    # Take the embeddings of every token, not only the CLS token.
    embeddings = bert_model(**encoded).last_hidden_state
    # Cast to float32 for the interpreter input.
    # NOTE(review): the original comment gives the shape as (1, 128, 1024);
    # confirm it matches the TFLite model's input tensor.
    input_data = embeddings.numpy().astype(np.float32)

    input_index = interpreter.get_input_details()[0]['index']
    output_index = interpreter.get_output_details()[0]['index']
    interpreter.set_tensor(input_index, input_data)
    interpreter.invoke()
    scores = interpreter.get_tensor(output_index)

    # NOTE(review): the position of each name must match the class order the
    # model was trained with - confirm against the training label encoding.
    emotions = ['anger', 'fear', 'sadness']
    return emotions[np.argmax(scores, axis=1)[0]]
# --- Keyword Extraction & Ranking ---
# Load the keyword ranking table (reads the 'keyword' and 'rank' columns).
df_rank_keyword = pd.read_excel(os.path.join(base_path, 'Keyword_KeyBERT.xlsx'))
# Normalize the stored keywords with the same cleaning used on input text;
# rank_keywords() looks keywords up by exact string equality.
df_rank_keyword['keyword'] = df_rank_keyword['keyword'].apply(process_text)
# Invert the rank scale: the smallest 'rank' gets the largest 'new_rank'
# (max - rank + 1), and the largest 'rank' gets 1.
df_rank_keyword['new_rank'] = df_rank_keyword['rank'].max() - df_rank_keyword['rank'] + 1
def rank_keywords(row):
    """Average the ``new_rank`` of the keywords found in ``df_rank_keyword``.

    Args:
        row: Iterable of keyword strings.

    Returns:
        The mean ``new_rank`` over the keywords that have an exact match in
        the ranking table (the first matching row is used for each), or
        ``0`` when none of them match.
    """
    matched_ranks = []
    for candidate in row:
        hits = df_rank_keyword.loc[df_rank_keyword['keyword'] == candidate]
        if hits.empty:
            # Keywords missing from the table do not count toward the mean.
            continue
        matched_ranks.append(hits['new_rank'].values[0])
    if not matched_ranks:
        return 0
    return sum(matched_ranks) / len(matched_ranks)
# KeyBERT extractor shared by all keyword() calls; built on first use.
_keyword_model = None


def _get_keyword_model():
    """Return the shared KeyBERT extractor, building it on the first call."""
    global _keyword_model
    if _keyword_model is None:
        # NOTE(review): trust_remote_code=True allows code shipped with the
        # model repository to run when the model is loaded - confirm that
        # the repository is trusted.
        sentence_model = SentenceTransformer("denaya/indoSBERT-large", trust_remote_code=True)
        _keyword_model = KeyBERT(model=sentence_model)
    return _keyword_model


def keyword(text):
    """Extract the top keywords from ``text`` and compute their rank score.

    Args:
        text: Raw input string.

    Returns:
        tuple: ``(keywords, rank)`` - the keyword strings returned by
        KeyBERT (``top_n=5``) and the value of ``rank_keywords(keywords)``.
    """
    # Loading the sentence-embedding model is expensive, so it is done once
    # and reused instead of being repeated on every call.
    kw_model = _get_keyword_model()
    # Drop Indonesian stop words using the module-level set.
    # NOTE(review): filtering happens before process_text(), so it compares
    # raw, case-sensitive, whitespace-split tokens - confirm this order is
    # intended.
    kept_words = [w for w in text.split() if w not in stop_words]
    cleaned = process_text(' '.join(kept_words))
    extracted = kw_model.extract_keywords(cleaned, top_n=5)
    # extract_keywords yields (keyword, score) pairs; only the keyword is kept.
    keywords = [kw for kw, _ in extracted]
    return keywords, rank_keywords(keywords)