import os
import re
import warnings
from string import punctuation

import demoji
import keras
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from keybert import KeyBERT
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer, TFBertModel

warnings.filterwarnings("ignore")

# --- Configuration & Global Variables ---
MAX_LENGTH = 128
base_path = os.path.join('data')
model_path = os.path.join('Model')

# --- Helper: Download NLTK data ---
nltk.download('stopwords', download_dir='./nltk_data')
nltk.data.path.append('./nltk_data')

# --- Load Resources ---
# Slang ("alay") dictionary mapping informal Indonesian words to normal forms
alay_dict = pd.read_csv(os.path.join(base_path, 'kamus_alay.csv'),
                        names=['alay', 'normal'], encoding='latin-1')
alay_dict_map = dict(zip(alay_dict['alay'], alay_dict['normal']))
stop_words = set(stopwords.words('indonesian'))

tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-large-p1")
bert_model = TFBertModel.from_pretrained("indobenchmark/indobert-large-p1")
# Keras counterpart of the TFLite classifier used in predict_emotion()
lstm_model = keras.models.load_model(os.path.join(model_path, 'indobert_lstm_model.keras'))


# --- Preprocessing Functions ---
def process_text(text):
    """Normalize raw text: lowercase, strip numbers, links, punctuation, and
    emoji, collapse whitespace, and map slang words to their normal forms."""
    text = str(text)                 # Coerce non-string values to str
    text = text.lower()              # Lowercase text
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.replace('\\n', ' ')  # Replace literal "\n" escape sequences
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)  # Remove links
    text = re.sub(f"[{re.escape(punctuation)}]", " ", text)  # Remove punctuation
    text = demoji.replace(text, "")  # Remove emoji
    text = " ".join(text.split())    # Remove extra spaces, tabs, and newlines
    words = [alay_dict_map.get(word, word) for word in text.split()]
    return ' '.join(words)


# --- Emotion Prediction ---
def load_tflite_model(tflite_path="Model/indobert_lstm_model.tflite"):
    """Load the TFLite classifier and allocate its tensors."""
    interpreter = tf.lite.Interpreter(model_path=tflite_path)
    interpreter.allocate_tensors()
    return interpreter


def predict_emotion(text, interpreter):
    """Clean the text, embed it with IndoBERT, and classify the emotion
    with the TFLite LSTM model."""
    cleaned = process_text(text)
    tokens = tokenizer(cleaned, return_tensors="tf", padding='max_length',
                       truncation=True, max_length=MAX_LENGTH)

    # Use the full sequence of token embeddings, not just the [CLS] token
    outputs = bert_model(**tokens)
    embeddings = outputs.last_hidden_state              # shape (1, 128, 1024)
    input_data = embeddings.numpy().astype(np.float32)  # matches the TFLite input shape

    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    interpreter.set_tensor(input_details[0]['index'], input_data)
    interpreter.invoke()
    output = interpreter.get_tensor(output_details[0]['index'])

    label = np.argmax(output, axis=1)[0]
    emotions = ['anger', 'fear', 'sadness']
    return emotions[label]


# --- Keyword Extraction & Ranking ---
# Load the keyword ranking table and invert the ranks so that a higher
# new_rank means a more important keyword
df_rank_keyword = pd.read_excel(os.path.join(base_path, 'Keyword_KeyBERT.xlsx'))
df_rank_keyword['keyword'] = df_rank_keyword['keyword'].apply(process_text)
df_rank_keyword['new_rank'] = df_rank_keyword['rank'].max() - df_rank_keyword['rank'] + 1


def rank_keywords(row):
    """Average the inverted ranks of the keywords found in the ranking table."""
    total_ranking = 0
    total_keyword = 0
    for kw in row:
        frekuensi_rank = df_rank_keyword.loc[df_rank_keyword['keyword'] == kw]
        if not frekuensi_rank.empty:
            total_ranking += frekuensi_rank['new_rank'].values[0]
            total_keyword += 1
    if total_keyword > 0:
        return total_ranking / total_keyword
    return 0


def keyword(text):
    """Extract the top-5 keywords with KeyBERT and score them against the
    ranking table."""
    # Keyword model
    sentence_model = SentenceTransformer("denaya/indoSBERT-large", trust_remote_code=True)
    # Create the KeyBERT object
    kw_model = KeyBERT(model=sentence_model)

    # Remove Indonesian stopwords, then normalize the remaining text
    words = [w for w in text.split() if w not in stop_words]
    text = process_text(' '.join(words))

    keywords = kw_model.extract_keywords(text, top_n=5)
    extracted = [kw for kw, _ in keywords]  # keep the keyword strings, drop the scores
    rank = rank_keywords(extracted)
    return extracted, rank
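

# --- Usage Example ---
# A minimal smoke test, assuming the model files under Model/ and the data
# files under data/ are present; the sample sentence below is a hypothetical
# input, not taken from the project's dataset.
if __name__ == "__main__":
    interpreter = load_tflite_model()
    sample = "aku sangat takut menghadapi ujian besok"  # hypothetical input
    print("Emotion:", predict_emotion(sample, interpreter))
    keywords, score = keyword(sample)
    print("Keywords:", keywords, "| rank score:", score)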