import demoji
import re
import pandas as pd
import numpy as np
import nltk
import keras
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
from string import punctuation
from keybert import KeyBERT
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
import os
import warnings
warnings.filterwarnings("ignore")
# --- Configuration & Global Variables ---
MAX_LENGTH = 128
base_path = os.path.join('data')
model_path = os.path.join('Model')
# --- Helper: Download NLTK data ---
nltk.download('stopwords', download_dir='./nltk_data')
nltk.data.path.append('./nltk_data')
# --- Load Resources ---
alay_dict = pd.read_csv(os.path.join(base_path, 'kamus_alay.csv'), names=['alay', 'normal'], encoding='latin-1')
alay_dict_map = dict(zip(alay_dict['alay'], alay_dict['normal']))
stop_words = set(stopwords.words('indonesian'))
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-large-p1")
bert_model = TFBertModel.from_pretrained("indobenchmark/indobert-large-p1")
lstm_model = keras.models.load_model(os.path.join(model_path, 'indobert_lstm_model.keras'))
# --- Preprocessing Functions ---
def process_text(text):
    # Clean the text and normalise slang ('alay') words via alay_dict_map
    text = str(text)  # Ensure input is a string
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'(\\n)+', ' ', text)  # Remove literal "\n" escape sequences
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)  # Remove links
    text = re.sub(f"[{re.escape(punctuation)}]", " ", text)  # Remove punctuation
    text = demoji.replace(text, "")  # Remove emoji
    text = " ".join(text.split())  # Collapse extra spaces, tabs, and newlines
    words = [alay_dict_map.get(word, word) for word in text.split()]  # Normalise slang words
    return ' '.join(words)
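# Example (hypothetical input, assuming neither word appears in kamus_alay.csv):
#   process_text("Halo!!! 123 \\n dunia 😀")  ->  "halo dunia"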
# --- Emotion Prediction ---
def load_tflite_model(tflite_path=os.path.join(model_path, 'indobert_lstm_model.tflite')):
    interpreter = tf.lite.Interpreter(model_path=tflite_path)
    interpreter.allocate_tensors()
    return interpreter
def predict_emotion(text, interpreter):
    cleaned = process_text(text)
    tokens = tokenizer(cleaned, return_tensors="tf", padding='max_length', truncation=True, max_length=MAX_LENGTH)
    # Take the full sequence of token embeddings (not just the CLS token)
    outputs = bert_model(**tokens)
    embeddings = outputs.last_hidden_state  # shape (1, 128, 1024)
    input_data = embeddings.numpy().astype(np.float32)  # Matches the TFLite input shape
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    interpreter.set_tensor(input_details[0]['index'], input_data)
    interpreter.invoke()
    output = interpreter.get_tensor(output_details[0]['index'])
    label = np.argmax(output, axis=1)[0]
    emotions = ['anger', 'fear', 'sadness']
    return emotions[label]
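# Example usage (the sample text is hypothetical; the TFLite model file must exist):
#   interpreter = load_tflite_model()
#   predict_emotion("aku sangat takut dan cemas", interpreter)  # -> e.g. 'fear'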
# --- Keyword Extraction & Ranking ---
# Load the keyword ranking table
df_rank_keyword = pd.read_excel(os.path.join(base_path, 'Keyword_KeyBERT.xlsx'))
df_rank_keyword['keyword'] = df_rank_keyword['keyword'].apply(process_text)
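# Invert the ranks so that rank 1 (the most frequent keyword) gets the highest score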
df_rank_keyword['new_rank'] = df_rank_keyword['rank'].max() - df_rank_keyword['rank'] + 1
def rank_keywords(row):
    total_ranking = 0
    total_keyword = 0
    for keyword in row:
        frekuensi_rank = df_rank_keyword.loc[df_rank_keyword['keyword'] == keyword]
        if not frekuensi_rank.empty:
            total_ranking += frekuensi_rank['new_rank'].values[0]
            total_keyword += 1
    if total_keyword > 0:
        return total_ranking / total_keyword
    return 0
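# Example (hypothetical keywords): rank_keywords(['sedih', 'takut']) returns the
# mean new_rank of whichever keywords appear in Keyword_KeyBERT.xlsx, else 0.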
def keyword(text):
    # Keyword extraction model (reloaded on every call; cache globally if called often)
    sentence_model = SentenceTransformer("denaya/indoSBERT-large", trust_remote_code=True)
    # Create the KeyBERT object
    kw_model = KeyBERT(model=sentence_model)
    # Remove stopwords, then clean the remaining text
    words = [w for w in text.split() if w not in stop_words]
    text = process_text(' '.join(words))
    keywords = kw_model.extract_keywords(text, top_n=5)
    keyword_list = [kw for kw, _ in keywords]  # Avoid shadowing the function name
    rank = rank_keywords(keyword_list)
    return keyword_list, rank
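# Minimal end-to-end sketch (the sample sentence is hypothetical; all model and
# data files loaded above must be present for this to run):
if __name__ == "__main__":
    interpreter = load_tflite_model()
    sample = "aku merasa sedih dan kecewa hari ini"
    print("Emotion :", predict_emotion(sample, interpreter))
    kws, score = keyword(sample)
    print("Keywords:", kws, "| rank score:", score)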