from flask import Flask, request, jsonify
import tensorflow as tf
from transformers import AutoTokenizer, TFT5ForConditionalGeneration
from transformers import MBartForConditionalGeneration, MBart50Tokenizer
import os
import re
import spacy
from nltk.corpus import wordnet as wn
import random
import nltk

nltk.download('wordnet')

nlp = spacy.load("en_core_web_sm")

app = Flask(__name__)

LOCAL_QG_MODEL_PATH = "blaxx14/t5-question-generation"

"""string into dictionary""" |
|
def parse_to_dict(input_string): |
|
try: |
|
question_part, answer_part = input_string.split('Answer: ') |
|
question = question_part.replace('Question: ', '').strip() |
|
answer = answer_part.strip() |
|
|
|
result_dict = { |
|
"Question": question, |
|
"Answer": answer |
|
} |
|
|
|
return result_dict |
|
|
|
except ValueError: |
|
print("Format input string tidak sesuai") |
|
return None |
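# Illustrative example for parse_to_dict (hypothetical model output):
#   parse_to_dict("Question: What is the capital of France? Answer: Paris")
#   -> {"Question": "What is the capital of France?", "Answer": "Paris"}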

def get_synonyms(word):
    """Find synonyms for a word using WordNet."""
    synonyms = set()
    for syn in wn.synsets(word):
        for lemma in syn.lemmas():
            # Lemma names use underscores for multi-word terms (e.g. 'hot_dog')
            synonyms.add(lemma.name())
    return list(synonyms)

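# Illustrative only (results depend on the installed WordNet data):
#   get_synonyms("dog") may include 'domestic_dog', 'Canis_familiaris', 'frank'
#   and 'andiron', since every lemma of every synset of the word is collected.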

def generate_distractors(question, correct_answer):
    """Create distractors from WordNet synonyms of nouns in the question."""
    doc = nlp(question)

    # Use nouns and proper nouns from the question as candidate keywords
    keywords = [token.text for token in doc if token.pos_ in ['NOUN', 'PROPN']]

    distractors = []

    for keyword in keywords:
        synonyms = get_synonyms(keyword)
        # Drop candidates that match the correct answer
        synonyms = [word for word in synonyms if word.lower() != correct_answer.lower()]
        distractors.extend(synonyms)

    # Deduplicate, then pick at most three distractors at random
    distractors = list(dict.fromkeys(distractors))
    distractors = random.sample(distractors, min(3, len(distractors)))

    return distractors

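# Illustrative (hypothetical question): for "What is the capital of France?" with the
# answer "Paris", the keywords are "capital" and "France", so the distractors are
# sampled from the WordNet synonyms of those two words.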

# Load question generator model and tokenizer
print("Loading model...")
model = TFT5ForConditionalGeneration.from_pretrained(LOCAL_QG_MODEL_PATH, from_pt=False)
tokenizer = AutoTokenizer.from_pretrained("t5-small")
print("Model loaded successfully.")


def generate_question(text, max_length=4096):
    """Generate a question-answer pair from the given text."""
    input_text = f"Generate question answer: {text}"
    input_ids = tokenizer.encode(input_text, return_tensors="tf", max_length=512, truncation=True)

    # num_beams together with do_sample enables beam-search multinomial sampling;
    # top_p and temperature control how random the sampled continuations are
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=10,
        top_k=0,
        top_p=0.8,
        temperature=1.5,
        do_sample=True,
        early_stopping=True
    )

    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return output_text


def clean_text(text):
    """Clean the input: drop 'translit.' markers and bracketed content such as [1]."""
    cleaned_text = text.replace("translit.", "")
    cleaned_text = re.sub(r'\[.*?\]', '', cleaned_text)
    return cleaned_text


def split_text_into_sentences(paragraph):
    """Clean a paragraph and split it into sentences at end punctuation."""
    text = clean_text(paragraph)
    sentences = re.split(r'(?<=[.?!])\s+', text)
    return sentences


def split_into_parts(sentences, num_parts=5):
    """Group sentences into at most num_parts chunks (each chunk is a list of sentences)."""
    if len(sentences) <= num_parts:
        # Wrap each sentence in its own list so the return type matches the else branch
        return [[sentence] for sentence in sentences]
    else:
        part_size = len(sentences) // num_parts
        parts = [sentences[i:i + part_size] for i in range(0, len(sentences), part_size)]

        # Merge any extra chunks into the preceding one so at most num_parts remain
        while len(parts) > num_parts:
            parts[-2].extend(parts[-1])
            parts = parts[:-1]

        return parts

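# Illustrative: 12 sentences with num_parts=5 gives part_size=2, so the slicing first
# yields six chunks of two sentences; the trailing chunk is merged into the previous
# one, leaving five parts of 2, 2, 2, 2 and 4 sentences.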

# Route for running the generator and saving the results in the cloud
@app.route('/generate-question', methods=['POST'])
def api_generate_question():
    try:
        data = request.json
        text = data.get('text', '')

        if not text:
            return jsonify({'error': 'Text must not be empty'}), 400

        # Clean the input and split it into parts
        formatted_sentences = split_text_into_sentences(text)
        parts = split_into_parts(formatted_sentences)

        # Generate one question per part
        question_list = []

        for sentence in parts:
            combined_input = ' '.join(sentence)
            result = generate_question(combined_input)
            result_dict = parse_to_dict(result)

            # Skip model outputs that could not be parsed into a question/answer pair
            if result_dict is None:
                continue

            distractors = generate_distractors(result_dict["Question"], result_dict["Answer"])
            result_dict["distractor"] = distractors
            question_list.append(result_dict)

        print(question_list)
        return jsonify({'generated_question': question_list})
    except Exception as e:
        return jsonify({'error': str(e)}), 500


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080)

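# Example usage (hypothetical input text; actual questions depend on the model):
#
#   curl -X POST http://localhost:8080/generate-question \
#        -H "Content-Type: application/json" \
#        -d '{"text": "The Nile is a major river in northeastern Africa. It flows north into the Mediterranean Sea."}'
#
# The response is JSON of the form:
#   {"generated_question": [{"Question": "...", "Answer": "...", "distractor": ["...", "...", "..."]}, ...]}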