|
|
|
import tkinter as tk |
|
from tkinter import ttk |
|
|
|
from keras.layers import TextVectorization |
|
import re |
|
import tensorflow.strings as tf_strings |
|
import json |
|
import string |
|
from keras.models import load_model |
|
import tensorflow as tf |
|
from keras.preprocessing.text import tokenizer_from_json |
|
from keras.utils import pad_sequences |
|
import numpy as np |
|
|
|
|
|
# Characters removed during standardization: ASCII punctuation plus the
# Spanish inverted question mark, minus the square brackets (brackets are
# kept so the "[start]"/"[end]" sequence tokens survive standardization).
strip_chars = "".join(ch for ch in string.punctuation + "¿" if ch not in "[]")
|
|
|
def custom_standardization(input_string):
    """Lowercase *input_string* and delete every character in ``strip_chars``.

    Used as the ``standardize`` step of the TextVectorization layers; since
    square brackets are excluded from ``strip_chars``, the "[start]" and
    "[end]" sequence markers are preserved.
    """
    lowered = tf_strings.lower(input_string)
    pattern = f"[{re.escape(strip_chars)}]"
    return tf_strings.regex_replace(lowered, pattern, "")
|
|
|
|
|
# Rebuild the English TextVectorization layer from its saved configuration.
with open('eng_vectorization_config.json') as json_file:
    eng_vectorization_config = json.load(json_file)


# BUG FIX: the custom standardizer must be supplied at construction time
# (as the Spanish layer below does). Assigning
# `eng_vectorization.standardize = custom_standardization` after the layer
# is built is not guaranteed to rewire the layer's preprocessing — the
# layer captures the standardize callable in __init__ — so the custom
# standardization could be silently ignored for English input.
eng_vectorization = TextVectorization(
    max_tokens=eng_vectorization_config['max_tokens'],
    output_mode=eng_vectorization_config['output_mode'],
    output_sequence_length=eng_vectorization_config['output_sequence_length'],
    standardize=custom_standardization,
)
|
|
|
|
|
# Rebuild the Spanish TextVectorization layer from its saved configuration,
# wiring in the custom standardizer at construction time.
with open('spa_vectorization_config.json') as config_file:
    spa_vectorization_config = json.load(config_file)


spa_vectorization = TextVectorization(
    standardize=custom_standardization,
    max_tokens=spa_vectorization_config['max_tokens'],
    output_mode=spa_vectorization_config['output_mode'],
    output_sequence_length=spa_vectorization_config['output_sequence_length'],
)
|
|
|
|
|
# Restore the fitted vocabularies for both vectorization layers.
with open('eng_vocab.json') as vocab_file:
    eng_vocab = json.load(vocab_file)
eng_vectorization.set_vocabulary(eng_vocab)


with open('spa_vocab.json') as vocab_file:
    spa_vocab = json.load(vocab_file)
spa_vectorization.set_vocabulary(spa_vocab)
|
|
|
|
|
# Trained English→Spanish transformer, restored from disk.
transformer = load_model('transformer_model')

# Token index -> Spanish token, for decoding model output positions.
spa_index_lookup = dict(enumerate(spa_vocab))
# Hard cap on the number of generated target tokens.
max_decoded_sentence_length = 20
|
|
|
def beam_search_decode(input_sentence, beam_width=3):
    """Translate *input_sentence* (English) to Spanish via beam search.

    Keeps the ``beam_width`` highest-scoring partial sentences at each
    decoding step and returns the text of the best one — still prefixed
    with "[start]" and possibly suffixed with "[end]" (the caller strips
    these markers).

    NOTE(review): the beam score sums the raw values from ``top_k.values``;
    if the transformer outputs probabilities rather than log-probabilities,
    summing them is not the standard beam-search scoring — confirm against
    the model's final layer.
    """

    tokenized_input_sentence = eng_vectorization([input_sentence])

    # Each beam entry is (partial sentence text, cumulative score).
    decoded_sentences = [("[start]", 0.0)]



    for i in range(max_decoded_sentence_length):

        all_candidates = []

        for decoded_sentence, score in decoded_sentences:

            # Drop the final position so the decoder input is one shorter
            # than the vectorizer's output_sequence_length (teacher-forcing
            # style offset).
            tokenized_target_sentence = spa_vectorization([decoded_sentence])[:, :-1]

            predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

            # Top beam_width token candidates at the current position i.
            top_k = tf.math.top_k(predictions[0, i, :], k=beam_width)



            for j in range(beam_width):

                predicted_token_index = top_k.indices[j].numpy()

                predicted_token = spa_index_lookup[predicted_token_index]

                # NOTE(review): beams that already end with "[end]" keep
                # being extended until *all* beams have finished.
                candidate = (decoded_sentence + " " + predicted_token, score + top_k.values[j].numpy())

                all_candidates.append(candidate)



        # Prune: keep only the beam_width best-scoring candidates.
        ordered = sorted(all_candidates, key=lambda x: x[1], reverse=True)

        decoded_sentences = ordered[:beam_width]



        # Stop early once every surviving beam has emitted "[end]".
        if all(sentence[0].endswith("[end]") for sentence in decoded_sentences):

            break



    return decoded_sentences[0][0]
|
|
|
|
|
|
|
# English→French seq2seq model plus its tokenizers and the fixed sequence
# length it was trained with, all restored from disk.
model = load_model('english_to_french_model')


with open('english_tokenizer.json') as fh:
    english_tokenizer = tokenizer_from_json(json.load(fh))

with open('french_tokenizer.json') as fh:
    french_tokenizer = tokenizer_from_json(json.load(fh))


with open('sequence_length.json') as fh:
    max_length = json.load(fh)
|
|
|
def pad(x, length=None):
    """Pad (or truncate) each sequence in *x* to *length*, zero-filling at the end."""
    padded = pad_sequences(x, maxlen=length, padding='post')
    return padded
|
|
|
def translate_to_french(english_sentence):
    """Translate an English sentence to French with the seq2seq model.

    Pipeline: normalize text, tokenize, pad to the model's fixed input
    length, predict, then greedily pick the most likely French token at
    each output position.
    """
    # Normalize: lowercase and drop sentence punctuation.
    cleaned = re.sub(r'[.?!,]', '', english_sentence.lower())

    # Tokenize and shape into a (1, max_length) batch.
    sequences = english_tokenizer.texts_to_sequences([cleaned])
    batch = pad(sequences, max_length).reshape((-1, max_length))

    # Greedy decode: argmax over the vocabulary at every position.
    probabilities = model.predict(batch)[0]
    token_ids = [np.argmax(step) for step in probabilities]
    return french_tokenizer.sequences_to_texts([token_ids])[0]
|
|
|
def translate_to_spanish(english_sentence):
    """Translate an English sentence to Spanish via beam-search decoding."""
    decoded = beam_search_decode(english_sentence)
    # Remove the sequence markers before presenting the result.
    for marker in ("[start]", "[end]"):
        decoded = decoded.replace(marker, "")
    return decoded.strip()
|
|
|
|
|
def handle_translate():
    """Translate the input-box text into the selected language and display it."""
    selected_language = language_var.get()
    english_sentence = text_input.get("1.0", "end-1c")

    # BUG FIX: previously, leaving the combobox empty (its initial state)
    # skipped both branches and left `translation` unbound, raising a
    # NameError on the output line below.
    if selected_language == "French":
        translation = translate_to_french(english_sentence)
    elif selected_language == "Spanish":
        translation = translate_to_spanish(english_sentence)
    else:
        translation_output.delete("1.0", "end")
        translation_output.insert("end", "Please select a language to translate to.")
        return

    translation_output.delete("1.0", "end")
    translation_output.insert("end", f"{selected_language} translation: {translation}")
|
|
|
|
|
# --- GUI -------------------------------------------------------------------
root = tk.Tk()
root.title("Language Translator")
root.geometry("550x600")

# Shared fonts for all widgets.
FONT_FAMILY = "Times New Roman"
FONT_SIZE = 14
body_font = (FONT_FAMILY, FONT_SIZE)
heading_font = (FONT_FAMILY, FONT_SIZE, 'bold')

# Input area: heading plus a multi-line box for the English source text.
input_frame = tk.Frame(root)
input_frame.pack(pady=10)
tk.Label(input_frame, text="Enter the text to be translated", font=heading_font).pack()
text_input = tk.Text(input_frame, height=5, width=50, font=body_font)
text_input.pack()

# Target-language selector (read-only so only the listed values are possible).
language_var = tk.StringVar()
tk.Label(root, text="Select the language to translate to", font=heading_font).pack()
language_select = ttk.Combobox(root, textvariable=language_var, values=["French", "Spanish"], font=body_font, state="readonly")
language_select.pack()

# Button wired to the translation handler.
ttk.Button(root, text="Translate", command=handle_translate).pack(pady=10)

# Output area: heading plus the box the handler writes the translation into.
output_frame = tk.Frame(root)
output_frame.pack(pady=10)
tk.Label(output_frame, text="Translation: ", font=heading_font).pack()
translation_output = tk.Text(output_frame, height=10, width=50, font=body_font)
translation_output.pack()

root.mainloop()
|
|