# Author: mjuvilla
# Commit ad4ed41: moved scripts to src folder, created new create that hopefully should be able to work with any type of document
import fileinput
import os
import platform
from subprocess import Popen, PIPE
# Class to align original and translated sentences
# based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
class Aligner():
    """Word-align original and translated sentences with fast_align and atools.

    The constructor only prepares file paths and command lines; the external
    binaries are executed each time :meth:`align` is called. On non-Windows
    systems the binaries are referenced as ``./fast_align`` and ``./atools``,
    i.e. relative to the current working directory of the process.
    """

    def __init__(self, config_folder, source_lang, target_lang, temp_folder):
        """Build the fast_align / atools command lines.

        Args:
            config_folder: Folder holding ``{src}-{tgt}.params`` /
                ``{tgt}-{src}.params`` and the matching ``.err`` files.
            source_lang: Language code used in the config file names.
            target_lang: Language code used in the config file names.
            temp_folder: Folder where the intermediate input and alignment
                files are written.

        Raises:
            OSError: If one of the ``.err`` files cannot be opened.
        """
        forward_params_path = os.path.join(config_folder, f"{source_lang}-{target_lang}.params")
        reverse_params_path = os.path.join(config_folder, f"{target_lang}-{source_lang}.params")

        # The tension (-T) and mean length ratio (-m) values are taken from
        # the text of the .err files found next to the params files.
        fwd_T, fwd_m = self.__read_err(os.path.join(config_folder, f"{source_lang}-{target_lang}.err"))
        rev_T, rev_m = self.__read_err(os.path.join(config_folder, f"{target_lang}-{source_lang}.err"))

        self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
        self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")

        if platform.system().lower() == "windows":
            fastalign_bin = "fast_align.exe"
            atools_bin = "atools.exe"
        else:
            fastalign_bin = "./fast_align"
            atools_bin = "./atools"

        self.temp_file_path = os.path.join(temp_folder, "tokenized_sentences_to_align.txt")

        self.forward_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", fwd_T, "-m", fwd_m, "-f",
                                forward_params_path]
        # The reverse run needs fast_align's "-r" switch. A bare "r" is just a
        # stray positional argument, so the run would not be reversed.
        self.reverse_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", rev_T, "-m", rev_m, "-f",
                                reverse_params_path, "-r"]

        self.symmetric_command = [atools_bin, "-i", self.forward_alignment_file_path, "-j",
                                  self.reverse_alignment_file_path, "-c", "grow-diag-final-and"]

    def __simplify_alignment_file(self, file):
        """Rewrite ``file`` so each line keeps only its third ``|||`` field.

        The previous content is kept next to it as ``file + ".bak"``.

        Raises:
            IndexError: If a line has fewer than three ``|||``-separated fields.
        """
        backup_path = file + ".bak"
        # Move the raw output aside, then regenerate the file from the backup.
        os.replace(file, backup_path)
        # Explicit UTF-8 so sentence text in the raw lines decodes the same
        # way regardless of the platform's locale encoding.
        with open(backup_path, encoding="utf-8") as raw, open(file, "w", encoding="utf-8") as simplified:
            for line in raw:
                simplified.write(line.split('|||')[2].strip() + "\n")

    def __read_err(self, err):
        """Extract the tension and length-ratio values from an ``.err`` file.

        Returns:
            ``(T, m)`` as strings: the last whitespace-separated token of the
            last line containing ``final tension`` and of the last line
            containing ``expected target length`` respectively. Either value
            is ``''`` when no such line exists.
        """
        T, m = '', ''
        # Context manager so the file handle is released deterministically.
        with open(err) as err_file:
            for line in err_file:
                # expected target length = source length * N
                if 'expected target length' in line:
                    m = line.split()[-1]
                # final tension: N
                elif 'final tension' in line:
                    T = line.split()[-1]
        return T, m

    @staticmethod
    def __parse_alignments(alignments_str):
        """Turn lines of ``i-j`` pairs into lists of ``(int, int)`` tuples.

        An empty line yields an empty list instead of raising, so the result
        keeps one entry per input line.
        """
        return [
            [(int(i), int(j)) for i, j in (pair.split("-") for pair in line.split())]
            for line in alignments_str.splitlines()
        ]

    def align(self, original_sentences, translated_sentences):
        """Compute symmetrised word alignments for parallel sentences.

        Args:
            original_sentences: Iterable of sentence strings.
            translated_sentences: Iterable of sentence strings, paired
                positionally with ``original_sentences`` (``zip`` stops at the
                shorter of the two).
            NOTE(review): the temp file name suggests the sentences are
            expected to be already tokenized - confirm with callers.

        Returns:
            A list with one entry per line printed by atools; each entry is a
            list of ``(i, j)`` integer pairs parsed from the ``i-j`` tokens on
            that line (presumably source/target token indices - confirm
            against the fast_align documentation).
        """
        # create temporary file which fastalign will use
        with open(self.temp_file_path, "w", encoding="utf-8") as temp_file:
            for original, translated in zip(original_sentences, translated_sentences):
                temp_file.write(f"{original} ||| {translated}\n")

        # run forward and reverse alignment concurrently, each writing to its own file
        with open(self.forward_alignment_file_path, 'w') as f_out, \
                open(self.reverse_alignment_file_path, 'w') as r_out:
            fw_process = Popen(self.forward_command, stdout=f_out)
            r_process = Popen(self.reverse_command, stdout=r_out)
            # wait for both to finish before the output files are closed
            fw_process.wait()
            r_process.wait()

        # the output files contain more information than needed, keep only the alignment field
        self.__simplify_alignment_file(self.forward_alignment_file_path)
        self.__simplify_alignment_file(self.reverse_alignment_file_path)

        # generate symmetrical alignment
        process = Popen(self.symmetric_command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        # communicate() drains both pipes while waiting; calling wait() first
        # can deadlock once the child fills a pipe buffer.
        stdout_data, _ = process.communicate()

        # get final alignments and format them
        return self.__parse_alignments(stdout_data.decode('utf-8'))