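# Spaces: Sleeping
# (The line above appears to be a hosting-page status banner captured together
# with the source; it is not part of the module.)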
import fileinput
import os
import platform
from subprocess import PIPE, CalledProcessError, Popen
# Class to align original and translated sentences.
# Based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
class Aligner:
    """Word-align sentence pairs by driving the external fast_align tools.

    The constructor prepares three command lines: a forward fast_align run,
    a reverse fast_align run, and an atools run that merges both outputs
    with the ``grow-diag-final-and`` heuristic. ``align`` then executes them
    on a batch of sentence pairs.

    All intermediate files live at fixed names inside ``temp_folder``, so two
    ``align`` calls sharing the same folder overwrite each other's files.
    """

    def __init__(self, config_folder, source_lang, target_lang, temp_folder):
        """Build the command lines used by ``align``.

        Args:
            config_folder: Folder holding ``<src>-<tgt>.params`` /
                ``<src>-<tgt>.err`` and the ``<tgt>-<src>`` counterparts.
            source_lang: Language code used in the config file names.
            target_lang: Language code used in the config file names.
            temp_folder: Folder where the sentence file and the forward /
                reverse alignment files are written.

        Raises:
            OSError: If one of the two ``.err`` files cannot be read.
        """
        forward_stem = os.path.join(config_folder, f"{source_lang}-{target_lang}")
        reverse_stem = os.path.join(config_folder, f"{target_lang}-{source_lang}")
        forward_params_path = forward_stem + ".params"
        reverse_params_path = reverse_stem + ".params"

        fwd_T, fwd_m = self.__read_err(forward_stem + ".err")
        rev_T, rev_m = self.__read_err(reverse_stem + ".err")

        self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
        self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")
        self.temp_file_path = os.path.join(temp_folder, "tokenized_sentences_to_align.txt")

        # Binaries are resolved by name on Windows and relative to the current
        # working directory elsewhere.
        if platform.system().lower() == "windows":
            fastalign_bin = "fast_align.exe"
            atools_bin = "atools.exe"
        else:
            fastalign_bin = "./fast_align"
            atools_bin = "./atools"

        self.forward_command = [
            fastalign_bin, "-i", self.temp_file_path, "-d",
            "-T", fwd_T, "-m", fwd_m, "-f", forward_params_path,
        ]
        # The reverse direction needs fast_align's "-r" switch. A bare "r" is
        # only a stray positional argument, which would leave the "reverse"
        # run aligning in the forward direction.
        self.reverse_command = [
            fastalign_bin, "-i", self.temp_file_path, "-d",
            "-T", rev_T, "-m", rev_m, "-f", reverse_params_path, "-r",
        ]
        self.symmetric_command = [
            atools_bin, "-i", self.forward_alignment_file_path,
            "-j", self.reverse_alignment_file_path, "-c", "grow-diag-final-and",
        ]

    def __simplify_alignment_file(self, file):
        """Rewrite ``file`` so each line keeps only its third ``|||`` field.

        The previous content is preserved next to it as ``<file>.bak``.

        Raises:
            IndexError: If a line has fewer than three ``|||``-separated
                fields; in that case ``file`` is left untouched.
        """
        # Only the third field is kept, so undecodable bytes in the discarded
        # fields are replaced rather than allowed to abort the rewrite.
        with open(file, encoding="utf-8", errors="replace") as raw:
            simplified = [line.split("|||")[2].strip() + "\n" for line in raw]
        # Everything is parsed before the original is moved aside, so a
        # malformed line cannot leave a half-written file behind.
        os.replace(file, file + ".bak")
        with open(file, "w", encoding="utf-8") as out:
            out.writelines(simplified)

    def __read_err(self, err):
        """Extract the tension and length-ratio values from a training log.

        Args:
            err: Path of the ``.err`` file to scan.

        Returns:
            ``(T, m)`` as strings: the last token of the last line containing
            ``final tension`` and of the last line containing
            ``expected target length``. A value is ``''`` when no such line
            exists.
        """
        T, m = "", ""
        with open(err, encoding="utf-8", errors="replace") as err_file:
            for line in err_file:
                # expected target length = source length * N
                if "expected target length" in line:
                    m = line.split()[-1]
                # final tension: N
                elif "final tension" in line:
                    T = line.split()[-1]
        return T, m

    @staticmethod
    def _parse_alignment_line(line):
        """Turn one ``i-j i-j ...`` line into a list of ``(int, int)`` tuples.

        A blank line yields an empty list.
        """
        pairs = []
        for pair in line.split():
            source_index, target_index = pair.split("-")
            pairs.append((int(source_index), int(target_index)))
        return pairs

    def align(self, original_sentences, translated_sentences):
        """Compute symmetrised word alignments for a batch of sentence pairs.

        Args:
            original_sentences: Iterable of source sentences.
            translated_sentences: Iterable of target sentences, paired
                positionally with ``original_sentences`` (extra items on the
                longer side are ignored, as with ``zip``).

        Returns:
            One list per line printed by the symmetrisation command, each a
            list of ``(i, j)`` integer tuples; a blank line gives ``[]``.

        Raises:
            CalledProcessError: If any of the three external commands exits
                with a non-zero status.
            OSError: If a binary cannot be started or a file cannot be
                written.
        """
        # Sentence file consumed by both fast_align runs. UTF-8 is written
        # explicitly because the tool output is decoded as UTF-8 below.
        with open(self.temp_file_path, "w", encoding="utf-8") as temp_file:
            temp_file.writelines(
                f"{original} ||| {translated}\n"
                for original, translated in zip(original_sentences, translated_sentences)
            )

        # Forward and reverse runs are independent, so start both and then
        # wait for each of them.
        with open(self.forward_alignment_file_path, "w") as f_out, \
                open(self.reverse_alignment_file_path, "w") as r_out:
            fw_process = Popen(self.forward_command, stdout=f_out)
            r_process = Popen(self.reverse_command, stdout=r_out)
            fw_status = fw_process.wait()
            r_status = r_process.wait()
        if fw_status != 0:
            raise CalledProcessError(fw_status, self.forward_command)
        if r_status != 0:
            raise CalledProcessError(r_status, self.reverse_command)

        # The raw output carries more than the alignment points; keep only
        # the alignment field before merging.
        self.__simplify_alignment_file(self.forward_alignment_file_path)
        self.__simplify_alignment_file(self.reverse_alignment_file_path)

        # communicate() drains both pipes while waiting. Calling wait() first
        # can deadlock once the child fills a pipe buffer.
        process = Popen(self.symmetric_command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        stdout_data, stderr_data = process.communicate()
        if process.returncode != 0:
            raise CalledProcessError(
                process.returncode, self.symmetric_command,
                output=stdout_data, stderr=stderr_data,
            )

        return [
            self._parse_alignment_line(line)
            for line in stdout_data.decode("utf-8").splitlines()
        ]