Spaces:
Sleeping
Sleeping
File size: 3,788 Bytes
ad4ed41 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
import fileinput
import os
import platform
from subprocess import Popen, PIPE
# Class to align original and translated sentences
# based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
class Aligner:
    """Word-align original and translated sentences with fast_align + atools.

    The workflow follows
    https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py:
    a forward and a reverse fast_align pass are run over the same sentence
    pairs using previously trained parameter files, and the two alignments are
    then symmetrized with ``atools`` (``grow-diag-final-and``).

    The ``fast_align`` / ``atools`` executables are referenced by relative
    name, so they are resolved by the OS at call time (on non-Windows systems
    as ``./fast_align`` and ``./atools``, i.e. relative to the working
    directory).
    """

    def __init__(self, config_folder, source_lang, target_lang, temp_folder):
        """Prepare the fast_align / atools command lines.

        Args:
            config_folder: Directory containing ``<src>-<tgt>.params``,
                ``<tgt>-<src>.params`` and the matching ``.err`` files.
            source_lang: Source language code used in the file names.
            target_lang: Target language code used in the file names.
            temp_folder: Directory where the intermediate input and alignment
                files are written.
        """
        forward_params_path = os.path.join(config_folder, f"{source_lang}-{target_lang}.params")
        reverse_params_path = os.path.join(config_folder, f"{target_lang}-{source_lang}.params")

        # The .err files hold the tension (-T) and length multiplier (-m)
        # values that are handed back to fast_align together with the params.
        fwd_T, fwd_m = self.__read_err(os.path.join(config_folder, f"{source_lang}-{target_lang}.err"))
        rev_T, rev_m = self.__read_err(os.path.join(config_folder, f"{target_lang}-{source_lang}.err"))

        self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
        self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")

        if platform.system().lower() == "windows":
            fastalign_bin = "fast_align.exe"
            atools_bin = "atools.exe"
        else:
            fastalign_bin = "./fast_align"
            atools_bin = "./atools"

        self.temp_file_path = os.path.join(temp_folder, "tokenized_sentences_to_align.txt")

        self.forward_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", fwd_T, "-m", fwd_m, "-f",
                                forward_params_path]
        # fast_align's reverse switch is "-r". A bare "r" is a positional
        # argument that the option parser ignores, which would silently run
        # the reverse model in the forward direction.
        self.reverse_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", rev_T, "-m", rev_m, "-f",
                                reverse_params_path, "-r"]

        self.symmetric_command = [atools_bin, "-i", self.forward_alignment_file_path, "-j",
                                  self.reverse_alignment_file_path, "-c", "grow-diag-final-and"]

    def __simplify_alignment_file(self, file):
        """Rewrite ``file`` so each line keeps only its third ``|||`` field.

        The untouched original is kept next to it as ``<file>.bak``.

        Raises:
            IndexError: If a line has fewer than three ``|||``-separated fields.
        """
        backup = file + '.bak'
        # Move the raw output aside and stream it back explicitly as UTF-8,
        # instead of relying on fileinput's process-wide stdout redirection
        # and the locale's default encoding.
        os.replace(file, backup)
        with open(backup, encoding="utf-8") as raw, open(file, "w", encoding="utf-8") as simplified:
            for line in raw:
                simplified.write(line.split('|||')[2].strip() + "\n")

    def __read_err(self, err):
        """Extract the tension and length multiplier from a ``.err`` log.

        Args:
            err: Path to the ``.err`` file.

        Returns:
            Tuple ``(T, m)`` of strings; an element is ``''`` when the
            corresponding line is not present in the file.
        """
        (T, m) = ('', '')
        with open(err) as err_file:
            for line in err_file:
                # expected target length = source length * N
                if 'expected target length' in line:
                    m = line.split()[-1]
                # final tension: N
                elif 'final tension' in line:
                    T = line.split()[-1]
        return T, m

    @staticmethod
    def __parse_alignments(alignments_str):
        """Convert alignment text into lists of ``(i, j)`` integer pairs.

        Each input line holds whitespace-separated ``i-j`` pairs. An empty
        line (a sentence pair without any alignment point) yields an empty
        list, so the result keeps one entry per line.

        Raises:
            ValueError: If a token is not of the form ``<int>-<int>``.
        """
        alignments = []
        for line in alignments_str.splitlines():
            pairs = []
            # split() without arguments tolerates empty lines and repeated or
            # trailing whitespace, which split(" ") does not.
            for pair in line.split():
                i, j = pair.split("-")
                pairs.append((int(i), int(j)))
            alignments.append(pairs)
        return alignments

    def align(self, original_sentences, translated_sentences):
        """Compute symmetrized word alignments for parallel sentences.

        Args:
            original_sentences: Iterable of source sentences (presumably
                already tokenized, as the temp file name suggests).
            translated_sentences: Iterable of the corresponding translations.
                Sentences are paired with ``zip``, so any surplus items in the
                longer iterable are ignored.

        Returns:
            A list with one entry per line of atools output; each entry is a
            list of ``(i, j)`` integer pairs parsed from that line.
        """
        # create temporary file which fastalign will use; written as UTF-8 to
        # match the UTF-8 decoding applied to the tool output below
        with open(self.temp_file_path, "w", encoding="utf-8") as temp_file:
            for original, translated in zip(original_sentences, translated_sentences):
                temp_file.write(f"{original} ||| {translated}\n")

        # run forward and reverse alignment concurrently, each writing
        # straight into its own output file
        with open(self.forward_alignment_file_path, 'w') as f_out, \
                open(self.reverse_alignment_file_path, 'w') as r_out:
            fw_process = Popen(self.forward_command, stdout=f_out)
            r_process = Popen(self.reverse_command, stdout=r_out)

            # wait for both to finish
            fw_process.wait()
            r_process.wait()

        # the output files contain more than the alignment column; strip the rest
        self.__simplify_alignment_file(self.forward_alignment_file_path)
        self.__simplify_alignment_file(self.reverse_alignment_file_path)

        # generate symmetrical alignment
        process = Popen(self.symmetric_command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        # communicate() drains both pipes while waiting; calling wait() first
        # can deadlock once the child fills a pipe buffer.
        stdout_data, _ = process.communicate()

        # get final alignments and format them
        return self.__parse_alignments(stdout_data.decode('utf-8'))
|