File size: 3,788 Bytes
ad4ed41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import fileinput
import os
import platform
from subprocess import Popen, PIPE

# Class to align original and translated sentences
# based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
class Aligner():
    """Word-align sentence pairs with fast_align and symmetrize them with atools.

    The forward and reverse models are expected in ``config_folder`` as
    ``<src>-<tgt>.params`` / ``<tgt>-<src>.params`` together with the matching
    ``.err`` training logs, from which the tension (``-T``) and mean length
    ratio (``-m``) values are read.

    NOTE: on non-Windows systems the binaries are referenced as ``./fast_align``
    and ``./atools``, i.e. relative to the current working directory.
    """

    def __init__(self, config_folder, source_lang, target_lang, temp_folder):
        """Prepare the command lines used by :meth:`align`.

        Args:
            config_folder: Folder holding the ``.params`` and ``.err`` files.
            source_lang: Source language code used in the model file names.
            target_lang: Target language code used in the model file names.
            temp_folder: Folder where the intermediate files are written.

        Raises:
            OSError: If one of the ``.err`` files cannot be read.
        """
        forward_params_path = os.path.join(config_folder, f"{source_lang}-{target_lang}.params")
        reverse_params_path = os.path.join(config_folder, f"{target_lang}-{source_lang}.params")

        fwd_T, fwd_m = self.__read_err(os.path.join(config_folder, f"{source_lang}-{target_lang}.err"))
        rev_T, rev_m = self.__read_err(os.path.join(config_folder, f"{target_lang}-{source_lang}.err"))

        self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
        self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")

        if platform.system().lower() == "windows":
            fastalign_bin = "fast_align.exe"
            atools_bin = "atools.exe"
        else:
            fastalign_bin = "./fast_align"
            atools_bin = "./atools"

        self.temp_file_path = os.path.join(temp_folder, "tokenized_sentences_to_align.txt")

        self.forward_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", fwd_T, "-m", fwd_m, "-f",
                                forward_params_path]
        # The reverse model must be run with the "-r" flag; a bare "r" is just
        # a stray positional argument and leaves fast_align in forward mode.
        self.reverse_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", rev_T, "-m", rev_m, "-f",
                                reverse_params_path, "-r"]

        self.symmetric_command = [atools_bin, "-i", self.forward_alignment_file_path, "-j",
                                  self.reverse_alignment_file_path, "-c", "grow-diag-final-and"]

    def __simplify_alignment_file(self, file):
        """Rewrite ``file`` so each line keeps only its third ``|||`` field.

        The original content is kept next to it as ``<file>.bak``.

        Raises:
            IndexError: If a line has fewer than three ``|||``-separated fields.
        """
        backup_path = file + ".bak"
        os.replace(file, backup_path)
        # The file echoes the input sentences, so it has to be decoded with the
        # same explicit encoding that was used to write them.
        with open(backup_path, encoding="utf-8") as source, open(file, "w", encoding="utf-8") as target:
            for line in source:
                target.write(line.split("|||")[2].strip() + "\n")

    def __read_err(self, err):
        """Extract the tension and mean length ratio from a training log.

        Args:
            err: Path of the ``.err`` log file.

        Returns:
            A ``(T, m)`` tuple of strings; a value that is not found in the
            log is returned as an empty string.
        """
        T, m = "", ""
        # Only ASCII keywords and numbers are parsed, so undecodable bytes
        # elsewhere in the log are replaced rather than treated as an error.
        with open(err, encoding="utf-8", errors="replace") as log:
            for line in log:
                # expected target length = source length * N
                if "expected target length" in line:
                    m = line.split()[-1]
                # final tension: N
                elif "final tension" in line:
                    T = line.split()[-1]
        return T, m

    @staticmethod
    def __parse_alignments(alignments_str):
        """Convert ``i-j`` alignment text into one list of int pairs per line.

        Lines without any alignment point yield an empty list.
        """
        # split() without an argument tolerates empty lines and stray spaces.
        return [
            [tuple(int(index) for index in pair.split("-")) for pair in line.split()]
            for line in alignments_str.splitlines()
        ]

    def align(self, original_sentences, translated_sentences):
        """Compute symmetrized word alignments for parallel sentences.

        Args:
            original_sentences: Iterable of tokenized source sentences.
            translated_sentences: Iterable of tokenized target sentences,
                paired positionally with ``original_sentences``.

        Returns:
            One list per output line of atools, each containing
            ``(source_index, target_index)`` integer tuples.

        Raises:
            OSError: If a binary cannot be started or a file cannot be written.
        """
        # create temporary file which fastalign will use
        with open(self.temp_file_path, "w", encoding="utf-8") as temp_file:
            temp_file.writelines(
                f"{original} ||| {translated}\n"
                for original, translated in zip(original_sentences, translated_sentences)
            )

        # generate forward and reverse alignments in parallel
        with open(self.forward_alignment_file_path, "w") as f_out, \
                open(self.reverse_alignment_file_path, "w") as r_out:
            fw_process = Popen(self.forward_command, stdout=f_out)
            r_process = Popen(self.reverse_command, stdout=r_out)

            # wait for both to finish
            fw_process.wait()
            r_process.wait()

        # the output repeats the sentences and a score; keep only the alignments
        self.__simplify_alignment_file(self.forward_alignment_file_path)
        self.__simplify_alignment_file(self.reverse_alignment_file_path)

        # generate symmetrical alignment; communicate() drains the pipes while
        # waiting, whereas wait() followed by communicate() can deadlock once
        # the child fills a pipe buffer
        process = Popen(self.symmetric_command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        stdout_data, _ = process.communicate()

        # get final alignments and format them
        return self.__parse_alignments(stdout_data.decode("utf-8"))