File size: 12,143 Bytes
100f3e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1792639
 
100f3e3
 
1792639
100f3e3
1792639
 
 
100f3e3
 
1792639
100f3e3
 
 
1792639
100f3e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1792639
100f3e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ad4ed41
 
 
 
0efc9da
 
 
 
 
100f3e3
0efc9da
100f3e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0348f21
100f3e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1792639
100f3e3
 
 
 
 
 
 
 
 
1792639
100f3e3
 
 
 
 
 
 
 
 
 
1792639
36f2ac1
100f3e3
 
 
36f2ac1
1792639
100f3e3
 
 
 
 
 
0348f21
100f3e3
0348f21
 
 
 
 
 
 
 
 
100f3e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1792639
100f3e3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
import time
import json
import requests
import tqdm
import os

from docx import Document
from docx.text.hyperlink import Hyperlink
from docx.text.run import Run
import nltk

# Download the NLTK 'punkt' and 'punkt_tab' resources every time this module is
# imported. Presumably these back the sent_tokenize/word_tokenize calls used
# below -- TODO confirm which of the two the installed NLTK version needs.
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.tokenize import sent_tokenize, word_tokenize

from itertools import groupby

# Module-level host and port, presumably of the translation server.
# NOTE(review): the functions visible in this file receive ip/port as
# parameters (translate_document repeats these same values as its defaults)
# and do not read these globals -- confirm whether other modules import them.
ip = "192.168.20.216"
port = "8000"


def translate(text, ip, port, timeout=None):
    """Translate *text* by POSTing it to the translation server.

    Sends ``{'id': '1', 'src': text}`` as JSON to
    ``http://<ip>:<port>/translate`` and returns the ``'tgt'`` field of the
    JSON reply.

    Args:
        text: Source text to translate.
        ip: Host name or IP address of the translation server.
        port: Server port; any value ``int()`` accepts (``"8000"`` or ``8000``).
        timeout: Optional timeout in seconds forwarded to ``requests.post``.
            ``None`` (the default) waits indefinitely, as this function
            always did.

    Returns:
        The value stored under ``'tgt'`` in the server response.

    Raises:
        ValueError: If *port* cannot be converted to an integer.
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        json.JSONDecodeError: If the response body is not valid JSON.
        KeyError: If the response JSON has no ``'tgt'`` field.
    """
    payload = {
        'id': '1',
        'src': text,
    }
    # Round-trip through int() so a malformed port fails here instead of
    # producing a broken URL.
    port = str(int(port))
    url = 'http://' + ip + ':' + port + '/translate'
    response = requests.post(url, json=payload, timeout=timeout)
    # Surface server-side failures explicitly rather than as a confusing
    # JSON/KeyError further down.
    response.raise_for_status()
    json_response = json.loads(response.text)
    return json_response['tgt']


def _run_to_dict(run, paragraph_index):
    """Snapshot the text and character formatting of one docx run as a dict."""
    return {
        'text': run.text,
        'bold': run.bold,
        'italic': run.italic,
        'underline': run.underline,
        'font_name': run.font.name,
        'font_size': run.font.size,
        'font_color': run.font.color.rgb,
        'paragraph_index': paragraph_index
    }


def extract_paragraphs_with_runs(doc):
    """Extract every paragraph of *doc* as a list of run dictionaries.

    Args:
        doc: Object exposing ``paragraphs``; each paragraph must provide
            ``iter_inner_content()`` yielding ``Run`` and ``Hyperlink`` items.

    Returns:
        One list per paragraph, in document order. Each list holds one dict
        per run with the keys ``text``, ``bold``, ``italic``, ``underline``,
        ``font_name``, ``font_size``, ``font_color`` and ``paragraph_index``
        (the position of the paragraph in ``doc.paragraphs``). Paragraphs
        without runs produce an empty list.
    """
    paragraphs_with_runs = []
    for idx, paragraph in enumerate(doc.paragraphs):
        runs = []
        for item in paragraph.iter_inner_content():
            if isinstance(item, Run):
                runs.append(_run_to_dict(item, idx))
            elif isinstance(item, Hyperlink):
                # A hyperlink can wrap zero or several runs: keep all of them
                # so no link text is dropped and an empty hyperlink does not
                # raise IndexError.
                runs.extend(_run_to_dict(link_run, idx) for link_run in item.runs)

        paragraphs_with_runs.append(runs)
    return paragraphs_with_runs


def tokenize_with_runs(runs, detokenizer):
    """Split a paragraph into sentences of word tokens that keep their run style.

    The paragraph text is rebuilt by detokenizing the run texts, split into
    sentences and then into words. Each run is also tokenized on its own so
    that every token carries a copy of its run's formatting. The two token
    streams are then walked in parallel to attach a style to every word.

    Args:
        runs: List of run dicts (each with at least a ``"text"`` key) that
            belong to one paragraph.
        detokenizer: Object with a ``detokenize(list_of_str) -> str`` method.

    Returns:
        A list of sentences; each sentence is a list of run-dict copies whose
        ``"text"`` is a single token. A word that spans several runs
        contributes one entry per run-level token.

    Raises:
        ValueError: If the paragraph-level tokens cannot be rebuilt from the
            run-level tokens.
    """
    text_paragraph = detokenizer.detokenize([run["text"] for run in runs])
    sentences = sent_tokenize(text_paragraph)
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

    # tokenize each run separately so every token remembers its formatting
    tokens_with_style = []
    for run in runs:
        for token in word_tokenize(run["text"]):
            styled_token = run.copy()
            styled_token["text"] = token
            tokens_with_style.append(styled_token)

    token_index = 0
    tokenized_sentences_with_style = []
    for sentence in tokenized_sentences:
        sentence_with_style = []
        for word in sentence:
            # A word normally matches exactly one styled token, but it may be
            # split across several runs; consume styled tokens until the whole
            # word has been covered.
            word_left = word
            while word_left:
                if token_index >= len(tokens_with_style):
                    raise ValueError(
                        f"Ran out of run tokens while matching word {word!r}")
                styled_token = tokens_with_style[token_index]
                token_text = styled_token["text"]
                if not token_text or not word_left.startswith(token_text):
                    raise ValueError(
                        f"Run token {token_text!r} does not match the remaining "
                        f"text {word_left!r} of word {word!r}")
                sentence_with_style.append(styled_token)
                word_left = word_left.removeprefix(token_text)
                token_index += 1
        tokenized_sentences_with_style.append(sentence_with_style)
    return tokenized_sentences_with_style


def generate_alignments(original_paragraphs_with_runs, translated_paragraphs, aligner, temp_folder, detokenizer):
    """Transfer the run styles of the original tokens onto the translated tokens.

    Args:
        original_paragraphs_with_runs: One list of run dicts per original
            paragraph (see ``tokenize_with_runs`` for the expected keys).
        translated_paragraphs: Translated text of each paragraph, as strings.
        aligner: Object whose ``align(source_sentences, target_sentences)``
            returns, for every sentence pair, an iterable of
            ``(source_index, target_index)`` token pairs.
        temp_folder: Existing directory whose files are deleted before
            aligning (presumably the aligner's scratch space -- it is not
            otherwise used here).
        detokenizer: Object with a ``detokenize`` method, forwarded to
            ``tokenize_with_runs``.

    Returns:
        A list of sentences; each sentence is a list of style dicts, one per
        translated token, with ``"text"`` set to that token.

    Raises:
        AssertionError: If the original and the translated text do not split
            into the same number of sentences.
    """
    # clean temp folder
    for f in os.listdir(temp_folder):
        os.remove(os.path.join(temp_folder, f))

    # tokenize the original text by sentence and words while keeping the style,
    # flattened so we can align with just one call instead of one per paragraph
    original_tokenized_sentences_with_style = [
        sentence
        for runs in original_paragraphs_with_runs
        for sentence in tokenize_with_runs(runs, detokenizer)
    ]

    # tokenize the translated text by sentence and word
    translated_tokenized_sentences = [word_tokenize(sentence) for
                                      translated_paragraph in translated_paragraphs for sentence in
                                      sent_tokenize(translated_paragraph)]

    # Raised explicitly (instead of using `assert`) so the check survives
    # `python -O`; the exception type is kept for existing callers.
    if len(translated_tokenized_sentences) != len(original_tokenized_sentences_with_style):
        raise AssertionError(
            "The original and translated texts contain a different number of sentence, likely due to a translation error")

    original_sentences = []
    translated_sentences = []
    for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
        original_sentences.append(' '.join(item['text'] for item in original))
        translated_sentences.append(' '.join(translated))

    alignments = aligner.align(original_sentences, translated_sentences)

    # using the alignments generated by the aligner, copy the style of each
    # original token to the translated token it is aligned with
    translated_sentences_with_style = []
    for sentence_idx, sentence_alignments in enumerate(alignments):
        original_tokens = original_tokenized_sentences_with_style[sentence_idx]

        # reverse the order of the alignments and build a dict with it
        target_to_source = {target: source for source, target in sentence_alignments}

        translated_sentence_with_style = []
        for token_idx, translated_token in enumerate(translated_tokenized_sentences[sentence_idx]):
            if token_idx in target_to_source:
                # the aligner has found an original token for this one
                style_source = original_tokens[target_to_source[token_idx]]
            elif translated_sentence_with_style:
                # no aligned token: reuse the style of the previous translated word
                style_source = translated_sentence_with_style[-1]
            else:
                # no aligned token and no previous word in this sentence:
                # borrow the style of the first original token rather than
                # failing on an empty list
                style_source = original_tokens[0]

            new_entry = style_source.copy()
            new_entry["text"] = translated_token
            translated_sentence_with_style.append(new_entry)

        translated_sentences_with_style.append(translated_sentence_with_style)

    return translated_sentences_with_style


def group_by_style(values, detokenizer):
    """Merge consecutive tokens that share the same style into single runs.

    Tokens are grouped while all of ``bold``, ``italic``, ``underline``,
    ``font_name``, ``font_size``, ``font_color`` and ``paragraph_index`` stay
    equal. The texts of each group are joined with
    ``detokenizer.detokenize``. Every group except the first gets a leading
    space unless its text starts with one of ``, ; : . ) ! ?``.

    Args:
        values: Iterable of token dicts holding ``text`` plus the style keys.
        detokenizer: Object with a ``detokenize(list_of_str) -> str`` method.

    Returns:
        A list of dicts with the key ``text`` followed by the seven style keys.
    """
    style_fields = ('bold', 'italic', 'underline', 'font_name', 'font_size', 'font_color',
                    'paragraph_index')
    no_space_before = (",", ";", ":", ".", ")", "!", "?")

    def style_of(token):
        return tuple(token[field] for field in style_fields)

    merged = []
    for style, members in groupby(values, key=style_of):
        text = detokenizer.detokenize([member['text'] for member in members])

        # separate this run from the previous one, except before punctuation
        needs_space = bool(merged) and not text.startswith(no_space_before)
        entry = {"text": " " + text if needs_space else text}
        entry.update(zip(style_fields, style))
        merged.append(entry)
    return merged


def preprocess_runs(runs_in_paragraph):
    """Normalise, merge and sentence-split the runs of one paragraph.

    For every run, in order:

    1. ``None`` values of keys that do not start with ``"font"`` become
       ``False`` (formatting flags are sometimes ``None`` and sometimes
       ``False``).
    2. The run is merged into the previous one when both share the same
       formatting and paragraph index.
    3. If the last run then contains more than one sentence, it is split
       into one run per sentence to avoid problems later when aligning
       styles.

    The dicts in *runs_in_paragraph* are not modified; the result is built
    from copies.

    Args:
        runs_in_paragraph: List of run dicts with the keys ``text``, ``bold``,
            ``italic``, ``underline``, ``font_name``, ``font_size``,
            ``font_color`` and ``paragraph_index``.

    Returns:
        A new list of run dicts.
    """
    style_keys = ("bold", "italic", "underline", "font_name", "font_size", "font_color",
                  "paragraph_index")
    new_runs = []

    for original_run in runs_in_paragraph:
        # work on a normalised copy so the caller's dicts stay untouched
        run = {
            key: False if value is None and not key.startswith("font") else value
            for key, value in original_run.items()
        }

        # if the previous run has the same format as the current run, merge them
        if new_runs and all(new_runs[-1][key] == run[key] for key in style_keys):
            new_runs[-1]["text"] += run["text"]
        else:
            new_runs.append(run)

        # split runs that contain more than one sentence
        sentences = sent_tokenize(new_runs[-1]["text"])
        if len(sentences) > 1:
            new_runs[-1]["text"] = sentences[0]
            for sentence in sentences[1:]:
                new_run = new_runs[-1].copy()
                new_run["text"] = sentence
                new_runs.append(new_run)

    return new_runs


def translate_document(input_file,
                       aligner,
                       detokenizer,
                       ip="192.168.20.216",
                       temp_folder="tmp",
                       port="8000",
                       output_file="translated.docx"):
    """Translate a .docx file while carrying over paragraph and run formatting.

    Each paragraph is sent to the translation server. The original and
    translated tokens are then aligned so that every translated token
    inherits the style of its source token. Finally the styled tokens are
    regrouped into runs and written to a new document.

    Args:
        input_file: Path (or file-like object) accepted by ``Document``.
        aligner: Word aligner passed to ``generate_alignments``.
        detokenizer: Object with a ``detokenize(list_of_str) -> str`` method.
        ip: Host of the translation server.
        temp_folder: Scratch directory; created if missing and emptied by
            ``generate_alignments``.
        port: Port of the translation server.
        output_file: Where the translated document is saved. Defaults to
            ``"translated.docx"``, the path that used to be hard-coded.

    Returns:
        The path the translated document was saved to (*output_file*).
    """
    os.makedirs(temp_folder, exist_ok=True)
    # load original file, extract the paragraphs with their runs (which include style and formatting)
    doc = Document(input_file)
    paragraphs_with_runs = extract_paragraphs_with_runs(doc)

    # translate each paragraph
    translated_paragraphs = []
    for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
        paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
        if paragraph_text.strip():
            translated_paragraphs.append(translate(paragraph_text, ip, port))
        else:
            # nothing to translate: skip the request and keep the sentence
            # counts of original and translation in sync
            translated_paragraphs.append("")

    out_doc = Document()

    processed_original_paragraphs_with_runs = [preprocess_runs(runs) for runs in paragraphs_with_runs]

    print("Generating alignments...")
    start_time = time.time()
    translated_sentences_with_style = generate_alignments(processed_original_paragraphs_with_runs,
                                                          translated_paragraphs, aligner,
                                                          temp_folder, detokenizer)
    print(f"Finished alignments in {time.time() - start_time} seconds")

    # flatten the sentences into a list of tokens
    translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
    # group the tokens by style/run
    translated_runs_with_style = group_by_style(translated_tokens_with_style, detokenizer)

    # group the runs by original paragraph
    translated_paragraphs_with_style = {}
    for item in translated_runs_with_style:
        paragraph_runs = translated_paragraphs_with_style.setdefault(item['paragraph_index'], [])
        if not paragraph_runs:
            # first item in the paragraph, remove starting blank space we introduced in group_by_style(),
            # where we didn't know where paragraphs started and ended
            item = item.copy()
            item["text"] = item["text"].lstrip(" ")
        paragraph_runs.append(item)

    for paragraph_index, original_paragraph in enumerate(doc.paragraphs):
        para = out_doc.add_paragraph(style=original_paragraph.style)

        # in case there are empty paragraphs
        if not original_paragraph.text:
            continue

        # .get(): a paragraph with only whitespace yields no translated runs,
        # which must not abort the export with a KeyError
        for item in translated_paragraphs_with_style.get(paragraph_index, []):
            run = para.add_run(item["text"])
            # Preserve original run formatting
            run.bold = item['bold']
            run.italic = item['italic']
            run.underline = item['underline']
            run.font.name = item['font_name']
            run.font.size = item['font_size']
            run.font.color.rgb = item['font_color']

    out_doc.save(output_file)
    print("Saved file")
    return output_file