Spaces:
Paused
Paused
#!/bin/python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
# --------------------------------------------------------
#
# Tool to extract subset of mined bitexts in a tsv.gz file
import os
import sys
import gzip
import argparse


def extract_bitext(tsv, bitext, src_lang, trg_lang,
                   threshold=1.05, nb_sents=999999999,
                   nb_words_src=999999999, nb_words_trg=999999999,
                   encoding='utf-8'):
    """Extract a subset of mined bitexts from a gzipped TSV file.

    Each input line has the form ``score<TAB>source<TAB>target`` and the
    file is assumed to be sorted by decreasing margin score.  Extraction
    stops at the first line whose score falls below ``threshold``, or as
    soon as the sentence or word budgets would be exceeded.

    Args:
        tsv: path to the gzipped TSV file with mined bitexts.
        bitext: output prefix; ``<bitext>.<src_lang>`` and
            ``<bitext>.<trg_lang>`` are (over)written.
        src_lang: source-language suffix for the output file.
        trg_lang: target-language suffix for the output file.
        threshold: minimal margin score for a pair to be kept.
        nb_sents: maximal number of sentence pairs to extract.
        nb_words_src: maximal total number of source-side words.
        nb_words_trg: maximal total number of target-side words.
        encoding: character encoding for input and output files.

    Returns:
        Tuple ``(nb_lines, nb_words_src, nb_words_trg)`` actually written.
    """
    nl = 0              # sentence pairs written so far
    nw_src = 0          # total source words written so far
    nw_trg = 0          # total target words written so far
    last_score = None   # score of the last line read (for final report)
    print('Processing {}'.format(tsv))
    with gzip.open(tsv, 'rt', encoding=encoding) as fin, \
         open(bitext + '.' + src_lang, 'wt', encoding=encoding) as fsrc, \
         open(bitext + '.' + trg_lang, 'wt', encoding=encoding) as ftrg:
        while nl < nb_sents:
            line = fin.readline()
            if not line:
                break
            fields = line.split('\t')
            last_score = float(fields[0])
            # Input is sorted by score: stop at the first below-threshold
            # line.  Check the score BEFORE indexing fields[1]/fields[2]
            # so a short trailing line cannot raise IndexError.
            if last_score < threshold:
                break
            cur_src = len(fields[1].split())
            cur_trg = len(fields[2].split())
            if nw_src + cur_src > nb_words_src:
                break
            if nw_trg + cur_trg > nb_words_trg:
                break
            fsrc.write(fields[1].strip() + '\n')
            ftrg.write(fields[2].strip() + '\n')
            nw_src += cur_src
            nw_trg += cur_trg
            nl += 1
            if nl % 100000 == 0:
                print('\r - {:d} lines read'.format(nl), end='')
    print('\r - wrote {:d} lines'.format(nl))
    print(' - with {:d} source and {:d} target words'.format(nw_src, nw_trg))
    # Guard against an empty input file: the original crashed with a
    # NameError here because `fields` was never assigned.
    if last_score is not None:
        print(' - last threshold is {:.4f}'.format(last_score))
    return nl, nw_src, nw_trg


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Tool to extract bitext from the WikiMatrix')
    parser.add_argument('--encoding', default='utf-8',
                        help='character encoding for input/output')
    parser.add_argument('--tsv', type=str, required=True,
                        help='File with mined bitexts')
    parser.add_argument('--bitext', type=str, required=True,
                        help='Text file after sentence splitting')
    parser.add_argument('--src-lang', type=str, required=True,
                        help='Source language')
    parser.add_argument('--trg-lang', type=str, required=True,
                        help='Target language')
    parser.add_argument('--threshold', type=float, default=1.05,
                        help='Threshold on margin score')
    parser.add_argument('--nb-sents', type=int, default=999999999,
                        help='Maximal number of sentences')
    parser.add_argument('--nb-words-src', type=int, default=999999999,
                        help='Maximal number of total words in the source language')
    parser.add_argument('--nb-words-trg', type=int, default=999999999,
                        help='Maximal number of total words in the target language')
    args = parser.parse_args()

    print('Tool to extract bitext from the WikiMatrix')
    extract_bitext(args.tsv, args.bitext, args.src_lang, args.trg_lang,
                   threshold=args.threshold, nb_sents=args.nb_sents,
                   nb_words_src=args.nb_words_src,
                   nb_words_trg=args.nb_words_trg,
                   encoding=args.encoding)