Spaces:
Paused
Paused
#!/bin/python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
# --------------------------------------------------------
#
# Tool to extract subset of mined bitexts in a tsv.gz file
import os
import sys
import gzip
import argparse


def extract_bitext(tsv, bitext, src_lang, trg_lang,
                   threshold=1.05, nb_sents=999999999,
                   nb_words_src=999999999, nb_words_trg=999999999,
                   encoding='utf-8'):
    """Extract a subset of mined bitexts from a gzipped TSV file.

    Each input line has the form ``score<TAB>source<TAB>target`` and the
    file is assumed to be sorted by decreasing margin score.  Extraction
    stops at the first line whose score falls below ``threshold``, or as
    soon as the sentence or word budgets would be exceeded.

    Args:
        tsv: path to the gzipped TSV file with mined bitexts.
        bitext: output prefix; ``<bitext>.<src_lang>`` and
            ``<bitext>.<trg_lang>`` are (over)written.
        src_lang: source-language suffix for the output file.
        trg_lang: target-language suffix for the output file.
        threshold: minimal margin score for a pair to be kept.
        nb_sents: maximal number of sentence pairs to extract.
        nb_words_src: maximal total number of source-side words.
        nb_words_trg: maximal total number of target-side words.
        encoding: character encoding for input and output files.

    Returns:
        Tuple ``(nb_lines, nb_words_src, nb_words_trg)`` actually written.
    """
    nl = 0              # sentence pairs written so far
    nw_src = 0          # total source words written so far
    nw_trg = 0          # total target words written so far
    last_score = None   # score of the last line read (for final report)
    print('Processing {}'.format(tsv))
    with gzip.open(tsv, 'rt', encoding=encoding) as fin, \
         open(bitext + '.' + src_lang, 'wt', encoding=encoding) as fsrc, \
         open(bitext + '.' + trg_lang, 'wt', encoding=encoding) as ftrg:
        while nl < nb_sents:
            line = fin.readline()
            if not line:
                break
            fields = line.split('\t')
            last_score = float(fields[0])
            # Input is sorted by score: stop at the first below-threshold
            # line.  Check the score BEFORE indexing fields[1]/fields[2]
            # so a short trailing line cannot raise IndexError.
            if last_score < threshold:
                break
            cur_src = len(fields[1].split())
            cur_trg = len(fields[2].split())
            if nw_src + cur_src > nb_words_src:
                break
            if nw_trg + cur_trg > nb_words_trg:
                break
            fsrc.write(fields[1].strip() + '\n')
            ftrg.write(fields[2].strip() + '\n')
            nw_src += cur_src
            nw_trg += cur_trg
            nl += 1
            if nl % 100000 == 0:
                print('\r - {:d} lines read'.format(nl), end='')
    print('\r - wrote {:d} lines'.format(nl))
    print(' - with {:d} source and {:d} target words'.format(nw_src, nw_trg))
    # Guard against an empty input file: the original crashed with a
    # NameError here because `fields` was never assigned.
    if last_score is not None:
        print(' - last threshold is {:.4f}'.format(last_score))
    return nl, nw_src, nw_trg


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Tool to extract bitext from the WikiMatrix')
    parser.add_argument('--encoding', default='utf-8',
                        help='character encoding for input/output')
    parser.add_argument('--tsv', type=str, required=True,
                        help='File with mined bitexts')
    parser.add_argument('--bitext', type=str, required=True,
                        help='Text file after sentence splitting')
    parser.add_argument('--src-lang', type=str, required=True,
                        help='Source language')
    parser.add_argument('--trg-lang', type=str, required=True,
                        help='Target language')
    parser.add_argument('--threshold', type=float, default=1.05,
                        help='Threshold on margin score')
    parser.add_argument('--nb-sents', type=int, default=999999999,
                        help='Maximal number of sentences')
    parser.add_argument('--nb-words-src', type=int, default=999999999,
                        help='Maximal number of total words in the source language')
    parser.add_argument('--nb-words-trg', type=int, default=999999999,
                        help='Maximal number of total words in the target language')
    args = parser.parse_args()

    print('Tool to extract bitext from the WikiMatrix')
    extract_bitext(args.tsv, args.bitext, args.src_lang, args.trg_lang,
                   threshold=args.threshold, nb_sents=args.nb_sents,
                   nb_words_src=args.nb_words_src,
                   nb_words_trg=args.nb_words_trg,
                   encoding=args.encoding)