Spaces:

StarPigeon
/

ViDove

Sleeping

ViDove / src /translators /translation.py

Eason Lu

adapt different languages for srt.py

1a902ed about 2 years ago

4.67 kB

	from os import getenv
	import logging
	from time import sleep
	from tqdm import tqdm
	from src.srt_util.srt import split_script
	from .LLM_task import LLM_task

	def get_translation(srt, model, video_name, prompt, chunk_size = 1000):
	    """
	    Split the source text of ``srt`` into chunks and translate them.

	    The source-only script is obtained from ``srt.get_source_only()`` and cut up by
	    ``split_script`` using ``chunk_size``; the resulting chunks and their ranges are
	    handed to ``translate`` with ``prompt`` as the task. Nothing is returned.

	    :param srt: Subtitle object providing ``get_source_only()``; it is passed on to ``translate``.
	    :param model: Model name forwarded to ``translate``.
	    :param video_name: Video name forwarded to ``translate``.
	    :param prompt: Prompt forwarded to ``translate`` as ``task``.
	    :param chunk_size: Chunk size forwarded to ``split_script``.
	    """
	    source_text = srt.get_source_only()
	    chunks, chunk_ranges = split_script(source_text, chunk_size)
	    translate(srt, chunks, chunk_ranges, model, video_name, task=prompt)

	def check_translation(sentence, translation):
	    """
	    Detect the merged-sentence problem in a translated chunk.

	    Sentences inside a chunk are separated by a blank line (``"\\n\\n"``), so the
	    source and the translation hold the same number of sentences exactly when they
	    contain the same number of separators.

	    :param sentence: Source chunk.
	    :param translation: Translated chunk.
	    :return: True when both texts contain the same number of sentences, False otherwise.
	    """
	    separator = '\n\n'
	    return sentence.count(separator) == translation.count(separator)

	# TODO(david): prompt selector
	def prompt_selector(src_lang, tgt_lang, domain):
	    """
	    Build the translation prompt for a language pair and a domain.

	    :param src_lang: Source language code; must be ``"EN"`` or ``"ZH"``.
	    :param tgt_lang: Target language code; must be ``"EN"`` or ``"ZH"``.
	    :param domain: Domain of the video, inserted into the prompt text as-is.
	    :return: The filled-in prompt string.
	    :raises KeyError: If either language code is not in the language table.
	    """
	    full_names = dict(EN="English", ZH="Chinese")
	    # Source is looked up first, so an unknown source code is reported before the target.
	    src_lang, tgt_lang = full_names[src_lang], full_names[tgt_lang]
	    # The template is runtime text: its wording and whitespace are kept verbatim.
	    return f"""
	you are a translation assistant, your job is to translate a video in domain of {domain} from {src_lang} to {tgt_lang},
	you will be provided with a segement in {src_lang} parsed by line, where your translation text should keep the original
	meaning and the number of lines.
	"""

	def translate(srt, script_arr, range_arr, model_name, video_name=None, attempts_count=5, task=None, temp=0.15):
	    """
	    Translate every script chunk with the LLM and write the results into the SRT object.

	    ``script_arr`` and ``range_arr`` are walked in lockstep. Each chunk is sent to
	    ``LLM_task``; when the translation does not contain the same number of sentences as
	    the source (see ``check_translation``), the request is repeated up to
	    ``attempts_count`` times. If the mismatch persists, every sentence of the chunk is
	    translated on its own and the pieces are joined with blank lines. Any exception
	    raised while translating a chunk is reported, and the chunk is retried after a
	    30 second pause until it succeeds.

	    :param srt: Subtitle object; receives each result through ``set_translation``.
	    :param script_arr: A list of strings representing the original script chunks to be translated.
	    :param range_arr: A list of (start, end) pairs giving the sentence range of each chunk.
	    :param model_name: The name of the translation model to be used.
	    :param video_name: The name of the video.
	    :param attempts_count: Number of retries allowed per chunk when sentence counts do not match.
	    :param task: Prompt; a default Chinese translation prompt is used when None.
	    :param temp: Model temperature.
	    :raises ValueError: If ``script_arr`` or ``range_arr`` is None.
	    """
	    # The original guard compared the builtin ``input`` to None and could never fire.
	    if script_arr is None or range_arr is None:
	        raise ValueError("Warning! No Input have passed to LLM!")
	    if task is None:
	        task = "你是一个翻译助理，你的任务是翻译视频，你会被提供一个按行分割的英文段落，你需要在保证句意和行数的情况下输出翻译后的文本。"
	    logging.info(f"translation prompt: {task}")

	    for sentence, range_ in tqdm(zip(script_arr, range_arr)):
	        # Normalise to a (start, end) tuple before reporting and storing it.
	        range_ = (range_[0], range_[1])
	        print(f"now translating sentences {range_}")
	        logging.info(f"now translating sentences {range_}")

	        # Best-effort by design: keep retrying the chunk until the LLM call goes through.
	        while True:
	            try:
	                translation = _translate_chunk(model_name, sentence, task, temp, attempts_count, range_)
	                break
	            except Exception as e:
	                logging.warning("An error has occurred during translation: %s", e)
	                print("An error has occurred during translation:", e)
	                print("Retrying... the script will continue after 30 seconds.")
	                sleep(30)

	        srt.set_translation(translation, range_, model_name, video_name)


	def _translate_chunk(model_name, sentence, task, temp, attempts_count, range_):
	    """Translate one chunk, retrying and finally splitting it while sentence counts mismatch."""
	    translation = LLM_task(model_name, sentence, task, temp)

	    # The retry budget is local to this chunk, so one bad chunk cannot use up the
	    # retries of the chunks that follow it.
	    retries_left = attempts_count
	    while not check_translation(sentence, translation):
	        if retries_left <= 0:
	            # Still mismatched after all retries: translate sentence by sentence so
	            # the output has exactly one translation per source sentence.
	            logging.info("merge sentence issue found for range %s", range_)
	            pieces = [
	                LLM_task(model_name, single_sentence, task, temp)
	                for single_sentence in sentence.split("\n\n")
	            ]
	            logging.info("solved by individually translation!")
	            return "\n\n".join(pieces)
	        translation = LLM_task(model_name, sentence, task, temp)
	        retries_left -= 1

	    return translation