Spaces:

chaowenguo
/

agwefgw

Running

App Files Files Community

agwefgw / services /captioning /caption_helper.py

chaowenguo

Upload 129 files

5690e11 verified 3 months ago

raw

history blame contribute delete

8.4 kB

	#
	# Copyright (c) Microsoft. All rights reserved.
	# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
	#

	from datetime import time
	from typing import List, Optional, Tuple

	from services.alinls.speech_process import AliRecognitionResult
	import azure.cognitiveservices.speech as speechsdk

	from services.audio.faster_whisper_recognition_service import FasterWhisperRecognitionResult
	from services.audio.sensevoice_whisper_recognition_service import SenseVoiceRecognitionResult
	from services.audio.tencent_recognition_service import TencentRecognitionResult
	from services.captioning import helper


	class Caption:
	    """One caption entry: its text, its position in the sequence and its time span.

	    The constructor stores every argument unchanged on the attribute of the
	    same name (``language``, ``sequence``, ``begin``, ``end``, ``text``).
	    """

	    def __init__(self, language: Optional[str], sequence: int, begin: time, end: time, text: str):
	        # Which language the caption is in (may be None) and its ordinal number.
	        self.language, self.sequence = language, sequence
	        # Start and end of the time span the caption covers.
	        self.begin, self.end = begin, end
	        # The caption text itself.
	        self.text = text


	def get_captions(language: Optional[str], max_width: int, max_height: int, results: List[object]) -> List[Caption]:
	    """Build captions for a list of recognition results.

	    Convenience wrapper that constructs a ``CaptionHelper`` with the given
	    arguments and returns the result of its ``get_captions()`` method.

	    Args:
	        language: Language tag forwarded to ``CaptionHelper`` (may be None).
	        max_width: Maximum number of characters per caption line.
	        max_height: Maximum number of lines per caption.
	        results: Recognition result objects. They are accessed by attribute
	            (e.g. ``result.text``) inside ``CaptionHelper``, so they are
	            objects rather than dicts.

	    Returns:
	        The list of ``Caption`` objects produced by the helper.
	    """
	    return CaptionHelper(language, max_width, max_height, results).get_captions()


	class CaptionHelper(object):
	    """Turn recognition results into ``Caption`` objects.

	    The text of each final result is wrapped into lines of at most
	    ``max_width`` characters, preferring to break right after a terminator
	    character, and up to ``max_height`` lines are grouped into one caption.
	    Captions are built lazily on the first call to ``get_captions``.
	    """

	    def __init__(self, language: Optional[str], max_width: int, max_height: int,
	                 results: List[object]):
	        """Store the configuration and pick the terminator sets.

	        Args:
	            language: Language tag or None. When the part before the first
	                ``'-'`` equals ``"zh"`` (case-insensitive), CJK punctuation
	                is used for line breaking and a ``max_width`` equal to
	                ``helper.DEFAULT_MAX_LINE_LENGTH_SBCS`` is replaced by
	                ``helper.DEFAULT_MAX_LINE_LENGTH_MBCS``.
	            max_width: Maximum number of characters per caption line.
	            max_height: Maximum number of lines per caption.
	            results: Recognition result objects to convert.
	        """
	        self._language = language
	        self._max_width = max_width
	        self._max_height = max_height
	        self._results = results

	        # Break points tried first, and the fallback set tried when none of
	        # the first-pass terminators occurs inside the line window.
	        self._first_pass_terminators = ["?", "!", ",", ";"]
	        self._second_pass_terminators = [" ", "."]

	        self._captions: List[Caption] = []

	        # consider adapting to use http://unicode.org/reports/tr29/#Sentence_Boundaries
	        if self._language is not None:
	            iso639 = self._language.split('-')[0]
	            if "zh" == iso639.lower():
	                self._first_pass_terminators = ["，", "、", "；", "？", "！", "?", "!", ",", ";"]
	                self._second_pass_terminators = ["。", " "]
	                if helper.DEFAULT_MAX_LINE_LENGTH_SBCS == self._max_width:
	                    self._max_width = helper.DEFAULT_MAX_LINE_LENGTH_MBCS

	    def get_captions(self) -> List[Caption]:
	        """Return the captions, building them first if none exist yet."""
	        self.ensure_captions()
	        return self._captions

	    def ensure_captions(self) -> None:
	        """Build the captions unless the caption list is already non-empty."""
	        if not self._captions:
	            self.add_captions_for_all_results()

	    def add_captions_for_all_results(self) -> None:
	        """Append captions for every usable result in ``self._results``.

	        A result is skipped when it exposes an ``offset`` attribute that is
	        ``<= 0``, when ``is_final_result`` rejects it, or when it has no text.
	        """
	        for result in self._results:
	            if (hasattr(result, 'offset') and result.offset <= 0) or not self.is_final_result(result):
	                continue
	            text = self.get_text_or_translation(result)
	            if not text:
	                continue
	            self.add_captions_for_final_result(result, text)

	    def get_text_or_translation(self, result: object) -> Optional[str]:
	        """Return the ``text`` attribute of ``result``."""
	        return result.text

	    def add_captions_for_final_result(self, result: object, text: str) -> None:
	        """Split ``text`` into captions and append them to ``self._captions``.

	        Lines are collected until ``max_height`` lines are pending or the text
	        is exhausted, then joined with newlines into one caption. A caption
	        that covers the whole text gets the result's full timing; otherwise
	        the timing is interpolated from character positions.

	        Args:
	            result: The recognition result the text belongs to.
	            text: The text to split.
	        """
	        caption_starts_at = 0
	        caption_lines: List[str] = []
	        index = 0
	        while index < len(text):
	            index = self.skip_skippable(text, index)
	            if index < len(text):
	                line_length = self.get_best_width(text, index)
	                caption_lines.append(text[index:index + line_length].strip())
	                index += line_length

	            if not caption_lines:
	                # Only spaces were left after the previous caption: there is
	                # nothing to show, so do not emit an empty caption.
	                break

	            is_last_caption = index >= len(text)
	            if not is_last_caption and len(caption_lines) < self._max_height:
	                # Keep collecting lines for the current caption.
	                continue

	            caption_text = '\n'.join(caption_lines)
	            caption_lines.clear()

	            caption_sequence = len(self._captions) + 1
	            is_first_caption = 0 == caption_starts_at

	            if is_first_caption and is_last_caption:
	                begin, end = self.get_full_caption_result_timing(result)
	            else:
	                begin, end = self.get_partial_result_caption_timing(
	                    result, text, caption_text, caption_starts_at, index - caption_starts_at)

	            self._captions.append(Caption(self._language, caption_sequence, begin, end, caption_text))

	            caption_starts_at = index

	    def get_best_width(self, text: str, start_index: int) -> int:
	        """Return how many characters the next line should take from ``text``.

	        If the remainder fits within ``max_width`` it is taken whole.
	        Otherwise the line ends right after the last first-pass terminator
	        inside the window, then the last second-pass terminator, and finally
	        falls back to a hard cut at ``max_width`` characters.

	        Args:
	            text: The full text being split.
	            start_index: Index in ``text`` where the line starts.

	        Returns:
	            The line length in characters.
	        """
	        remaining = len(text) - start_index
	        # "<=": a remainder of exactly max_width characters still fits on one
	        # line (the hard-cut fallback below produces lines of that length too).
	        if remaining <= self._max_width:
	            best_width = remaining
	        else:
	            best_width = self.find_best_width(self._first_pass_terminators, text, start_index)
	        if best_width < 0:
	            best_width = self.find_best_width(self._second_pass_terminators, text, start_index)
	        if best_width < 0:
	            # Always advance by at least one character so that callers looping
	            # over the text terminate even with a non-positive max_width.
	            best_width = max(self._max_width, 1)
	        return best_width

	    def find_best_width(self, terminators: List[str], text: str, start_at: int) -> int:
	        """Find the widest line that ends right after one of ``terminators``.

	        Only the window of at most ``max_width`` characters starting at
	        ``start_at`` is searched.

	        Args:
	            terminators: Candidate strings after which a line may end.
	            text: The full text being split.
	            start_at: Index in ``text`` where the line starts.

	        Returns:
	            The line width including the terminator, or -1 if no terminator
	            occurs inside the window.
	        """
	        remaining = len(text) - start_at
	        window_end = start_at + min(remaining, self._max_width)
	        best_width = -1
	        for terminator in terminators:
	            index = text.rfind(terminator, start_at, window_end)
	            if index < 0:
	                continue
	            # Compare like with like: both widths include the terminator, so a
	            # terminator directly following the current best one still wins.
	            width = index - start_at + len(terminator)
	            if width > best_width:
	                best_width = width
	        return best_width

	    def skip_skippable(self, text: str, start_index: int) -> int:
	        """Return the index of the first non-space character at or after ``start_index``.

	        Returns ``len(text)`` when only spaces remain.
	        """
	        index = start_index
	        while len(text) > index and ' ' == text[index]:
	            index += 1
	        return index

	    def get_full_caption_result_timing(self, result: object) -> Tuple[time, time]:
	        """Return the (begin, end) times covering the whole ``result``.

	        The conversion depends on the result type: Azure results pass
	        ``offset`` / ``offset + duration`` to ``helper.time_from_ticks``, Ali
	        and Tencent results pass ``begin_time`` / ``end_time`` to
	        ``helper.time_from_milliseconds``, and Faster-Whisper and SenseVoice
	        results pass them to ``helper.time_from_seconds``.

	        Raises:
	            TypeError: If ``result`` is none of the supported result types.
	        """
	        if isinstance(result, speechsdk.RecognitionResult):
	            begin = helper.time_from_ticks(result.offset)
	            end = helper.time_from_ticks(result.offset + result.duration)
	            return begin, end
	        if isinstance(result, (AliRecognitionResult, TencentRecognitionResult)):
	            begin = helper.time_from_milliseconds(result.begin_time)
	            end = helper.time_from_milliseconds(result.end_time)
	            return begin, end
	        if isinstance(result, (FasterWhisperRecognitionResult, SenseVoiceRecognitionResult)):
	            begin = helper.time_from_seconds(result.begin_time)
	            end = helper.time_from_seconds(result.end_time)
	            return begin, end
	        # Previously this fell through and returned None, which only failed
	        # later with an opaque unpacking/subscript TypeError in the caller.
	        raise TypeError(f"Unsupported recognition result type: {type(result).__name__}")

	    def get_partial_result_caption_timing(self, result: object, text: str, caption_text: str,
	                                          caption_starts_at: int, caption_length: int) -> Tuple[time, time]:
	        """Interpolate the timing of a caption covering part of ``text``.

	        The result's duration is distributed over ``text`` in proportion to
	        character positions.

	        Args:
	            result: The recognition result the caption belongs to.
	            text: The full text of the result; must be non-empty because its
	                length is used as a divisor.
	            caption_text: The caption's text (not used by the computation).
	            caption_starts_at: Index in ``text`` where the caption starts.
	            caption_length: Number of characters of ``text`` the caption covers.

	        Returns:
	            The (begin, end) times of the caption.
	        """
	        (result_begin, result_end) = self.get_full_caption_result_timing(result)
	        # NOTE(review): presumably a timedelta, given that it is scaled and then
	        # passed to helper.add_time_and_timedelta - confirm against helper.
	        result_duration = helper.subtract_times(result_end, result_begin)
	        text_length = len(text)
	        partial_begin = helper.add_time_and_timedelta(result_begin, result_duration * caption_starts_at / text_length)
	        partial_end = helper.add_time_and_timedelta(result_begin, result_duration * (
	            caption_starts_at + caption_length) / text_length)
	        return partial_begin, partial_end

	    def is_final_result(self, result: object) -> bool:
	        """Tell whether ``result`` should be turned into captions.

	        Azure results qualify when their reason is RecognizedSpeech,
	        RecognizedIntent or TranslatedSpeech. Ali, Tencent, Faster-Whisper and
	        SenseVoice results always qualify. Anything else does not.
	        """
	        if isinstance(result, speechsdk.RecognitionResult):
	            return result.reason in (
	                speechsdk.ResultReason.RecognizedSpeech,
	                speechsdk.ResultReason.RecognizedIntent,
	                speechsdk.ResultReason.TranslatedSpeech,
	            )
	        # Explicit bool instead of an implicit None for unknown types.
	        return isinstance(result, (AliRecognitionResult, TencentRecognitionResult,
	                                   FasterWhisperRecognitionResult, SenseVoiceRecognitionResult))

	    def lines_from_text(self, text: str) -> List[str]:
	        """Split ``text`` into stripped lines of at most ``max_width`` characters.

	        Leading spaces before each line are skipped, and trailing spaces at
	        the end of ``text`` do not produce an empty line.
	        """
	        retval: List[str] = []
	        index = 0
	        while index < len(text):
	            index = self.skip_skippable(text, index)
	            if index >= len(text):
	                # Nothing but spaces was left.
	                break
	            line_length = self.get_best_width(text, index)
	            retval.append(text[index:index + line_length].strip())
	            index += line_length
	        return retval