# neuralworm: speed up search (commit 2a65456)
import json
import logging
import re
from datetime import datetime

import inflect
from deep_translator import GoogleTranslator

from gematria import calculate_gematria

logger = logging.getLogger(__name__)
def process_json_files(start, end, step, rounds="1", length=0, tlang="en", strip_spaces=True,
                       strip_in_braces=True, strip_diacritics=True, average_compile=False, translate_results=False):
    """Processes JSON files to extract and process ELS text results.

    Args:
        start (int): The starting book number.
        end (int): The ending book number.
        step (int): The step/jump width.
        rounds (str, optional): Comma-separated round counts (e.g. "1,-1"). Defaults to "1".
        length (int, optional): The maximum result length. Defaults to 0 (no limit).
        tlang (str, optional): The target language for translation. Defaults to "en".
        strip_spaces (bool, optional): Whether to strip spaces. Defaults to True.
        strip_in_braces (bool, optional): Whether to strip text in square brackets. Defaults to True.
        strip_diacritics (bool, optional): Whether to keep only Hebrew letters and spaces. Defaults to True.
        average_compile (bool, optional): Whether to average-combine results. Defaults to False.
        translate_results (bool, optional): Whether to translate the results. Defaults to False.

    Returns:
        list: A list of result dicts; per-file failures are reported as
            {"error": ...} entries instead of aborting the whole run.
    """
    base_path = "texts"
    translator = GoogleTranslator(source='auto', target=tlang)
    results = []
    for book_num in range(start, end + 1):
        file_name = f"{base_path}/{book_num:02}.json"
        try:
            with open(file_name, 'r', encoding='utf-8') as file:
                data = json.load(file)
            text_blocks = data["text"]
            # join() is linear; the old repeated `+=` was quadratic in text size
            full_text = ''.join(' '.join(block) for block in text_blocks)
            clean_text = full_text
            if strip_in_braces:
                clean_text = re.sub(r"\[.*?\]", "", clean_text, flags=re.DOTALL)
            if strip_diacritics:
                # keep only the Hebrew letter range and spaces
                clean_text = re.sub(r"[^\u05D0-\u05EA ]+", "", clean_text)
            if strip_spaces:
                clean_text = clean_text.replace(" ", "")
            else:
                # collapse any run of spaces to one; the old chained replace()
                # calls only handled runs of up to ~8 spaces
                clean_text = re.sub(r" {2,}", " ", clean_text)
            text_length = len(clean_text)
            if text_length == 0:
                # guard: the modulo below would raise an uncaught ZeroDivisionError
                continue
            selected_characters_per_round = {}
            for round_num in map(int, rounds.split(',')):
                # Handle cases where no characters should be selected
                if not (round_num == 1 and step > text_length) and not (round_num == -1 and step > text_length):
                    # Starting position depends on direction (sign of round_num)
                    if round_num > 0:
                        current_position = step - 1
                    else:
                        current_position = text_length - 1 if step == 1 else text_length - step
                    completed_rounds = 0
                    selected_characters = ""
                    while completed_rounds < abs(round_num):
                        selected_characters += clean_text[current_position % text_length]
                        # Advance forwards or backwards based on the sign of round_num
                        current_position += step if round_num > 0 else -step
                        if (round_num > 0 and current_position >= text_length * (completed_rounds + 1)) or \
                           (round_num < 0 and current_position < 0):
                            completed_rounds += 1
                    selected_characters_per_round[round_num] = selected_characters
            if average_compile and len(selected_characters_per_round) > 1:
                result_text = ""
                keys = sorted(selected_characters_per_round.keys())
                # BUG FIX: this loop variable used to be `i`, shadowing the
                # outer book index and corrupting the "book" field below.
                # NOTE(review): average_gematria is defined elsewhere in the
                # project; only the last pair's result survives the loop --
                # confirm that is intentional.
                for pair_idx in range(len(keys) - 1):
                    result_text = average_gematria(selected_characters_per_round[keys[pair_idx]],
                                                   selected_characters_per_round[keys[pair_idx + 1]])
            else:
                result_text = ''.join(selected_characters_per_round.values())
            if length != 0:
                result_text = result_text[:length]
            # Translate only if requested and there is something to translate
            translated_text = translator.translate(result_text) if translate_results and result_text else ""
            if result_text:  # Only append non-empty results
                results.append({
                    "book": book_num,
                    "title": data["title"],
                    "els_result_text": result_text,
                    "els_result_gematria": calculate_gematria(result_text),
                    "translated_text": translated_text
                })
        except FileNotFoundError:
            results.append({"error": f"File {file_name} not found."})
        except json.JSONDecodeError as e:
            results.append({"error": f"File {file_name} could not be read as JSON: {e}"})
        except KeyError as e:
            results.append({"error": f"Expected key 'text' is missing in {file_name}: {e}"})
    return results
# Convert a day-of-month number to its English ordinal word (no hyphens).
def number_to_ordinal_word(number):
    """Return the ordinal word for a number from 1 to 31, or "" otherwise."""
    ordinal_words = (
        "first", "second", "third", "fourth", "fifth",
        "sixth", "seventh", "eighth", "ninth", "tenth",
        "eleventh", "twelfth", "thirteenth", "fourteenth",
        "fifteenth", "sixteenth", "seventeenth", "eighteenth",
        "nineteenth", "twentieth", "twentyfirst", "twentysecond",
        "twentythird", "twentyfourth", "twentyfifth",
        "twentysixth", "twentyseventh", "twentyeighth",
        "twentyninth", "thirtieth", "thirtyfirst",
    )
    # Out-of-range numbers fall back to an empty string, same as dict.get.
    return dict(enumerate(ordinal_words, start=1)).get(number, "")
# Translation table built once at import time. str.translate does one
# C-level pass; the old implementation ran ~190 chained str.replace calls
# per invocation (and had duplicate and no-op entries). Safe because every
# key is a single non-ASCII character and every value is ASCII, so one
# replacement can never feed another.
_NORMALIZE_TABLE = str.maketrans({
    'ü': 'ue', 'ö': 'oe', 'ä': 'ae', 'ß': 'ss', 'Ü': 'Ue', 'Ö': 'Oe', 'Ä': 'Ae',
    'á': 'a', 'à': 'a', 'â': 'a', 'ã': 'a', 'å': 'aa', 'ā': 'a', 'ă': 'a', 'ą': 'a',
    'Á': 'A', 'À': 'A', 'Â': 'A', 'Ã': 'A', 'Å': 'Aa', 'Ā': 'A', 'Ă': 'A', 'Ą': 'A',
    'é': 'e', 'è': 'e', 'ê': 'e', 'ë': 'e', 'ē': 'e', 'ĕ': 'e', 'ė': 'e', 'ę': 'e', 'ě': 'e',
    'É': 'E', 'È': 'E', 'Ê': 'E', 'Ë': 'E', 'Ē': 'E', 'Ĕ': 'E', 'Ė': 'E', 'Ę': 'E', 'Ě': 'E',
    'í': 'i', 'ì': 'i', 'î': 'i', 'ï': 'i', 'ī': 'i', 'ĭ': 'i', 'į': 'i', 'ı': 'i',
    'Í': 'I', 'Ì': 'I', 'Î': 'I', 'Ï': 'I', 'Ī': 'I', 'Ĭ': 'I', 'Į': 'I', 'İ': 'I',
    'ó': 'o', 'ò': 'o', 'ô': 'o', 'õ': 'o', 'ø': 'oe', 'ō': 'o', 'ŏ': 'o', 'ő': 'o',
    'Ó': 'O', 'Ò': 'O', 'Ô': 'O', 'Õ': 'O', 'Ø': 'Oe', 'Ō': 'O', 'Ŏ': 'O', 'Ő': 'O',
    'ú': 'u', 'ù': 'u', 'û': 'u', 'ū': 'u', 'ŭ': 'u', 'ů': 'u', 'ű': 'u', 'ų': 'u',
    'Ú': 'U', 'Ù': 'U', 'Û': 'U', 'Ū': 'U', 'Ŭ': 'U', 'Ů': 'U', 'Ű': 'U', 'Ų': 'U',
    'ç': 'c', 'ć': 'c', 'ĉ': 'c', 'ċ': 'c', 'č': 'c',
    'Ç': 'C', 'Ć': 'C', 'Ĉ': 'C', 'Ċ': 'C', 'Č': 'C',
    'ñ': 'n', 'ń': 'n', 'ņ': 'n', 'ň': 'n', 'ŋ': 'n',
    'Ñ': 'N', 'Ń': 'N', 'Ņ': 'N', 'Ň': 'N', 'Ŋ': 'N',
    'ý': 'y', 'ÿ': 'y', 'ŷ': 'y',
    'Ý': 'Y', 'Ÿ': 'Y', 'Ŷ': 'Y',
    'ž': 'zh', 'ź': 'z', 'ż': 'z',
    'Ž': 'Zh', 'Ź': 'Z', 'Ż': 'Z',
    'ð': 'd', 'Ð': 'D', 'þ': 'th', 'Þ': 'Th', 'ł': 'l', 'Ł': 'L', 'đ': 'd', 'Đ': 'D',
    'æ': 'ae', 'Æ': 'Ae', 'œ': 'oe', 'Œ': 'Oe',
    'ś': 's', 'ŝ': 's', 'ş': 's', 'š': 's',
    'Ś': 'S', 'Ŝ': 'S', 'Ş': 'S', 'Š': 'S',
    'ť': 't', 'ţ': 't', 'ŧ': 't', 'Ť': 'T', 'Ţ': 'T', 'Ŧ': 'T',
    'ŕ': 'r', 'ř': 'r', 'Ŕ': 'R', 'Ř': 'R',
    'ľ': 'l', 'ĺ': 'l', 'ļ': 'l', 'ŀ': 'l',
    'Ľ': 'L', 'Ĺ': 'L', 'Ļ': 'L', 'Ŀ': 'L',
    'ğ': 'g', 'Ğ': 'G',
    'ġ': 'g', 'Ġ': 'G',
    'ħ': 'h', 'Ħ': 'H',
    'ĵ': 'j', 'Ĵ': 'J',
    'ķ': 'k', 'Ķ': 'K',
})


def custom_normalize(text):
    """Transliterate accented/special Latin characters in *text* to plain ASCII.

    Args:
        text (str): Input string, possibly containing diacritics or ligatures.

    Returns:
        str: The string with each mapped character replaced by its ASCII
            equivalent; unmapped characters pass through unchanged.
    """
    return text.translate(_NORMALIZE_TABLE)
# Convert a numerical date to words with an ordinal day.
def date_to_words(date_string):
    """Convert an ISO date string ("YYYY-MM-DD") to English words.

    The day is rendered as an ordinal word, and years 1900-1999 are spoken
    colloquially as "nineteen hundred ..." rather than "one thousand ...".
    """
    engine = inflect.engine()
    parsed = datetime.strptime(date_string, "%Y-%m-%d")
    year = parsed.year
    if 1900 <= year <= 1999:
        # e.g. 1984 -> "nineteen hundred eighty-four"
        year_words = f"{engine.number_to_words(year // 100, andword='')} hundred"
        remainder = year % 100
        if remainder != 0:
            year_words = f"{year_words} {engine.number_to_words(remainder, andword='')}"
    else:
        year_words = engine.number_to_words(year, andword='')
    # inflect inserts commas in large numbers; strip them
    year_formatted = year_words.replace(',', '')
    month_name = parsed.strftime("%B")
    day_ordinal = number_to_ordinal_word(parsed.day)
    return f"{day_ordinal} {month_name} {year_formatted}"
def translate_date_to_words(date, lang='en'):
    """Converts a date to words in the specified language.

    Returns the sentinel string "No date selected" when *date* is None;
    otherwise the spoken form of the date, machine-translated to *lang*
    and normalized to plain ASCII.
    """
    if date is None:
        return "No date selected"
    iso_date = date.strftime("%Y-%m-%d")
    logger.info(f"Date string: {iso_date}")
    spoken = date_to_words(iso_date)
    logger.info(f"Date in words: {spoken}")
    translated = GoogleTranslator(source='auto', target=lang).translate(spoken)
    logger.info(f"Translated date words: {translated}")
    # Normalize the text if it contains any special characters
    translated = custom_normalize(translated)
    logger.info(f"Normalized date words: {translated}")
    return translated