book-of-souls-2-word-search

Runtime error

File size: 14,637 Bytes

import json
import logging
import math
from datetime import datetime, timedelta

import gradio as gr
import pandas as pd
from deep_translator import GoogleTranslator
from gradio_calendar import Calendar

from gematria import calculate_gematria, strip_diacritics
from utils import (
    date_to_words,
    translate_date_to_words,
    process_json_files
)

# --- Constants ---
# Default file for load_forbidden_names(); it is read line by line.
FORBIDDEN_NAMES_FILE = "c.txt"
# Language preselected in the language dropdowns. When the date language equals
# this value, the date words are used without translation.
DEFAULT_LANGUAGE = 'english'

# Module-level logger; the root logger is configured at INFO level.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


# --- Helper Functions ---

def create_language_dropdown(label: str, default_value: str = DEFAULT_LANGUAGE, show_label: bool = True) -> gr.Dropdown:
    """Build a language-selection dropdown from GoogleTranslator's language list.

    Args:
        label (str): Caption displayed for the dropdown.
        default_value (str, optional): Language preselected in the dropdown.
            Defaults to DEFAULT_LANGUAGE.
        show_label (bool, optional): Whether the caption is rendered. Defaults to True.

    Returns:
        gr.Dropdown: Dropdown whose choices are the keys of the supported-languages mapping.
    """
    supported = GoogleTranslator().get_supported_languages(as_dict=True)
    # Iterating a dict yields its keys, i.e. the entries offered as choices.
    language_names = list(supported)
    dropdown_options = {
        "choices": language_names,
        "label": label,
        "value": default_value,
        "show_label": show_label,
    }
    return gr.Dropdown(**dropdown_options)


def calculate_gematria_sum(text: str, date_words: str) -> int:
    """Return the Gematria value of ``text`` and ``date_words`` joined by one space.

    The joined phrase is passed through ``strip_diacritics`` before
    ``calculate_gematria`` is applied; both the phrase and the sum are logged.
    """
    phrase = f"{text} {date_words}"
    logger.info(f"Combined input for Gematria: {phrase}")
    normalized_phrase = strip_diacritics(phrase)
    total = calculate_gematria(normalized_phrase)
    logger.info(f"Gematria sum: {total}")
    return total


def perform_els_search(start: int, end: int, step: int, rounds: int, length: int, tlang: str,
                       strip_spaces: bool, strip_in_braces: bool, strip_diacritics: bool, average_combine: bool,
                       search_word_yiddish: str, date_words: str) -> list:  # Accept Yiddish word
    """Run the ELS search and reshape every hit into a result row.

    The search itself is delegated to ``process_json_files`` (called with
    ``translate_results=False``). Filtering by ``search_word_yiddish`` is
    currently switched off: the word is only logged and every returned result
    is kept.

    Note that the ``strip_diacritics`` flag shadows the imported function of
    the same name inside this function.

    Returns:
        list: One dict per result with the keys 'Date', 'Book Result' and
        'Result'; empty when ``step`` or ``rounds`` equals 0.
    """
    logger.info("Starting ELS search...")
    logger.debug(f"Search word (Yiddish): {search_word_yiddish}")  # Not translated here

    # A zero step or zero rounds cannot produce a search; bail out early.
    if step == 0 or rounds == 0:
        logger.info("Cannot search with step 0 or rounds 0")
        return []

    raw_results = process_json_files(start, end, step, rounds, length, tlang, strip_spaces,
                                     strip_in_braces, strip_diacritics, average_combine,
                                     translate_results=False)

    rows = []
    for entry in raw_results:
        logger.debug(f"Searching result: {entry}")
        # No search-word filter is applied here; each entry becomes a row.
        row = {
            'Date': date_words,
            'Book Result': entry['els_result_text'],
            'Result': entry.get('translated_text', ''),
        }
        rows.append(row)
    return rows


def generate_json_dump(start: int, end: int, step: int, rounds: int, length: int, tlang: str,
                       strip_spaces: bool, strip_in_braces: bool, strip_diacritics_chk: bool,
                       search_phrase: str, results_df: pd.DataFrame, search_word: str,
                       start_date: datetime, end_date: datetime) -> str:
    """Serialize the search settings, the date range and the result rows to JSON.

    Returns:
        str: A 4-space-indented, non-ASCII-preserving JSON document with the
        top-level keys "Configuration", "DateRange" and "Results".
    """
    iso_day = "%Y-%m-%d"

    settings = dict([
        ("Start Book", start),
        ("End Book", end),
        ("Step", step),
        ("Rounds", rounds),
        ("Length", length),
        ("Target Language", tlang),
        ("Strip Spaces", strip_spaces),
        ("Strip Text in Braces", strip_in_braces),
        ("Strip Diacritics", strip_diacritics_chk),
        ("Search Phrase", search_phrase),
        ("Search Word", search_word),
    ])
    date_range = {
        "StartDate": start_date.strftime(iso_day),
        "EndDate": end_date.strftime(iso_day),
    }
    # Round-trip through pandas' JSON writer so the rows become plain Python records.
    records = json.loads(results_df.to_json(orient='records', force_ascii=False))

    payload = {"Configuration": settings, "DateRange": date_range, "Results": records}
    logger.info(f"Generated JSON dump: {payload}")
    return json.dumps(payload, indent=4, ensure_ascii=False)


def download_json_file(config_json: str, step: int, rounds: int,
                       strip_spaces: bool, strip_in_braces: bool, strip_diacritics_chk: bool) -> str:
    """Write ``config_json`` to a file named after the search settings.

    The name has the form ``step-<step>-rounds-<rounds><flags>.json`` where
    ``<flags>`` appends "-stSp", "-stBr" and "-stDc" for each enabled strip
    option, in that order. The path is relative, so the file is created in the
    current working directory.

    Returns:
        str: The path of the written file.
    """
    flag_tags = (
        ("-stSp", strip_spaces),
        ("-stBr", strip_in_braces),
        ("-stDc", strip_diacritics_chk),
    )
    filename_suffix = "".join(tag for tag, enabled in flag_tags if enabled)
    file_path = f"step-{step}-rounds-{rounds}{filename_suffix}.json"

    with open(file_path, "w", encoding='utf-8') as out:
        out.write(config_json)

    logger.info(f"Downloaded JSON file to: {file_path}")
    return file_path


# --- Forbidden Names Functions ---

def load_forbidden_names(filename: str = FORBIDDEN_NAMES_FILE) -> list:
    """Read the forbidden names, one per line, with surrounding whitespace removed.

    Returns:
        list: The stripped lines of ``filename``; an empty list (after printing
        an error message) when the file does not exist.
    """
    try:
        with open(filename, "r", encoding='utf-8') as handle:
            return list(map(str.strip, handle))
    except FileNotFoundError:
        print(f"Error: Forbidden names file '{filename}' not found.")
    return []


def check_name_similarity(name: str, forbidden_names: list, threshold: int = 80) -> bool:
    """Report whether ``name`` fuzzily matches any entry of ``forbidden_names``.

    The comparison is case-insensitive and uses ``fuzz.ratio``; a score of
    ``threshold`` or more counts as a match. The first matching entry is
    logged and ends the scan.
    """
    from fuzzywuzzy import fuzz

    first_hit = next(
        (
            candidate
            for candidate in forbidden_names
            if fuzz.ratio(name.lower(), candidate.lower()) >= threshold
        ),
        None,
    )
    if first_hit is None:
        return False

    logging.info(f"Forbidden word {first_hit} detected in: {name}")
    return True


# --- Gradio UI ---

# Build the UI. Components are created inside the Blocks context in the order
# they should appear; the event handlers further down refer to them by name.
with gr.Blocks() as app:
    # Row 1: date range, language of the date words, and the search word.
    with gr.Row():
        start_date = Calendar(type="datetime", label="1. Select Start Date")
        end_date = Calendar(type="datetime", label="2. Select End Date")
        date_language_input = create_language_dropdown("3. Date Word Language", default_value=DEFAULT_LANGUAGE)
        search_word = gr.Textbox(label="4. Search Word")

    # Row 2: free text whose Gematria value (together with the date words) drives the ELS step.
    with gr.Row():
        gematria_text = gr.Textbox(label="5. Name and/or Topic", value="Hans Albert Einstein")
        gematria_btn = gr.Button("6. Calculate Journal Sum")

    gematria_result = gr.Number(label="Journal Sum")
    # TODO: the displayed journal sum is wrong: the button handler receives the
    # TODO: value of the date-language dropdown (e.g. "english") as its date words.
    # TODO: This only affects the interface field(s), not the result searching.

    # Row 3: book range, step width and round selection.
    with gr.Row():
        start = gr.Number(label="Start Book", value=1)
        end = gr.Number(label="End Book", value=39)
        step = gr.Number(label="Jump Width (Steps) for ELS")
        rounds = gr.Number(label="Rounds through Books", value=1)
        # Hidden field holding the unrounded step so repeated halving/doubling does not accumulate rounding.
        float_step = gr.Number(visible=False, value=1)
        half_step_btn = gr.Button("Steps / 2")
        double_step_btn = gr.Button("Steps * 2")

        # round_x / round_y are combined into the "Combined Rounds" textbox below.
        with gr.Column():
            round_x = gr.Number(label="Round (x)", value=1)
            round_y = gr.Number(label="Round (y)", value=-1)

        average_combine_chk = gr.Checkbox(label="Average-Combine Combined Rounds", value=False)
        # NOTE(review): mirror_book_numbers is not passed to any handler in this file section.
        mirror_book_numbers = gr.Checkbox(label="Mirror book numbers for negative rounds (axis=book 20)", value=False)

        rounds_combination = gr.Textbox(label="Combined Rounds", value="1,-1")

    # Row 4: result length, translation target and text-stripping options.
    with gr.Row():
        length = gr.Number(label="Result Length (0=inf)", value=0)
        tlang = create_language_dropdown("Target Language for Translation", default_value=DEFAULT_LANGUAGE)
        strip_spaces = gr.Checkbox(label="Strip Spaces from Books", value=True)
        strip_in_braces = gr.Checkbox(label="Strip Text in Braces from Books", value=True)
        strip_diacritics_chk = gr.Checkbox(label="Strip Diacritics from Books", value=True)
        # NOTE(review): acknowledgment_chk is not passed to any handler in this file section.
        acknowledgment_chk = gr.Checkbox(
            label="The User hereby accepts that the User will not harm or stalk anyone with this information, or bet on any of this information, in any regards.",
            value=True
        )

    translate_btn = gr.Button("7. Search with ELS")

    # Output area: result table, JSON dump and the downloadable file.
    results_output = gr.Dataframe(headers=['Date', 'Book Result', 'Result'], label="Results")
    json_output = gr.Textbox(label="JSON Configuration Output")
    json_download_btn = gr.Button("Prepare .json for Download")
    json_file = gr.File(label="Download Config JSON", file_count="single")

    # --- Load Forbidden Names ---
    # Loaded once while the UI is built; calculate_journal_sum reads this list.

    forbidden_names = load_forbidden_names()


    # --- Event Handlers ---

    def update_rounds_combination(round_x: int, round_y: int) -> str:
        """Return ``"<x>,<y>"`` with both round numbers truncated to integers."""
        return ",".join(str(int(value)) for value in (round_x, round_y))


    def calculate_journal_sum(text: str, date_words: str) -> tuple:
        """Return the Gematria sum three times, or three zeros for forbidden input.

        ``text`` and then ``date_words`` are checked against ``forbidden_names``;
        if either one matches, ``(0, 0, 0)`` is returned. Otherwise the sum from
        ``calculate_gematria_sum`` is repeated three times, one value for each
        output the click handler feeds.
        """
        is_forbidden = any(
            check_name_similarity(candidate, forbidden_names)
            for candidate in (text, date_words)
        )
        if is_forbidden:
            return 0, 0, 0

        total = calculate_gematria_sum(text, date_words)
        return total, total, total


    def update_step_half(float_step: float) -> tuple:
        """Halve the step: return (half rounded up, exact half)."""
        halved = float_step / 2
        return math.ceil(halved), halved


    def update_step_double(float_step: float) -> tuple:
        """Double the step: return (double rounded up, exact double)."""
        doubled = float_step * 2
        return math.ceil(doubled), doubled


    # Keep the "Combined Rounds" textbox in sync: a change to either round field
    # re-runs update_rounds_combination with both current values.
    round_x.change(update_rounds_combination, inputs=[round_x, round_y], outputs=rounds_combination)
    round_y.change(update_rounds_combination, inputs=[round_x, round_y], outputs=rounds_combination)


    def handle_json_download(config_json: str, step: int, rounds: int, strip_spaces: bool,
                             strip_in_braces: bool, strip_diacritics_chk: bool) -> str:
        """Click handler that delegates to ``download_json_file`` and returns its file path."""
        written_path = download_json_file(
            config_json,
            step,
            rounds,
            strip_spaces,
            strip_in_braces,
            strip_diacritics_chk,
        )
        return written_path


    def perform_search_and_create_json(start_date: datetime, end_date: datetime, date_language_input: str,
                                       search_word: str, start: int, end: int, step: int, rounds: int, length: int,
                                       tlang: str, strip_spaces: bool, strip_in_braces: bool,
                                       strip_diacritics_chk: bool,
                                       gematria_text: str, average_combine: bool) -> tuple:
        """Run the ELS search once per day of the date range and build the JSON dump.

        For every day from ``start_date`` to ``end_date`` (inclusive, in one-day
        increments) the date is turned into words, the journal sum of
        ``gematria_text`` plus those words is used as the ELS step, and the first
        search result for that day is kept. The kept rows are deduplicated,
        translated from Yiddish to ``tlang`` and written into the JSON dump.

        Args:
            start_date: First day of the range.
            end_date: Last day of the range.
            date_language_input: Language for the date words; anything other
                than DEFAULT_LANGUAGE triggers ``translate_date_to_words``.
            search_word: Word translated to Yiddish and handed to the search.
            start, end: Book range forwarded to the search.
            step: Ignored. The step is recomputed for every day from the
                journal sum; the parameter is kept for interface compatibility.
            rounds, length, tlang, strip_spaces, strip_in_braces,
            strip_diacritics_chk, average_combine: Forwarded to the search.
            gematria_text: Text whose Gematria value (with the date words)
                determines the step.

        Returns:
            tuple: ``(json_dump, dataframe)`` when results exist, otherwise
            ``(message, None)``. The "Step" entry of the dump is the sum of
            all per-day steps.
        """
        # Comparing None dates below would raise a TypeError, so report the
        # missing input instead. NOTE(review): assumes the calendar component
        # can deliver None when no date is selected; confirm.
        if start_date is None or end_date is None:
            return "Please select both a start date and an end date.", None

        one_day = timedelta(days=1)
        total_steps = 0
        all_results = []

        # Translate the search word to Yiddish ONLY ONCE (outside the loop).
        # A blank word is passed on untranslated: some deep_translator versions
        # reject empty input, which would abort the whole search.
        search_word_yiddish = ""
        if search_word and search_word.strip():
            translator_yi = GoogleTranslator(source='auto', target='yi')
            search_word_yiddish = translator_yi.translate(search_word)

        seen_dates = set()  # Date words that were already searched
        current_date = start_date

        while current_date <= end_date:
            date_words_output = date_to_words(current_date.strftime("%Y-%m-%d"))

            # Only translate if the date language is not English
            if date_language_input.lower() != DEFAULT_LANGUAGE:
                date_words_output = translate_date_to_words(current_date, date_language_input)

            # Skip if these date words have already been processed
            if date_words_output in seen_dates:
                current_date += one_day
                continue
            seen_dates.add(date_words_output)

            # The journal sum of text + date words is the ELS step for this day.
            day_step, _, _ = calculate_journal_sum(gematria_text, date_words_output)
            total_steps += day_step

            filtered_results = perform_els_search(start, end, day_step, rounds, length, tlang, strip_spaces,
                                                  strip_in_braces, strip_diacritics_chk, average_combine,
                                                  search_word_yiddish,  # Pass the translated Yiddish word
                                                  date_words_output)

            # Only keep the first result for each date
            if filtered_results:
                all_results.append(filtered_results[0])

            current_date += one_day

        if not all_results:
            return "No results found.", None

        # Drop rows that repeat the same (date, book text) pair, keeping first occurrences.
        seen_rows = set()
        deduplicated_results = []
        for result in all_results:
            row_key = (result['Date'], result['Book Result'])
            if row_key not in seen_rows:
                seen_rows.add(row_key)
                deduplicated_results.append(result)
        df = pd.DataFrame(deduplicated_results)

        # Translate the 'Book Result' column to the target language
        translator = GoogleTranslator(source='yi', target=tlang)
        df['Result'] = df['Book Result'].apply(translator.translate)

        config_json = generate_json_dump(start, end, total_steps, rounds, length, tlang, strip_spaces,
                                         strip_in_braces, strip_diacritics_chk, gematria_text, df, search_word,
                                         start_date, end_date)
        return config_json, df


    # Journal-sum button: the single sum is written to the visible result, the
    # step field and the hidden float step. Note that the second input is the
    # date-language dropdown value, which the handler receives as its date words.
    gematria_btn.click(
        calculate_journal_sum,
        inputs=[gematria_text, date_language_input],
        outputs=[gematria_result, step, float_step]
    )

    # Halve the step; the hidden float_step keeps the unrounded value.
    half_step_btn.click(
        update_step_half,
        inputs=[float_step],
        outputs=[step, float_step]
    )

    # Double the step; the hidden float_step keeps the unrounded value.
    double_step_btn.click(
        update_step_double,
        inputs=[float_step],
        outputs=[step, float_step]
    )

    # Main search. The handler's `rounds` argument is fed from the
    # "Combined Rounds" textbox (rounds_combination), not from the rounds number field.
    translate_btn.click(
        perform_search_and_create_json,
        inputs=[start_date, end_date, date_language_input, search_word, start, end, step, rounds_combination, length,
                tlang, strip_spaces,
                strip_in_braces, strip_diacritics_chk, gematria_text, average_combine_chk],
        outputs=[json_output, results_output]
    )

    # Write the current JSON output to a file and offer it for download.
    # Here the `rounds` number field (not the combined-rounds text) goes into the file name.
    json_download_btn.click(
        handle_json_download,
        inputs=[json_output, step, rounds, strip_spaces, strip_in_braces, strip_diacritics_chk],
        outputs=[json_file]
    )

# Launch the app only when this file is executed as a script (not on import).
if __name__ == "__main__":
    app.launch(share=False)