# Hugging Face Space: Minecraft Language File Translator (Gradio app).
import gradio as gr
import json
import os
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import torch
import spaces
# --- Model Cache ---
# A global dictionary to cache loaded models and tokenizers to avoid reloading them.
model_cache = {}


def get_model_and_tokenizer(model_name):
    """
    Return a ``(model, tokenizer)`` pair for *model_name*, loading on first use.

    Results are memoized in the module-level ``model_cache`` so repeated
    requests for the same checkpoint skip the slow download/initialization.

    Args:
        model_name (str): The name of the model to load.

    Returns:
        tuple: A tuple containing the loaded model and tokenizer.
    """
    # Guard clause: serve from the cache when we already loaded this checkpoint.
    cached_pair = model_cache.get(model_name)
    if cached_pair is not None:
        print(f"Using cached model: {model_name}")
        return cached_pair

    print(f"Loading model: {model_name}. This may take a moment...")
    translator = M2M100ForConditionalGeneration.from_pretrained(model_name, device_map="auto")
    text_tokenizer = M2M100Tokenizer.from_pretrained(model_name)
    model_cache[model_name] = (translator, text_tokenizer)
    print("Model loaded successfully.")
    return translator, text_tokenizer
# A list of languages supported by the M2M100 model.
# These are the language codes accepted as translation targets by the
# tokenizer (passed to tokenizer.get_lang_id in translate_minecraft_json).
supported_languages = [
    "af", "am", "ar", "ast", "az", "ba", "be", "bg", "bn", "br", "bs", "ca", "ceb",
    "cs", "cy", "da", "de", "el", "en", "es", "et", "fa", "ff", "fi", "fr", "fy",
    "ga", "gd", "gl", "gu", "ha", "he", "hi", "hr", "ht", "hu", "hy", "id", "ig",
    "ilo", "is", "it", "ja", "jv", "ka", "kk", "km", "kn", "ko", "lb", "lg", "ln",
    "lo", "lt", "lv", "mg", "mk", "ml", "mn", "mr", "ms", "my", "ne", "nl", "no",
    "ns", "oc", "or", "pa", "pl", "ps", "pt", "ro", "ru", "sd", "si", "sk", "sl",
    "so", "sq", "sr", "ss", "su", "sv", "sw", "ta", "th", "tl", "tn", "tr", "uk",
    "ur", "uz", "vi", "wo", "xh", "yi", "yo", "zh", "zu"
]
@spaces.GPU
def translate_minecraft_json(json_file, target_lang, model_name, progress=gr.Progress()):
    """
    Translates the string values of a Minecraft language JSON file.

    Keys are preserved; only non-empty string values are translated from
    English to *target_lang*. Non-string or blank values are copied through
    unchanged.

    Args:
        json_file (gradio.File): The uploaded en_us.json file.
        target_lang (str): The target language code (e.g., "fr" for French).
        model_name (str): The name of the model to use for translation.
        progress (gradio.Progress): Gradio progress tracker (injected by Gradio;
            the default instance is the documented Gradio pattern).

    Returns:
        str | None: The file path of the translated JSON file, or None when
        no file was uploaded.

    Raises:
        gr.Error: If the file is not valid JSON, its top level is not an
        object, or translation fails for any other reason.
    """
    if json_file is None:
        return None
    try:
        # Get the selected model and tokenizer
        model, tokenizer = get_model_and_tokenizer(model_name)

        # Read the uploaded JSON file
        with open(json_file.name, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Minecraft language files are flat {key: string} objects. Validate
        # explicitly so a list/scalar top level yields a clear message instead
        # of an opaque AttributeError from data.items() below.
        if not isinstance(data, dict):
            raise gr.Error("The uploaded file must contain a JSON object (key/value pairs).")

        # Set the source language for the tokenizer
        tokenizer.src_lang = "en"

        translated_data = {}
        total_items = len(data)
        # Use the gradio progress tracker
        progress(0, desc=f"Starting translation to '{target_lang}' with {model_name}...")
        for i, (key, value) in enumerate(data.items()):
            if isinstance(value, str) and value.strip():  # Also check if string is not empty
                translated_data[key] = _translate_text(model, tokenizer, value, target_lang)
            else:
                # If the value is not a string or is empty, keep it as is
                translated_data[key] = value
            # Update progress
            progress((i + 1) / total_items, desc=f"Translated {i + 1}/{total_items} items...")

        # Create a new file for the translated data (written to the working
        # directory; Gradio serves it back via the output File component).
        output_filename = f"{target_lang}_{model_name.split('/')[-1]}.json"
        with open(output_filename, 'w', encoding='utf-8') as f:
            json.dump(translated_data, f, ensure_ascii=False, indent=4)
        return output_filename
    except gr.Error:
        # Already a user-facing error with a clear message; don't re-wrap it.
        raise
    except Exception as e:
        print(f"An error occurred: {e}")
        # Raise a Gradio error to show it in the UI
        raise gr.Error(f"An error occurred during translation: {e}")


def _translate_text(model, tokenizer, text, target_lang):
    """Translate a single string with the given M2M100 model and tokenizer."""
    # Tokenize the text and move it to the model's device.
    encoded_text = tokenizer(text, return_tensors="pt").to(model.device)
    # Generate translation; force the first generated token to the target
    # language id, as required by M2M100.
    with torch.no_grad():
        generated_tokens = model.generate(
            **encoded_text,
            forced_bos_token_id=tokenizer.get_lang_id(target_lang)
        )
    # Decode the translated text
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
# Create the Gradio interface.
# Layout (top to bottom): model selector row, upload + target-language row,
# translate button, downloadable result file.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Minecraft Language File Translator
        Upload your `en_us.json` file, select the model and target language, then click "Translate".
        """
    )
    with gr.Row():
        # Choice between the two M2M100 checkpoint sizes; 418M is the default.
        model_selector = gr.Radio(
            choices=["facebook/m2m100_418M", "facebook/m2m100_1.2B"],
            label="Select Model",
            value="facebook/m2m100_418M"
        )
    with gr.Row():
        json_upload = gr.File(label="Upload en_us.json")
        # Target language codes come from the supported_languages list above.
        language_dropdown = gr.Dropdown(
            choices=supported_languages, label="Select Target Language", value="fr"
        )
    translate_button = gr.Button("Translate")
    translated_file = gr.File(label="Download Translated File")
    # Wire the button to the translation function: three inputs map to the
    # first three parameters of translate_minecraft_json; the returned path
    # populates the download component.
    translate_button.click(
        translate_minecraft_json,
        inputs=[json_upload, language_dropdown, model_selector],
        outputs=translated_file
    )

if __name__ == "__main__":
    # Launch the Gradio app
    demo.launch()
|