{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "machine_shape": "hm", "gpuType": "A100" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "code", "source": [ "!pip install datasets" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "y4FB2eje4ClO", "outputId": "d75dfce9-32fc-4825-c91a-01a99ee338af" }, "execution_count": 1, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting datasets\n", " Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from datasets) (3.17.0)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.11/dist-packages (from datasets) (1.26.4)\n", "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.11/dist-packages (from datasets) (18.1.0)\n", "Collecting dill<0.3.9,>=0.3.0 (from datasets)\n", " Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (from datasets) (2.2.2)\n", "Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.11/dist-packages (from datasets) (2.32.3)\n", "Requirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.11/dist-packages (from datasets) (4.67.1)\n", "Collecting xxhash (from datasets)\n", " Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n", "Collecting multiprocess<0.70.17 (from datasets)\n", " Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)\n", "Requirement already satisfied: fsspec<=2024.12.0,>=2023.1.0 in /usr/local/lib/python3.11/dist-packages (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets) (2024.10.0)\n", "Requirement already satisfied: aiohttp in /usr/local/lib/python3.11/dist-packages (from 
datasets) (3.11.13)\n", "Requirement already satisfied: huggingface-hub>=0.24.0 in /usr/local/lib/python3.11/dist-packages (from datasets) (0.28.1)\n", "Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from datasets) (24.2)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.11/dist-packages (from datasets) (6.0.2)\n", "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (2.4.6)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (1.3.2)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (25.1.0)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (1.5.0)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (6.1.0)\n", "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (0.3.0)\n", "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (1.18.3)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.24.0->datasets) (4.12.2)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) (3.4.1)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) (3.10)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) (2.3.0)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) 
(2025.1.31)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets) (2025.1)\n", "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets) (2025.1)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.17.0)\n", "Downloading datasets-3.3.2-py3-none-any.whl (485 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m485.4/485.4 kB\u001b[0m \u001b[31m33.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m10.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.5/143.5 kB\u001b[0m \u001b[31m14.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.8/194.8 kB\u001b[0m \u001b[31m18.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: xxhash, dill, multiprocess, datasets\n", "Successfully installed datasets-3.3.2 dill-0.3.8 multiprocess-0.70.16 xxhash-3.5.0\n" ] } ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "XGlb6yNNwmSR", "outputId": "69c617f8-a7a7-4c60-9597-74fa8b1a1b8d" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Dataset({\n", " features: 
['translation'],\n", " num_rows: 2507\n", "})\n" ] } ], "source": [
 "from datasets import load_dataset\n",
 "\n",
 "# IITB English-Hindi parallel corpus; only the test split is kept.\n",
 "dataset = load_dataset(\"cfilt/iitb-english-hindi\")[\"test\"]\n",
 "print(dataset)\n"
 ] }, { "cell_type": "code", "source": [
 "from transformers import MarianMTModel, MarianTokenizer\n",
 "\n",
 "# English -> Hindi checkpoint; use \"Helsinki-NLP/opus-mt-hi-en\" for Hindi -> English.\n",
 "model_name = \"Helsinki-NLP/opus-mt-en-hi\"\n",
 "model = MarianMTModel.from_pretrained(model_name)\n",
 "tokenizer = MarianTokenizer.from_pretrained(model_name)"
 ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "NQJD04Y_4i99", "outputId": "fc6ae6da-e727-4375-c32e-2ccc401e4b15" }, "execution_count": 17, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.11/dist-packages/transformers/models/marian/tokenization_marian.py:175: UserWarning: Recommended: pip install sacremoses.\n", " warnings.warn(\"Recommended: pip install sacremoses.\")\n" ] } ] }, { "cell_type": "code", "source": [
 "# Split every translation record into parallel English / Hindi lists.\n",
 "pairs = dataset['translation']\n",
 "en = [pair[\"en\"] for pair in pairs]\n",
 "hi = [pair[\"hi\"] for pair in pairs]"
 ], "metadata": { "id": "zZsvE-9P43DB" }, "execution_count": 18, "outputs": [] }, { "cell_type": "code", "source": [
 "en[0], hi[0]"
 ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "VncVkICMt9uX", "outputId": "10dacfb6-6653-40cc-db92-f8221608617a" }, "execution_count": 19, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "('A black box in your car?', 'आपकी कार में ब्लैक बॉक्स?')" ] }, "metadata": {}, "execution_count": 19 } ] }, { "cell_type": "code", "source": [
 "import pandas as pd\n",
 "\n",
 "# Build the frame and drop row 0 in one expression (no in-place mutation),\n",
 "# so re-running this cell always yields the same frame.\n",
 "# NOTE(review): the first sentence pair is discarded; the reason is not\n",
 "# evident from this notebook - confirm it is intentional.\n",
 "df = pd.DataFrame({'en': en, 'hi': hi}).drop(index=0)\n",
 "df.head()\n"
 ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "iFCfdlvHuCpj", "outputId": "40fcdf49-cbbe-4989-ca98-c53b7b19fe7c" }, "execution_count": 20, "outputs": [ { "output_type": 
"execute_result", "data": { "text/plain": [ " en \\\n", "1 As America's road planners struggle to find th... \n", "2 The devices, which track every mile a motorist... \n", "3 The usually dull arena of highway planning has... \n", "4 Libertarians have joined environmental groups ... \n", "5 The tea party is aghast. \n", "\n", " hi \n", "1 जबकि अमेरिका के सड़क योजनाकार, ध्वस्त होते हुए... \n", "2 यह डिवाइस, जो मोटर-चालक द्वारा वाहन चलाए गए प्... \n", "3 आम तौर पर हाईवे नियोजन जैसा उबाऊ काम भी अचानक ... \n", "4 आपने द्वारा ड्राइव किए गए मील, तथा संभवतः ड्रा... \n", "5 चाय पार्टी भौचक्की है। " ], "text/html": [ "\n", "
\n", " | en | \n", "hi | \n", "
---|---|---|
1 | \n", "As America's road planners struggle to find th... | \n", "जबकि अमेरिका के सड़क योजनाकार, ध्वस्त होते हुए... | \n", "
2 | \n", "The devices, which track every mile a motorist... | \n", "यह डिवाइस, जो मोटर-चालक द्वारा वाहन चलाए गए प्... | \n", "
3 | \n", "The usually dull arena of highway planning has... | \n", "आम तौर पर हाईवे नियोजन जैसा उबाऊ काम भी अचानक ... | \n", "
4 | \n", "Libertarians have joined environmental groups ... | \n", "आपने द्वारा ड्राइव किए गए मील, तथा संभवतः ड्रा... | \n", "
5 | \n", "The tea party is aghast. | \n", "चाय पार्टी भौचक्की है। | \n", "
/content/wandb/run-20250302_193401-s8hefz7l
"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"Epoch | \n", "Training Loss | \n", "Validation Loss | \n", "
---|---|---|
1 | \n", "No log | \n", "1.876339 | \n", "
" ] }, "metadata": {} }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.11/dist-packages/transformers/modeling_utils.py:2758: UserWarning: Moving the following attributes in the config to the generation config: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]]}. You are seeing this warning because you've set generation parameters in the model config, as opposed to in the generation config.\n", " warnings.warn(\n", "There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "TrainOutput(global_step=251, training_loss=2.212838830225971, metrics={'train_runtime': 45.16, 'train_samples_per_second': 44.376, 'train_steps_per_second': 5.558, 'total_flos': 67932323315712.0, 'train_loss': 2.212838830225971, 'epoch': 1.0})" ] }, "metadata": {}, "execution_count": 29 } ] }, { "cell_type": "code", "source": [
 "def translate(text, src_lang=\"en\", tgt_lang=\"hi\"):\n",
 "    \"\"\"Translate `text` using the notebook-level `model` and `tokenizer`.\n",
 "\n",
 "    Args:\n",
 "        text: Source sentence to translate.\n",
 "        src_lang: Not used by this function; kept so existing calls still work.\n",
 "        tgt_lang: Not used by this function; kept so existing calls still work.\n",
 "\n",
 "    Returns:\n",
 "        The first generated sequence decoded to a string, special tokens removed.\n",
 "    \"\"\"\n",
 "    # truncation=True clips over-long input to the tokenizer's maximum length\n",
 "    # instead of passing the model more positions than it supports.\n",
 "    inputs = tokenizer(text, return_tensors=\"pt\", truncation=True)\n",
 "    # Move every input tensor to whichever device the model is on.\n",
 "    inputs = inputs.to(model.device)\n",
 "    translated_tokens = model.generate(**inputs, max_length=50)\n",
 "    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)\n",
 "\n",
 "print(translate(\"good man\"))"
 ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "OqV0Fiud09tt", "outputId": "fcd79a5e-7dbe-4138-b7c8-226318c4ac7c" }, "execution_count": 30, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "अच्छा आदमी\n" ] } ] }, { "cell_type": "code", "source": [ "save_directory = \"translate_model\"\n", "\n", "model.save_pretrained(save_directory)\n", "\n", 
"tokenizer.save_pretrained(save_directory)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5pDS9v4lBynb", "outputId": "652888cf-660d-40cd-fe5a-a53d27880b0a" }, "execution_count": 33, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "('translate_model/tokenizer_config.json',\n", " 'translate_model/special_tokens_map.json',\n", " 'translate_model/vocab.json',\n", " 'translate_model/source.spm',\n", " 'translate_model/target.spm',\n", " 'translate_model/added_tokens.json')" ] }, "metadata": {}, "execution_count": 33 } ] }, { "cell_type": "code", "source": [
 "import os\n",
 "import shutil\n",
 "\n",
 "# Archive the directory that save_pretrained() wrote to, resolved against the\n",
 "# current working directory, instead of a hard-coded Colab path.\n",
 "folder_path = os.path.abspath(save_directory)\n",
 "zip_name = \"translate_model_finetune.zip\"\n",
 "\n",
 "# make_archive appends the '.zip' extension itself, so pass the bare stem.\n",
 "shutil.make_archive(os.path.splitext(zip_name)[0], 'zip', folder_path)\n",
 "\n",
 "print(f\"Folder '{folder_path}' has been zipped as '{zip_name}'.\")"
 ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9SuQLZ8KMnLl", "outputId": "27e6b557-ee86-425a-a47c-7d0f8ca812ab" }, "execution_count": 35, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Folder '/content/translate_model' has been zipped as 'translate_model_finetune.zip'.\n" ] } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "qP6EM7VnWXaM" }, "execution_count": null, "outputs": [] } ] }