{ "cells": [ { "cell_type": "code", "execution_count": 24, "id": "27b0322e-d6a8-4202-9f78-8d2754ebdd97", "metadata": {}, "outputs": [], "source": [ "#!pip list | grep hugging" ] }, { "cell_type": "code", "execution_count": 40, "id": "da82a90f-7098-4d0c-9fe8-3e0cfc39671d", "metadata": {}, "outputs": [], "source": [ "#!pip install transformers datasets" ] }, { "cell_type": "code", "execution_count": 12, "id": "829575c2-c292-4455-8cc6-48764e64c4b0", "metadata": {}, "outputs": [], "source": [ "#!pip install torch" ] }, { "cell_type": "code", "execution_count": 1, "id": "ba0ced0b-35cd-40fd-934f-1013d4a1364d", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/milindchawre/.pyenv/versions/3.12.2/envs/hugging-face/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "import transformers\n", "import datasets" ] }, { "cell_type": "code", "execution_count": 2, "id": "d196c435-fa5a-4c3b-bec4-0181aa00e8bb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'4.44.0'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transformers.__version__" ] }, { "cell_type": "code", "execution_count": 3, "id": "55ddfdaa-3a22-4eab-ad36-24355cbb7fee", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'2.21.0'" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "datasets.__version__" ] }, { "cell_type": "code", "execution_count": 4, "id": "13b8669b-5cc5-40b8-bd22-a7ff44fa43f3", "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset" ] }, { "cell_type": "code", "execution_count": 5, "id": "a722a796-84b7-4c45-a104-3d863d52cbb5", "metadata": {}, "outputs": [], "source": [ "reviews = load_dataset('rotten_tomatoes')" ] }, { 
"cell_type": "code", "execution_count": 6, "id": "607c352e-70c8-4697-b9a1-a4c68e55d502", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "datasets.dataset_dict.DatasetDict" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(reviews)" ] }, { "cell_type": "code", "execution_count": 7, "id": "e0637ff3-90b9-41cf-bdd0-ba3dbe185225", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['text', 'label'],\n", " num_rows: 8530\n", " })\n", " validation: Dataset({\n", " features: ['text', 'label'],\n", " num_rows: 1066\n", " })\n", " test: Dataset({\n", " features: ['text', 'label'],\n", " num_rows: 1066\n", " })\n", "})\n" ] } ], "source": [ "print(reviews)" ] }, { "cell_type": "code", "execution_count": 8, "id": "79f60106-7628-4605-920f-6bb8375e6cb5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textlabel
0the rock is destined to be the 21st century's ...1
1the gorgeously elaborate continuation of \" the...1
2effective but too-tepid biopic1
3if you sometimes like to go to the movies to h...1
4emerges as something rare , an issue movie tha...1
.........
8525any enjoyment will be hinge from a personal th...0
8526if legendary shlockmeister ed wood had ever ma...0
8527hardly a nuanced portrait of a young woman's b...0
8528interminably bleak , to say nothing of boring .0
8529things really get weird , though not particula...0
\n", "

8530 rows × 2 columns

\n", "
" ], "text/plain": [ " text label\n", "0 the rock is destined to be the 21st century's ... 1\n", "1 the gorgeously elaborate continuation of \" the... 1\n", "2 effective but too-tepid biopic 1\n", "3 if you sometimes like to go to the movies to h... 1\n", "4 emerges as something rare , an issue movie tha... 1\n", "... ... ...\n", "8525 any enjoyment will be hinge from a personal th... 0\n", "8526 if legendary shlockmeister ed wood had ever ma... 0\n", "8527 hardly a nuanced portrait of a young woman's b... 0\n", "8528 interminably bleak , to say nothing of boring . 0\n", "8529 things really get weird , though not particula... 0\n", "\n", "[8530 rows x 2 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "reviews['train'].to_pandas()" ] }, { "cell_type": "code", "execution_count": 9, "id": "8971be5a-f5ba-4cb9-be88-7fa509b201ef", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "reviews['train'].to_pandas()['text'][0]" ] }, { "cell_type": "code", "execution_count": 10, "id": "74db724b-063c-4c91-9681-37231a5a09fd", "metadata": {}, "outputs": [], "source": [ "from transformers import pipeline\n", "import torch" ] }, { "cell_type": "code", "execution_count": 11, "id": "4a22ba6c-2db8-4dff-9abd-037f1a08fc7a", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).\n", "Using a pipeline without specifying a model name and revision in production is not 
recommended.\n", "/Users/milindchawre/.pyenv/versions/3.12.2/envs/hugging-face/lib/python3.12/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n", " warnings.warn(\n" ] } ], "source": [
 "# Pin the checkpoint and revision that the pipeline previously picked by default,\n",
 "# so that every re-run loads exactly the same weights.\n",
 "MODEL_NAME = \"distilbert/distilbert-base-uncased-finetuned-sst-2-english\"\n",
 "MODEL_REVISION = \"af0f99b\"\n",
 "\n",
 "# Prefer an accelerator when one is present, otherwise fall back to CPU;\n",
 "# a hard-coded device=0 fails on machines that have no accelerator.\n",
 "if torch.cuda.is_available():\n",
 "    inference_device = 0\n",
 "elif torch.backends.mps.is_available():\n",
 "    inference_device = \"mps\"\n",
 "else:\n",
 "    inference_device = -1\n",
 "\n",
 "classifier = pipeline(\n",
 "    \"sentiment-analysis\",\n",
 "    model=MODEL_NAME,\n",
 "    revision=MODEL_REVISION,\n",
 "    device=inference_device,\n",
 ")"
 ] },
 { "cell_type": "code", "execution_count": 12, "id": "2d64c34d-c2d8-4cda-9c04-91e5497304a9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'label': 'POSITIVE', 'score': 0.9998668432235718}]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classifier(\"This was great movie!\")" ] },
 { "cell_type": "code", "execution_count": 13, "id": "fc11b406-3770-44ea-9654-4ec2e2e1081b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'label': 'POSITIVE', 'score': 0.9998465776443481}]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classifier(\"This was great ok!\")" ] },
 { "cell_type": "code", "execution_count": 14, "id": "682a57fc-d2f6-4ad5-b05c-ad4a311216fa", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'label': 'NEGATIVE', 'score': 0.9997976422309875}]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classifier(\"This was not that good movie!\")" ] },
 { "cell_type": "code", "execution_count": 15, "id": "3e790b3e-cb65-46a8-9bcb-74503742fcb6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'label': 'NEGATIVE', 'score': 0.9997455477714539}]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classifier(\"This was worst movie!\")" ] },
 { "cell_type": "code", "execution_count": 18, "id": "56f1f502-f683-4c3a-924d-9a16e5d6c55c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'label': 'NEGATIVE', 'score': 0.9991676807403564}]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classifier(\"In the movie, the acting was fine, but the story was bad, while the costume was good but the daigloues are boring!\")" ] },
 { "cell_type": "code", "execution_count": 19, "id": "a8755bd0-5c4b-4da7-84d5-fb1986188d4e", "metadata": {}, "outputs": [], "source": [
 "def score(review_text):\n",
 "    \"\"\"Return the predicted sentiment label for a single review.\n",
 "\n",
 "    Args:\n",
 "        review_text: Text of one review, passed to the module-level `classifier`.\n",
 "\n",
 "    Returns:\n",
 "        The 'label' field of the classifier's first result for the text\n",
 "        (the example cells above show 'POSITIVE' and 'NEGATIVE').\n",
 "    \"\"\"\n",
 "    # Truncate to the model's maximum input length so that an unusually long\n",
 "    # review cannot make the forward pass fail; shorter inputs are unaffected.\n",
 "    return classifier(review_text, truncation=True)[0]['label']"
 ] },
 { "cell_type": "code", "execution_count": 20, "id": "1de830b3-bb2d-42ce-90b6-b2868017d499", "metadata": {}, "outputs": [], "source": [ "test_df = reviews['test'].to_pandas()" ] },
 { "cell_type": "code", "execution_count": 22, "id": "0b0004ea-562c-493b-8a0c-125dd20a185f", "metadata": {}, "outputs": [], "source": [ "test_df['model_prediction'] = test_df['text'].apply(score)" ] },
 { "cell_type": "code", "execution_count": 24, "id": "21715d4f-51f2-4118-b5d8-23b5d432d3b6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textlabelmodel_prediction
0lovingly photographed in the manner of a golde...1POSITIVE
1consistently clever and suspenseful .1POSITIVE
2it's like a \" big chill \" reunion of the baade...1NEGATIVE
3the story gives ample opportunity for large-sc...1POSITIVE
4red dragon \" never cuts corners .1POSITIVE
............
1061a terrible movie that some people will neverth...0NEGATIVE
1062there are many definitions of 'time waster' bu...0NEGATIVE
1063as it stands , crocodile hunter has the hurrie...0NEGATIVE
1064the thing looks like a made-for-home-video qui...0NEGATIVE
1065enigma is well-made , but it's just too dry an...0NEGATIVE
\n", "

1066 rows × 3 columns

\n", "
" ], "text/plain": [ " text label \\\n", "0 lovingly photographed in the manner of a golde... 1 \n", "1 consistently clever and suspenseful . 1 \n", "2 it's like a \" big chill \" reunion of the baade... 1 \n", "3 the story gives ample opportunity for large-sc... 1 \n", "4 red dragon \" never cuts corners . 1 \n", "... ... ... \n", "1061 a terrible movie that some people will neverth... 0 \n", "1062 there are many definitions of 'time waster' bu... 0 \n", "1063 as it stands , crocodile hunter has the hurrie... 0 \n", "1064 the thing looks like a made-for-home-video qui... 0 \n", "1065 enigma is well-made , but it's just too dry an... 0 \n", "\n", " model_prediction \n", "0 POSITIVE \n", "1 POSITIVE \n", "2 NEGATIVE \n", "3 POSITIVE \n", "4 POSITIVE \n", "... ... \n", "1061 NEGATIVE \n", "1062 NEGATIVE \n", "1063 NEGATIVE \n", "1064 NEGATIVE \n", "1065 NEGATIVE \n", "\n", "[1066 rows x 3 columns]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_df" ] }, { "cell_type": "code", "execution_count": null, "id": "3c916443-92ad-4dc6-a5bf-10aad52d8b4c", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 5 }