import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
import streamlit as st

# Load and preprocess the IMDb dataset.
# Note: the original aclImdb_v1.tar.gz archive from ai.stanford.edu is a
# tarball of raw text files, not a CSV, so pd.read_csv cannot parse it
# directly. This assumes the data has already been converted to a CSV with
# 'review' and 'sentiment' columns (e.g. the widely used Kaggle export of
# the same dataset); adjust the path to match your local copy.
df = pd.read_csv('IMDB Dataset.csv')
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})
df = df[['review', 'label']]

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)


class SentimentDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Reference columns by name rather than position, so the label is the
        # integer 0/1 'label' column, not the raw 'sentiment' string.
        review = str(self.data.iloc[index]['review'])
        label = int(self.data.iloc[index]['label'])
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',  # pad_to_max_length is deprecated
            truncation=True,       # reviews longer than max_len must be truncated
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)

        correct_predictions += torch.sum(torch.argmax(outputs.logits, dim=1) == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)


def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)

            correct_predictions += torch.sum(torch.argmax(outputs.logits, dim=1) == labels)
            losses.append(loss.item())
    return correct_predictions.double() / n_examples, np.mean(losses)


def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
    ds = SentimentDataset(dataframe=df, tokenizer=tokenizer, max_len=max_len)
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, num_workers=4)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Load data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Create data loaders; shuffle the training set so each epoch sees a new order.
BATCH_SIZE = 16
MAX_LEN = 128
train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE, shuffle=True)
test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)

EPOCHS = 2
# transformers' AdamW is deprecated in favor of torch.optim.AdamW. Note that
# torch's version always applies bias correction; the correct_bias=False flag
# was specific to the old transformers implementation.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
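# Optional sanity check (not part of the original pipeline): pull one batch
# and confirm tensor shapes before committing to a full training run.
sample_batch = next(iter(train_data_loader))
print(sample_batch['input_ids'].shape)       # expected: torch.Size([16, 128])
print(sample_batch['attention_mask'].shape)  # expected: torch.Size([16, 128])
print(sample_batch['labels'].shape)          # expected: torch.Size([16])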
loss_fn = torch.nn.CrossEntropyLoss().to(device)
model = model.to(device)

# Training loop
for epoch in range(EPOCHS):
    train_acc, train_loss = train_epoch(
        model, train_data_loader, loss_fn, optimizer, device, scheduler, len(train_df)
    )
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print(f'Train loss {train_loss} accuracy {train_acc}')
    val_acc, val_loss = eval_model(
        model, test_data_loader, loss_fn, device, len(test_df)
    )
    print(f'Val loss {val_loss} accuracy {val_acc}')

# Save the model
model.save_pretrained('bert-sentiment-model')
tokenizer.save_pretrained('bert-sentiment-model')

# Streamlit app
model = BertForSequenceClassification.from_pretrained('bert-sentiment-model')
tokenizer = BertTokenizer.from_pretrained('bert-sentiment-model')
model = model.to(device)
model.eval()


def predict_sentiment(text):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    # Move inputs to the same device as the model before inference.
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
    predicted_class = torch.argmax(probabilities, dim=1).item()
    return 'positive' if predicted_class == 1 else 'negative'


st.title("Sentiment Analysis with BERT")
user_input = st.text_area("Enter a movie review:")
if st.button("Analyze"):
    # Guard against empty input before running the model.
    if user_input.strip():
        sentiment = predict_sentiment(user_input)
        st.write(f'The sentiment of the review is: **{sentiment}**')
    else:
        st.write("Please enter a review first.")
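# To launch the app (assuming this script is saved as app.py):
#   streamlit run app.py
#
# Caveat: Streamlit re-executes the whole script on every interaction, so as
# written the training loop above would re-run on each rerun. In practice,
# split training into its own script and have the app only load the saved
# 'bert-sentiment-model' (optionally cached with @st.cache_resource).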