import os

import evaluate
import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer)

# Force CPU training
os.environ["CUDA_VISIBLE_DEVICES"] = ""

model_name = "cointegrated/rubert-tiny2"

# Login using e.g. `huggingface-cli login` to access this dataset
splits = {'train': 'train.json', 'test': 'test.json'}
df = pd.read_json("hf://datasets/Den4ikAI/gibberish_dataset/" + splits["train"])
df = df.head(500)  # use a small subset to keep CPU training fast

# Convert the dataframe into train/test Datasets
train, test = train_test_split(df, test_size=0.2)
train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test)

# Preprocess the text
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=400)

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_train = train.map(tokenize_function, batched=True)
tokenized_test = test.map(tokenize_function, batched=True)

# Load the pretrained model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=4)
model.to("cpu")

# Set the training parameters
training_args = TrainingArguments(
    output_dir='test_trainer_log',
    eval_strategy='epoch',
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=5,
    report_to='none'
)

# Micro-averaged F1 as the evaluation metric
metric = evaluate.load('f1')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(
        predictions=predictions,
        references=labels,
        average='micro'
    )

# Run training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics)
trainer.train()

# Save the model
save_directory = './pt_save_pretrained'
# tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)
# Alternatively, save via the trainer:
# trainer.save_model('CustomModels/CustomHamSpam')
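
# --- Inference sketch (not part of the original script) ---
# A minimal example of reloading the saved weights and classifying a sample
# string on CPU. The sample text and the idea that higher class ids mean
# "more gibberish" are assumptions; check the dataset card for the actual
# label scheme. The tokenizer is reused from above, since only the model
# weights were saved to `save_directory`.
import torch

loaded_model = AutoModelForSequenceClassification.from_pretrained(save_directory)
loaded_model.eval()

sample = "asdkjh qwelkj zxcmn"  # hypothetical gibberish input
inputs = tokenizer(sample, return_tensors="pt", truncation=True)
with torch.no_grad():
    logits = loaded_model(**inputs).logits
predicted_class = logits.argmax(dim=-1).item()
print(f"Predicted class id: {predicted_class}")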