In [None]:
!pip install transformers datasets accelerate peft bitsandbytes huggingface_hub

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2

In [None]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-c

# Initial setup

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training , TaskType
from datasets import load_dataset
from huggingface_hub import login
import torch

# STEP 1: Authenticate with the Hugging Face Hub.
# login() is called without arguments, so no token is stored in the notebook.
login()

# STEP 2: bitsandbytes settings for 4-bit quantized training
# (NF4 quantization, double quantization, float16 compute dtype).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# STEP 3: Load the tokenizer and the quantized base model.
base_model = "microsoft/phi-3-mini-128k-instruct"

tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Load the 4-bit weights and wrap the result for k-bit training in one step.
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        base_model,
        trust_remote_code=True,
        device_map="auto",
        quantization_config=bnb_config,
    )
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# LoRA adapter configuration.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    # Phi-3 uses fused projections: `qkv_proj`/`o_proj` in attention and
    # `gate_up_proj`/`down_proj` in the MLP. The names "gate_proj" and
    # "up_proj" do not exist in this architecture and were silently ignored,
    # so the MLP up/gate projection received no adapter (the earlier
    # 15,204,352 trainable parameters cover qkv_proj, o_proj and down_proj only).
    # NOTE: adapter checkpoints saved with the old target list do not contain
    # `gate_up_proj` weights; retrain instead of resuming from them.
    target_modules=["qkv_proj", "o_proj", "gate_up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# Wrap the quantized base model with the adapters and report how many
# parameters are actually trainable.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 15,204,352 || all params: 3,836,283,904 || trainable%: 0.3963


In [None]:


combined_file = "/content/drive/MyDrive/Fine tune Data Analyzer/plotbot_combined.jsonl"

from datasets import load_dataset, Dataset
import json

# Read the JSONL file: one JSON object per line.
# - encoding is explicit so the result does not depend on the platform default
# - blank lines (e.g. a trailing newline at end of file) are skipped instead of
#   crashing json.loads
with open(combined_file, "r", encoding="utf-8") as f:
    all_data = [json.loads(line) for line in f if line.strip()]

# Fail early with a clear message rather than an IndexError on dataset[0].
if not all_data:
    raise ValueError(f"No records found in {combined_file}")

dataset = Dataset.from_list(all_data)

# Show one raw record as a sanity check.
print(dataset[0])

def format_phi3_prompt(example):
    """Render one prompt/completion record in the Phi-3 chat format.

    Each turn is closed with the ``<|end|>`` token, as in the model's chat
    template. Without that terminator after the completion the fine-tuned
    model is never shown where an answer stops.

    Args:
        example: Mapping with string values under ``"prompt"`` and
            ``"completion"``.

    Returns:
        A dict with a single ``"text"`` key holding the formatted sample.
    """
    return {
        "text": (
            f"<|user|>\n{example['prompt']}<|end|>\n"
            f"<|assistant|>\n{example['completion']}<|end|>\n"
        )
    }

# Add the formatted "text" column to every record.
dataset = dataset.map(format_phi3_prompt)

def tokenize(example):
    """Tokenize the "text" field of a batch, truncating to 512 tokens.

    No padding is applied here. DataCollatorForLanguageModeling pads each
    batch to the length of its longest sample, so padding every sample to
    512 tokens up front only inflates the stored dataset and makes every
    training batch full width.

    Args:
        example: Batch dict (called with ``batched=True``) containing a
            ``"text"`` list of strings.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=512
    )

# Keep only the tokenizer outputs; the raw string columns are dropped.
tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=["prompt", "completion", "text"])
print(tokenized_dataset[0])

{'prompt': 'Show moisturizer sales data of each month using a scatter plot.', 'completion': 'import pandas as pd\nimport matplotlib.pyplot as plt\n\nmonthList = df[\'month_number\'].tolist()\nsalesData = df[\'moisturizer\'].tolist()\nplt.scatter(monthList, salesData, label=\'moisturizer Sales data\')\nplt.xlabel(\'Month Number\')\nplt.ylabel(\'Number of units Sold\')\nplt.legend(loc=\'upper left\')\nplt.title(\'moisturizer Sales data\')\nplt.xticks(monthList)\nplt.grid(True, linewidth=1, linestyle="--")\nplt.show()'}


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

{'input_ids': [32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000

In [None]:
# Collator for causal language modeling: mlm=False turns off masked-LM masking.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Initially train for 1 epoch and save

After the first epoch, a checkpoint is saved to `output_dir` so that you can resume training for the 2nd epoch later on.

In [None]:

from transformers import TrainingArguments

# Settings for the first run: a single epoch, one checkpoint saved at epoch end.
training_args = TrainingArguments(
    # where checkpoints are written
    output_dir="/content/drive/MyDrive/Fine tune Data Analyzer/phi3-checkpoints",
    save_strategy="epoch",
    save_total_limit=1,
    # schedule and optimisation
    num_train_epochs=1,
    learning_rate=2e-4,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # precision
    fp16=True,
    bf16=False,
    # logging
    logging_steps=10,
    report_to="none",
)


In [None]:
# Assemble the trainer from the PEFT-wrapped model, the tokenized dataset
# and the causal-LM collator, then run the first epoch.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

trainer.train()

NameError: name 'Trainer' is not defined

# Resuming training for additional epochs

In [None]:

from transformers import TrainingArguments
from transformers import Trainer

# Same settings as the first run, except that the total epoch count is raised
# to 2 so the resumed run has one more epoch to train.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Fine tune Data Analyzer/phi3-checkpoints",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    learning_rate=2e-4,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=1,
    fp16=True,
    bf16=False,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

# resume_from_checkpoint=True makes Trainer load the most recent checkpoint
# found in output_dir. This replaces a hard-coded ".../checkpoint-1250" path,
# whose step number depends on dataset size, batch size and gradient
# accumulation and silently goes stale when any of them changes.
trainer.train(resume_from_checkpoint=True)





No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Step,Training Loss
1260,0.1251
1270,0.1114
1280,0.1179
1290,0.1176
1300,0.1131
1310,0.1188
1320,0.1118
1330,0.1136
1340,0.1082
1350,0.132


You can resume training for additional epochs in the same way.

In [None]:
# Fold the LoRA adapter weights into the base model, then write the merged
# model (safetensors format) and the tokenizer to the same directory.
# NOTE(review): the base model was loaded with a 4-bit quantization config, so
# the merge presumably happens on quantized weights. Confirm this is intended
# for the deployment target, or merge into a half-precision copy instead.
merged_dir = "/content/drive/MyDrive/Fine tune Data Analyzer/phi3-matplotlib-cpu2"

model = model.merge_and_unload()
model.save_pretrained(merged_dir, safe_serialization=True)
tokenizer.save_pretrained(merged_dir)
