# TinyLlama Fine-Tuning Example

This notebook demonstrates how to fine-tune a TinyLlama model on a custom dataset. We'll go through the following steps:

1. Setting up the environment
2. Loading the model and tokenizer
3. Preparing the dataset
4. Fine-tuning the model
5. Evaluating the results
6. Using the fine-tuned model

## 1. Setting up the environment

First, let's install the necessary libraries if they're not already installed.

In [None]:
# Install required libraries
!pip install torch transformers datasets accelerate tqdm

Import required libraries:

In [None]:
import os
import json
import torch
from transformers import (
 AutoModelForCausalLM, 
 AutoTokenizer,
 Trainer, 
 TrainingArguments,
 DataCollatorForLanguageModeling
)
from datasets import Dataset
from tqdm.notebook import tqdm

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

## 2. Loading the model and tokenizer

We'll use the TinyLlama-1.1B-Chat-v1.0 model from Hugging Face.

In [None]:
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Batching needs a padding token; fall back to EOS when the tokenizer has none
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the weights in float32, even on GPU.
# The training arguments in this notebook enable fp16 mixed precision when a
# GPU is present. Mixed precision keeps full-precision master weights and only
# runs the forward/backward pass in half precision; if the parameters
# themselves are float16, the gradient scaler refuses to unscale their
# gradients and training aborts ("Attempting to unscale FP16 gradients").
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
)
model = model.to(device)

print(f"Model and tokenizer loaded: {model_name}")

## 3. Preparing the dataset

Let's load our example training data and format it properly for fine-tuning.

In [None]:
# Read the raw instruction/response records from disk
with open('example_training_data.json', 'r', encoding='utf-8') as source_file:
    data = json.load(source_file)

# Wrap each record in a single-turn chat transcript: user turn, then assistant turn.
# NOTE(review): these are ChatML-style markers; the checkpoint's tokenizer may
# ship a different chat template (see tokenizer.apply_chat_template) - confirm
# this is the intended format.
formatted_data = [
    {"text": f"<|im_start|>user\n{record['instruction']}<|im_end|>\n<|im_start|>assistant\n{record['response']}<|im_end|>"}
    for record in data
]

# Build the Hugging Face dataset from the formatted records
dataset = Dataset.from_list(formatted_data)
print(f"Dataset created with {len(dataset)} examples")

# Print the first formatted entry as a sanity check
print("\nExample entry:")
print(dataset[0]['text'])

In [None]:
MAX_SEQ_LENGTH = 512  # longer examples are truncated to this many tokens
SPLIT_SEED = 42       # fixed seed so the train/eval split is reproducible


def tokenize_function(examples):
    """Tokenize a batch of formatted chat transcripts.

    Args:
        examples: Batch dict from ``Dataset.map(batched=True)`` with a
            ``"text"`` column.

    Returns:
        The tokenizer output for the batch, truncated to ``MAX_SEQ_LENGTH``
        tokens and left unpadded.
    """
    # No padding here: the language-modeling data collator pads each batch to
    # its own longest sequence, which is much cheaper than padding every
    # example to MAX_SEQ_LENGTH up front.
    return tokenizer(examples["text"], truncation=True, max_length=MAX_SEQ_LENGTH)


# Tokenize and drop the raw text column.
# No "labels" column is added: DataCollatorForLanguageModeling(mlm=False)
# builds the labels from input_ids for every batch (with padding positions
# set to -100), so a precomputed copy would just be overwritten.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Split into training and evaluation sets
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
print(f"Training examples: {len(tokenized_dataset['train'])}")
print(f"Evaluation examples: {len(tokenized_dataset['test'])}")

## 4. Fine-tuning the model

Now we'll set up the training configuration and fine-tune the model.

In [None]:
# Set up training arguments
output_dir = "./fine_tuned_tinyllama"

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and deprecated the old spelling. The install cell does not pin a version, so
# use whichever keyword the installed TrainingArguments dataclass declares.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,  # Adjust based on your dataset size
    per_device_train_batch_size=2,  # Adjust based on your GPU memory
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch size
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir=f"{output_dir}/logs",
    logging_steps=10,
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,  # Keep only the 2 most recent checkpoints on disk
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU is available
    # NOTE(review): if the whole run has fewer than 100 optimizer steps, the
    # learning rate never finishes warming up - lower this for small datasets.
    warmup_steps=100,
    report_to="none",  # Disable reporting to wandb, etc.
    **{eval_strategy_arg: "steps"},  # Evaluate every `eval_steps` steps
)

In [None]:
# Collator for causal language modeling (mlm=False disables masked-LM masking)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Gather the Trainer inputs in one place so the wiring is easy to scan
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_dataset["train"],
    "eval_dataset": tokenized_dataset["test"],
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_kwargs)

# Kick off fine-tuning
print("Starting fine-tuning...")
trainer.train()

## 5. Evaluating the results

We first save the fine-tuned model to disk, then try it on a few test prompts.

In [None]:
# Write the trained model first, then the tokenizer, into the same directory
# so that both can later be loaded from output_dir.
for persist in (trainer.save_model, tokenizer.save_pretrained):
    persist(output_dir)
print(f"Model saved to {output_dir}")

In [None]:
# Test the model with a few prompts
test_prompts = [
    "Explain the concept of neural networks.",
    "Write a short story about a robot that learns to feel emotions.",
    "What are three sustainable energy sources and how do they work?"
]

# Format prompts the same way as the training examples
formatted_prompts = [f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n" for prompt in test_prompts]

# Switch to inference mode (disables dropout and other train-only behaviour)
model.eval()

# Generate responses
for prompt_number, (raw_prompt, formatted_prompt) in enumerate(zip(test_prompts, formatted_prompts), start=1):
    print(f"\n\nPrompt {prompt_number}: {raw_prompt}")
    print("\nGenerating response...")

    # Put the inputs on the device the model actually lives on
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,  # pass attention_mask along with input_ids
            max_new_tokens=256,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Keep only the newly generated tokens (everything after the prompt)
    prompt_length = inputs["input_ids"].shape[1]
    response_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Nothing in this notebook registers "<|im_end|>" as a special token, so it
    # is not removed by skip_special_tokens; cut the reply at the first marker.
    response_text = response_text.split("<|im_end|>")[0].strip()

    print(f"Response: {response_text}")

## 6. Using the fine-tuned model

Here's how you can load and use your fine-tuned model in the future.

In [None]:
# Load the fine-tuned model and tokenizer
def load_fine_tuned_model(model_path):
    """Load a saved tokenizer and causal-LM checkpoint from ``model_path``.

    The weights are loaded as float16 when CUDA is available and as float32
    otherwise, then moved onto the notebook-level ``device``.

    Args:
        model_path: Location passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    weights_dtype = torch.float32
    if torch.cuda.is_available():
        weights_dtype = torch.float16

    loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
    loaded_model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=weights_dtype)
    return loaded_model.to(device), loaded_tokenizer

# Example of loading the model (uncomment to run)
# fine_tuned_model, fine_tuned_tokenizer = load_fine_tuned_model(output_dir)

# Function to generate a response
def generate_response(model, tokenizer, prompt, max_length=256, temperature=0.7):
    """Generate the assistant's reply to a single user prompt.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer that matches ``model``.
        prompt: Raw user message, without any chat markers.
        max_length: Maximum number of *new* tokens to generate (forwarded to
            ``generate`` as ``max_new_tokens``).
        temperature: Sampling temperature. ``None`` or a value <= 0 switches
            to greedy decoding instead of sampling.

    Returns:
        The generated reply as a string, cut at the first ``<|im_end|>``
        marker and stripped of surrounding whitespace.
    """
    end_marker = "<|im_end|>"

    # Format the prompt the same way as the training examples
    formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"

    # Tokenize onto the model's own device so the function does not depend on
    # a notebook-level `device` variable.
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

    # Sampling requires a strictly positive temperature; otherwise decode greedily
    if temperature is not None and temperature > 0:
        sampling_kwargs = {"do_sample": True, "temperature": temperature}
    else:
        sampling_kwargs = {"do_sample": False}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,  # pass attention_mask along with input_ids
            max_new_tokens=max_length,
            pad_token_id=tokenizer.eos_token_id,
            **sampling_kwargs,
        )

    # Decode only the tokens produced after the prompt, so no string surgery
    # is needed to separate the reply from the echoed prompt.
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Drop anything the model produced after it closed the assistant turn
    return generated_text.split(end_marker)[0].strip()

# Example usage (uncomment to run)
# response = generate_response(fine_tuned_model, fine_tuned_tokenizer, "Explain quantum computing.")
# print(response)

## Conclusion

You've successfully fine-tuned a TinyLlama model on a custom dataset! You can now use this model for various applications:

1. Integrate it into a chatbot or virtual assistant
2. Use it for content generation
3. Deploy it as part of a web application
4. Fine-tune it further on more specific data

You can also experiment with different hyperparameters and training strategies to improve results.