Instruction Fine-Tuning with PEFT

Project Goal:

Fine-tune a base Llama model (TinyLlama 1.1B) to follow human-written instructions. The goal is to improve the model's ability to generate contextually appropriate responses to a given prompt.

Dataset:

The Dolly 15K dataset (https://huggingface.co/datasets/databricks/databricks-dolly-15k) comprises 15,000 high-quality, human-generated instruction-response pairs. Fine-tuning on this dataset enables the model to better understand and execute a variety of instructions, ranging from question answering to summarization. See the instruction data examples below.

Sample of the Dolly instruction dataset

Relevant Literature & Resources

  1. LoRA: Low-Rank Adaptation of Large Language Models - "We propose Low-Rank Adaptation, or LoRA, which freezes the pre-trained model weights and injects trainable rank decomposition matrices into each layer of the Transformer architecture, greatly reducing the number of trainable parameters for downstream tasks" - https://arxiv.org/abs/2106.09685
  2. Finetuning LLMs using LoRA - "LoRA is an improved finetuning method where instead of finetuning all the weights that constitute the weight matrix (W) of the pre-trained large language model, two smaller matrices (A and B) that approximate the update to the matrix are fine-tuned." - https://anirbansen2709.medium.com/finetuning-llms-using-lora-77fb02cbbc48

Modeling Approach:

Example Results:

Highlighted Code Sections:


import transformers
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import DatasetDict
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import DataLoader
from functools import partial
from unsloth import FastLanguageModel

# Dataset Preparation
# Load the Dolly 15K instruction dataset ("databricks/databricks-dolly-15k")
# through `datasets.load_dataset`; the code below reads its "train" split.
dataset = load_dataset("databricks/databricks-dolly-15k")
def format_dolly(example):
    """Convert one Dolly record into a prompt/output pair.

    Args:
        example: Mapping with the keys ``instruction``, ``context`` and
            ``response``.

    Returns:
        A dict with ``prompt`` (the instruction, followed by a context
        section when ``context`` is truthy) and ``output`` (the response).
    """
    context = example['context']
    instruction_section = f"### Instruction:\n{example['instruction']}"
    # The context section is only emitted when the record actually has one.
    prompt = (
        f"{instruction_section}\n\n### Context:\n{context}"
        if context
        else instruction_section
    )
    return {"prompt": prompt, "output": example["response"]}

# Reformat every record with `format_dolly` (dropping the original columns),
# then shuffle with a fixed seed.
formatted_dataset = (
    dataset["train"]
    .map(format_dolly, remove_columns=dataset["train"].column_names)
    .shuffle(seed=42)
)

# 80/10/10 split: hold out 20% of the data, then cut the held-out part in half
# to obtain the validation and test sets.
train_val_test = formatted_dataset.train_test_split(test_size=0.2, seed=42)
val_test = train_val_test["test"].train_test_split(test_size=0.5, seed=42)

split_dataset = DatasetDict(
    {
        "train": train_val_test["train"],
        "validation": val_test["train"],
        "test": val_test["test"],
    }
)

 


# Load TinyLlama and its tokenizer through Unsloth with 4-bit loading enabled
# and a 2048-token maximum sequence length.
# NOTE(review): `dtype=None` presumably lets Unsloth choose the compute dtype
# for the current GPU — confirm against the Unsloth documentation.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

# Wrap the loaded model with a LoRA adapter (rank 16, alpha 32, dropout 0.05,
# no bias terms) so that fine-tuning goes through the adapter.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

def to_text(example):
    """Join a record's prompt and output into a single training string.

    Args:
        example: Mapping with the keys ``prompt`` and ``output``.

    Returns:
        A dict whose ``text`` entry is the prompt, a newline, then the output.
    """
    prompt, output = example['prompt'], example['output']
    return dict(text="{}\n{}".format(prompt, output))

def tokenize(example):
    """Tokenize one example's ``text`` and attach causal-LM labels.

    Uses the module-level ``tokenizer``. The text is truncated and padded to
    exactly 2048 tokens.

    Args:
        example: Mapping with a ``text`` entry holding a single string.

    Returns:
        The tokenizer output with an added ``labels`` list: a copy of
        ``input_ids`` in which every padding position (attention mask 0) is
        replaced by -100.
    """
    tokenized = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=2048,
    )
    # -100 is the index ignored by the language-modeling loss. Without this
    # masking, the padding that fills most of each 2048-token row would be
    # treated as a prediction target.
    tokenized["labels"] = [
        token_id if mask else -100
        for token_id, mask in zip(
            tokenized["input_ids"], tokenized["attention_mask"]
        )
    ]
    return tokenized

# NOTE(review): `small_dataset` is not defined in the visible code — presumably
# a subsample of `split_dataset` created elsewhere; confirm before running.

# Step 1: build the single-string "text" column for each split.
train_data = small_dataset["train"].map(to_text)
val_data = small_dataset["validation"].map(to_text)

# Step 2: tokenize, keeping only the columns produced by `tokenize`.
tokenized_train = train_data.map(
    tokenize,
    remove_columns=train_data.column_names,
)
tokenized_val = val_data.map(
    tokenize,
    remove_columns=val_data.column_names,
)
    
 

import matplotlib.pyplot as plt
from transformers import TrainerCallback
from transformers import TrainingArguments


# Training configuration: 2 epochs, per-device batch size 4 without gradient
# accumulation, loss logged every 10 steps, a checkpoint every 1000 steps with
# only the latest one kept, and no evaluation during training.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/genai/tinyllama-dolly-finetuned",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    logging_steps=10,
    save_steps=1000,
    eval_strategy="no",
    save_total_limit=1,
    # Choose mixed precision from the hardware instead of forcing fp16: the
    # model is loaded with `dtype=None`, so on bf16-capable GPUs it may be in
    # bfloat16, which does not match fp16 training.
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    push_to_hub=False,
    report_to="none",
)

# Build the supervised fine-tuning trainer on the pre-tokenized splits.
# `dataset_text_field` is None and packing is disabled.
# NOTE(review): `LossPlotCallback` is not defined in the visible code —
# presumably a `TrainerCallback` subclass declared elsewhere; confirm.
trainer3 = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    dataset_text_field=None,
    max_seq_length=2048,
    packing=False,
    callbacks=[LossPlotCallback()],
)

# Run fine-tuning.
trainer3.train()

def generate_responses(model, tokenizer, prompts, max_new_tokens=100):
    """Greedily generate one completion per prompt.

    Args:
        model: Causal language model providing ``eval()``, ``device`` and
            ``generate()``.
        tokenizer: Tokenizer matching ``model``. It is called with
            ``return_tensors="pt"`` and used to decode the generated ids.
        prompts: Iterable of prompt strings.
        max_new_tokens: Maximum number of tokens generated per prompt.

    Returns:
        A list with one whitespace-stripped completion per prompt, in input
        order. The prompt itself is not part of the returned text.
    """
    model.eval()
    responses = []
    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_length = inputs["input_ids"].shape[-1]
        with torch.no_grad():
            # Greedy decoding: sampling parameters such as `temperature` have
            # no effect when `do_sample=False`, so none are passed.
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
            )
        # The output sequence starts with the prompt ids, so decode only what
        # follows them. Cutting the decoded string at `len(prompt)` is
        # unreliable because decoding does not always reproduce the prompt
        # character for character.
        completion_ids = outputs[0][prompt_length:]
        responses.append(
            tokenizer.decode(completion_ids, skip_special_tokens=True).strip()
        )
    return responses