| | |
| |
|
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| |
|
| | import argparse |
| | import bitsandbytes as bnb |
| | from datasets import load_dataset |
| | from functools import partial |
| | import os |
| | from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM |
| | import torch |
| | from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, \ |
| | DataCollatorForLanguageModeling, Trainer, TrainingArguments |
| | from datasets import load_dataset |
| |
|
def load_model(model_name, bnb_config, max_memory_mb=40960):
    """Load a quantized causal LM and its tokenizer, sharded across all GPUs.

    :param bnb_config: BitsAndBytesConfig controlling quantized loading
    :param max_memory_mb: per-GPU memory budget in MB (previously hard-coded
        to 40960; default preserves the old behavior)
    :return: (model, tokenizer) tuple
    """
    n_gpus = torch.cuda.device_count()
    max_memory = f'{max_memory_mb}MB'

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",  # let accelerate place layers across devices
        max_memory={i: max_memory for i in range(n_gpus)},
    )
    # NOTE(review): use_auth_token is deprecated in newer transformers
    # releases in favor of token= — confirm the installed version.
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)

    # Llama has no pad token; reuse EOS so padding/collation works.
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer
| |
|
| | |
from datasets import load_dataset  # NOTE(review): duplicate of the top-of-file import

# Pull the Dolly 15k instruction-tuning dataset (fields: instruction,
# context, response, category).
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

# Quick sanity check of size and schema before preprocessing.
print(f'Number of prompts: {len(dataset)}')
print(f'Column names are: {dataset.column_names}')
|
| | |
def create_prompt_formats(sample):
    """Assemble the training prompt for one sample.

    Concatenates intro blurb, instruction, optional input context, response
    and end marker with blank lines, storing the result in sample["text"].

    :param sample: dict with 'instruction', 'context', 'response' keys
    :return: the same sample, with a new 'text' field
    """
    segments = [
        "Below is an instruction that describes a task. Write a response that appropriately completes the request.",
        f"### Instruction:\n{sample['instruction']}",
    ]
    # Context is optional — skip the Input section when it is empty.
    if sample["context"]:
        segments.append(f"Input:\n{sample['context']}")
    segments.append(f"### Response:\n{sample['response']}")
    segments.append("### End")

    sample["text"] = "\n\n".join(segments)
    return sample
| |
|
| |
|
| | |
def get_max_length(model):
    """Return the model's maximum sequence length from its config.

    Checks the config attributes used by different architectures in order
    (GPT-2 style, Llama style, GLM style); falls back to 1024 when none is
    set to a truthy value.

    :param model: any object with a .config attribute
    :return: int maximum sequence length
    """
    for length_setting in ("n_positions", "max_position_embeddings", "seq_length"):
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max length: {max_length}")  # fixed typo ("lenth")
            return max_length
    max_length = 1024
    print(f"Using default max length: {max_length}")
    return max_length
| |
|
| |
|
def preprocess_batch(batch, tokenizer, max_length):
    """Tokenize the 'text' field of a batch, truncating to max_length."""
    texts = batch["text"]
    encoded = tokenizer(texts, max_length=max_length, truncation=True)
    return encoded
| |
|
| |
|
| | |
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset: str):
    """Format & tokenize it so it is ready for training
    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer

    Pipeline: add a 'text' prompt per sample, batch-tokenize it (dropping the
    raw columns), discard over-length samples, then shuffle deterministically.
    """
    print("Preprocessing dataset...")

    # Attach the formatted prompt as a 'text' column.
    dataset = dataset.map(create_prompt_formats)

    # Tokenize in batches; the raw text columns are no longer needed.
    tokenize_fn = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        tokenize_fn,
        batched=True,
        remove_columns=["instruction", "context", "response", "text", "category"],
    )

    # Keep only samples that fit within the model's context window.
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    # Deterministic shuffle for reproducibility.
    return dataset.shuffle(seed=seed)
| |
|
| | |
def create_bnb_config():
    """Build the 4-bit NF4 quantization config (double quant, bf16 compute)."""
    return BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
| |
|
def create_peft_config(modules):
    """
    Create Parameter-Efficient Fine-Tuning config for your model
    :param modules: Names of the modules to apply Lora to
    """
    return LoraConfig(
        r=16,                 # LoRA rank
        lora_alpha=64,        # scaling factor
        target_modules=modules,
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
    )
| |
|
| | |
| |
|
def find_all_linear_names(model):
    """Return the names of all 4-bit linear modules (candidate LoRA targets),
    excluding the output head."""
    target_cls = bnb.nn.Linear4bit
    module_names = set()
    for full_name, module in model.named_modules():
        if isinstance(module, target_cls):
            # Keep only the leaf component of the dotted module path.
            module_names.add(full_name.rsplit('.', 1)[-1])

    # Never apply LoRA to the LM head.
    module_names.discard('lm_head')
    return list(module_names)
| |
|
def print_trainable_parameters(model, use_4bit=False):
    """
    Prints the number of trainable parameters in the model.

    :param model: torch model (possibly DeepSpeed-partitioned or quantized)
    :param use_4bit: halve the trainable count, since bitsandbytes reports
        two 4-bit values packed per parameter element
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # DeepSpeed ZeRO-3 partitions weights, so numel() can be 0 locally;
        # ds_numel carries the true element count.
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    if use_4bit:
        # Integer division: true division ('/= 2') produced a float, which
        # crashed the '{:,d}' format below with a ValueError.
        trainable_params //= 2
    # Guard against a model with zero parameters.
    pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"all params: {all_param:,d} || trainable params: {trainable_params:,d} || trainable%: {pct}"
    )
| |
|
| | |
| |
|
# Base model to fine-tune; gated HF repo (requires accepted license + token).
model_name = "meta-llama/Llama-2-7b-hf"

# 4-bit NF4 quantization settings for QLoRA loading.
bnb_config = create_bnb_config()

# Downloads/loads the quantized model and tokenizer (heavy: network + GPU).
model, tokenizer = load_model(model_name, bnb_config)

print(model)

# Context window used for tokenization truncation and length filtering.
max_length = get_max_length(model)

print(max_length)

# Fixed seed so the dataset shuffle is reproducible.
seed = 98345

# `dataset` was loaded above from databricks-dolly-15k; this formats,
# tokenizes, filters and shuffles it in place of the raw version.
dataset = preprocess_dataset(tokenizer, max_length, seed, dataset)
| |
|
| |
|
def train(model, tokenizer, dataset, output_dir):
    """Run the QLoRA fine-tuning loop and save the adapter checkpoint.

    :param model: quantized base model from load_model
    :param tokenizer: matching tokenizer (pad token already set to EOS)
    :param dataset: preprocessed/tokenized dataset from preprocess_dataset
    :param output_dir: directory where the final adapter weights are saved
    """
    # Trade compute for memory: recompute activations in the backward pass.
    # Must happen before prepare_model_for_kbit_training.
    model.gradient_checkpointing_enable()

    # Prepare the quantized model for training (casts norms, enables grads
    # on inputs, etc.).
    model = prepare_model_for_kbit_training(model)

    # Target every 4-bit linear layer with LoRA.
    modules = find_all_linear_names(model)

    peft_config = create_peft_config(modules)
    model = get_peft_model(model, peft_config)

    print_trainable_parameters(model)

    # NOTE(review): intermediate checkpoints go to the hard-coded "outputs"
    # dir, not the `output_dir` parameter — confirm whether this is intended.
    trainer = Trainer(
        model=model,
        train_dataset=dataset,
        args=TrainingArguments(
            per_device_train_batch_size=1,
            gradient_accumulation_steps=4,  # effective batch size 4 per device
            warmup_steps=2,
            max_steps=20,                   # short demo run, not a full epoch
            learning_rate=2e-4,
            fp16=True,
            logging_steps=1,
            output_dir="outputs",
            optim="paged_adamw_8bit",       # paged optimizer for memory spikes
        ),
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
    )

    # KV-cache is incompatible with gradient checkpointing during training;
    # re-enable for inference later.
    model.config.use_cache = False

    # Report the parameter dtype distribution (sanity check for quantization).
    dtypes = {}
    for _, p in model.named_parameters():
        dtype = p.dtype
        if dtype not in dtypes: dtypes[dtype] = 0
        dtypes[dtype] += p.numel()
    total = 0
    for k, v in dtypes.items(): total+= v
    for k, v in dtypes.items():
        print(k, v, v/total)

    do_train = True

    print("Training...")

    if do_train:
        train_result = trainer.train()
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()
        print(metrics)

    # Persist only the LoRA adapter weights (PEFT save), not the full model.
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok=True)
    trainer.model.save_pretrained(output_dir)

    # Free GPU memory before the merge step that follows this call.
    del model
    del trainer
    torch.cuda.empty_cache()
| |
|
# Where train() writes the final LoRA adapter checkpoint.
output_dir = "results/llama2/final_checkpoint"

print("Run train ...")
train(model, tokenizer, dataset, output_dir)

# Reload base model + adapter in bf16 (unquantized) and merge the LoRA
# weights into the base weights for standalone deployment.
model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16)
model = model.merge_and_unload()

# Save the merged model in safetensors format.
output_merged_dir = "results/llama2/final_merged_checkpoint"
os.makedirs(output_merged_dir, exist_ok=True)
model.save_pretrained(output_merged_dir, safe_serialization=True)

# Ship the tokenizer alongside the merged weights.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(output_merged_dir)
| |
|
| |
|