Bolmo-7B / olmo_core / config.json
{
  "model": {
    "d_model": 4096,
    "vocab_size": 640,
    "n_layers": 32,
    "block": {
      "attention": {
        "name": "default",
        "n_heads": 32,
        "bias": false,
        "rope": {
          "name": "default",
          "theta": 500000,
          "full_precision": true,
          "_CLASS_": "olmo_core.nn.rope.RoPEConfig"
        },
        "qk_norm": {
          "name": "rms",
          "eps": 1e-06,
          "bias": false,
          "dtype": "float32",
          "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
        },
        "use_flash": true,
        "backend": "flash_2",
        "dtype": "float32",
        "sliding_window": {
          "pattern": [
            4096,
            4096,
            4096,
            -1
          ],
          "force_full_attention_on_first_layer": false,
          "force_full_attention_on_last_layer": true,
          "_CLASS_": "olmo_core.nn.attention.SlidingWindowAttentionConfig"
        },
        "_CLASS_": "olmo_core.nn.attention.AttentionConfig"
      },
      "layer_norm": {
        "name": "rms",
        "eps": 1e-06,
        "bias": false,
        "dtype": "float32",
        "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
      },
      "feed_forward": {
        "hidden_size": 11008,
        "name": "default",
        "bias": false,
        "dtype": "float32",
        "act_name": "silu",
        "_CLASS_": "olmo_core.nn.feed_forward.FeedForwardConfig"
      },
      "name": "reordered_norm",
      "_CLASS_": "olmo_core.nn.transformer.config.TransformerBlockConfig"
    },
    "lm_head": {
      "name": "default",
      "layer_norm": {
        "name": "rms",
        "eps": 1e-06,
        "bias": false,
        "dtype": "float32",
        "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
      },
      "bias": false,
      "dtype": "float32",
      "loss_implementation": "default",
      "_CLASS_": "olmo_core.nn.lm_head.LMHeadConfig"
    },
    "name": "bolmo_distill",
    "dtype": "float32",
    "init_method": "normal",
    "init_seed": 0,
    "init_std": 0.02,
    "freeze_params": [
      "boundary_predictor.*",
      "teacher_embeddings.*"
    ],
    "local_encoder": {
      "sliding_window_size": 0,
      "d_model": 4096,
      "n_layers": 1,
      "block_config": {
        "attention": {
          "name": "default",
          "n_heads": 16,
          "dtype": "float32",
          "_CLASS_": "olmo_core.nn.attention.AttentionConfig"
        },
        "layer_norm": {
          "name": "rms",
          "eps": 1e-06,
          "bias": false,
          "dtype": "float32",
          "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
        },
        "feed_forward": {
          "hidden_size": 5504,
          "name": "default",
          "bias": false,
          "dtype": "float32",
          "act_name": "silu",
          "_CLASS_": "olmo_core.nn.feed_forward.FeedForwardConfig"
        },
        "xlstm": {
          "num_heads": 16,
          "dtype": "float32",
          "_CLASS_": "olmo_core.nn.xlstm.XLSTMConfig"
        },
        "name": "xlstm",
        "_CLASS_": "olmo_core.nn.transformer.config.TransformerBlockConfig"
      },
      "cross_attn_n_heads": 0,
      "cross_attn_do_project": true,
      "cross_attn_init_pooling": "amax",
      "pooling": "hnet",
      "add_hash_embeddings": false,
      "add_expanded_embeddings": true,
      "hash_byte_group_size": [
        3,
        4,
        5,
        6,
        7,
        8
      ],
      "hash_byte_group_vocab": [
        1536,
        3072,
        6144,
        12288,
        24576,
        49152
      ],
      "hash_byte_group_nb_functions": 1,
      "add_norm_after_last_block": true,
      "add_norm_after_pool": false,
      "add_out_projection": true,
      "boundary_predictor": "hnet",
      "boundary_predictor_lookahead": 1,
      "represent_bytes_with_embeddings": false,
      "represent_bytes_with_last_mixed_out": false,
      "blt_compat": false,
      "dtype": "float32",
      "_CLASS_": "olmo_core.nn.bolmo.config.LocalEncoderConfig"
    },
    "local_decoder": {
      "sliding_window_size": 0,
      "d_model": 4096,
      "n_layers": 4,
      "cross_attn_n_heads": 0,
      "block_config": {
        "attention": {
          "name": "default",
          "n_heads": 16,
          "dtype": "float32",
          "_CLASS_": "olmo_core.nn.attention.AttentionConfig"
        },
        "layer_norm": {
          "name": "rms",
          "eps": 1e-06,
          "bias": false,
          "dtype": "float32",
          "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
        },
        "feed_forward": {
          "hidden_size": 5504,
          "name": "default",
          "bias": false,
          "dtype": "float32",
          "act_name": "silu",
          "_CLASS_": "olmo_core.nn.feed_forward.FeedForwardConfig"
        },
        "xlstm": {
          "num_heads": 16,
          "dtype": "float32",
          "_CLASS_": "olmo_core.nn.xlstm.XLSTMConfig"
        },
        "name": "xlstm",
        "_CLASS_": "olmo_core.nn.transformer.config.TransformerBlockConfig"
      },
      "depooling": "hnet",
      "add_norm_before_first_block": true,
      "add_norm_onto_residual": false,
      "add_in_projection": true,
      "add_projected_patch_residuals": false,
      "hnet_smooth": false,
      "hnet_smooth_ste": false,
      "hnet_modulate": false,
      "blt_compat": false,
      "fuse_boundaries": true,
      "no_boundaries": false,
      "dtype": "float32",
      "_CLASS_": "olmo_core.nn.bolmo.config.LocalDecoderConfig"
    },
    "share_blocks_between_teacher_and_student": false,
    "_CLASS_": "olmo_core.nn.transformer.config.TransformerConfig"
  },
  "dataset": {
    "tokenizer": {
      "vocab_size": 520,
      "eos_token_id": 1,
      "pad_token_id": 0,
      "bos_token_id": 1,
      "special_tokens": [
        "<pad>",
        "<bos>",
        "<eos>",
        "<bpe_token_end>"
      ],
      "special_tokens_first": true,
      "original_identifier": "allenai/dolma2-tokenizer",
      "bpe_token_end_id": 3,
      "_CLASS_": "olmo_core.data.tokenizer.ByteTokenizerConfig"
    },
    "paths": [],
    "expand_glob": false,
    "include_instance_metadata": true,
    "work_dir": "",
    "ignore_fingerprint_mismatch": false,
    "sequence_length": 4096,
    "generate_doc_lengths": false,
    "byte_sequence_length": 24576,
    "_CLASS_": "olmo_core.data.numpy_dataset.NumpyByteFSLDatasetConfig"
  },
  "data_loader": {
    "global_batch_size": 1572864,
    "seed": 1234,
    "num_workers": 24,
    "ignore_fingerprint_mismatch": false,
    "_CLASS_": "olmo_core.data.data_loader.NumpyDataLoaderConfig"
  },
  "train_module": {
    "rank_microbatch_size": 49152,
    "max_sequence_length": 24576,
    "optim": {
      "group_overrides": [
        {
          "params": [
            "local_encoder.embedding.weight",
            "local_encoder.expanded_embeddings.weight"
          ],
          "opts": {
            "weight_decay": 0.0
          },
          "_CLASS_": "olmo_core.optim.config.OptimGroupOverride"
        },
        {
          "params": [
            "blocks.*"
          ],
          "opts": {
            "lr": 1.83e-05
          },
          "_CLASS_": "olmo_core.optim.config.OptimGroupOverride"
        }
      ],
      "compile": false,
      "fixed_fields": [
        "initial_lr"
      ],
      "lr": 3.66e-05,
      "betas": [
        0.9,
        0.95
      ],
      "eps": 1e-08,
      "weight_decay": 0.1,
      "_CLASS_": "olmo_core.optim.adamw.AdamWConfig"
    },
    "max_grad_norm": 0.5,
    "scheduler": {
      "lr_field": "lr",
      "initial_lr_field": "initial_lr",
      "units": "steps",
      "alpha_f": 0.0,
      "warmup_fraction": 0.1,
      "warmup_min_lr": 0.0,
      "_CLASS_": "olmo_core.optim.scheduler.LinearWithWarmup"
    },
    "compile_model": true,
    "float8_config": {
      "enabled": false,
      "_CLASS_": "olmo_core.float8.Float8Config"
    },
    "dp_config": {
      "name": "fsdp",
      "param_dtype": "bfloat16",
      "reduce_dtype": "float32",
      "wrapping_strategy": "full",
      "prefetch_factor": 0,
      "_CLASS_": "olmo_core.train.train_module.transformer.config.TransformerDataParallelConfig"
    },
    "bolmo_config": {
      "tokenizer": {
        "vocab_size": 520,
        "eos_token_id": 1,
        "pad_token_id": 0,
        "bos_token_id": 1,
        "special_tokens": [
          "<pad>",
          "<bos>",
          "<eos>",
          "<bpe_token_end>"
        ],
        "special_tokens_first": true,
        "original_identifier": "allenai/dolma2-tokenizer",
        "bpe_token_end_id": 3,
        "_CLASS_": "olmo_core.data.tokenizer.ByteTokenizerConfig"
      },
      "losses": [
        "ce",
        "boundary"
      ],
      "loss_weights": [
        1.0,
        4.0
      ],
      "binarization_temp": 1.0,
      "temperature": 1.0,
      "div_fn": "tvd_temp_limit",
      "boundary_mode": "end",
      "merge_boundary_loss": false,
      "use_output_boundary_jsd": false,
      "eval_add_boundary_logp": false,
      "do_alm_debiasing": false,
      "rep_compare_fn": "l2",
      "start_ratio": 4.3,
      "target_ratio": 4.3,
      "gradual_boundary_compression_steps": 150000,
      "encoder_loss_lookahead": 0,
      "encoder_loss_no_lookahead_weight": 1.0,
      "encoder_loss_lookahead_weights": [],
      "patching": "dolma2",
      "epsilon": 1e-06,
      "skip_blocks": false,
      "skip_teacher_blocks": false,
      "skip_teacher": true,
      "compute_teacher_ce": false,
      "use_student_patch_reps_for_teacher": false,
      "use_oracle_patch_reps": false,
      "teacher_blocks_no_grad": true,
      "student_blocks_no_grad": false,
      "decoder_backprop_through_encoder": true,
      "decoder_backprop_through_boundary_predictor": true,
      "boundary_predictor_backprop_through_encoder": true,
      "teacher_force_boundaries": false,
      "boundary_threshold": "sample:0",
      "xlstm_igate_bias_init": -10.0,
      "skip_boundary_before_eos": true,
      "_CLASS_": "olmo_core.nn.bolmo.config.BolmoConfig"
    },
    "label_ignore_index": -100,
    "_CLASS_": "olmo_core.train.train_module.transformer.config.TransformerTrainModuleConfig"
  },
  "trainer": {},
  "init_seed": 12536,
  "_CLASS_": "__main__.ExperimentConfig"
}
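A minimal sketch for loading and spot-checking this file with the Python standard library only. The local filename "config.json" is an assumption (the file would first need to be downloaded from the repository), and reconstructing the actual olmo_core objects named in the "_CLASS_" entries is left to the olmo_core training code; this sketch only inspects the raw JSON.

import json

# Load the experiment config as plain JSON (assumes the file was saved locally as config.json).
with open("config.json") as f:
    cfg = json.load(f)

model = cfg["model"]
print(model["_CLASS_"])                     # olmo_core.nn.transformer.config.TransformerConfig
print(model["d_model"], model["n_layers"])  # 4096 32
print(model["local_encoder"]["n_layers"],   # 1 local encoder block,
      model["local_decoder"]["n_layers"])   # 4 local decoder blocks

# The byte tokenizer config appears twice (under "dataset" and under
# "train_module.bolmo_config"); both copies are identical in this file.
tok = cfg["dataset"]["tokenizer"]
assert tok == cfg["train_module"]["bolmo_config"]["tokenizer"]
print(tok["vocab_size"], tok["special_tokens"])  # 520 ['<pad>', '<bos>', '<eos>', '<bpe_token_end>']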