{ "model": { "d_model": 4096, "vocab_size": 640, "n_layers": 32, "block": { "attention": { "name": "default", "n_heads": 32, "bias": false, "rope": { "name": "default", "theta": 500000, "full_precision": true, "_CLASS_": "olmo_core.nn.rope.RoPEConfig" }, "qk_norm": { "name": "rms", "eps": 1e-06, "bias": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig" }, "use_flash": true, "backend": "flash_2", "dtype": "float32", "sliding_window": { "pattern": [ 4096, 4096, 4096, -1 ], "force_full_attention_on_first_layer": false, "force_full_attention_on_last_layer": true, "_CLASS_": "olmo_core.nn.attention.SlidingWindowAttentionConfig" }, "_CLASS_": "olmo_core.nn.attention.AttentionConfig" }, "layer_norm": { "name": "rms", "eps": 1e-06, "bias": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig" }, "feed_forward": { "hidden_size": 11008, "name": "default", "bias": false, "dtype": "float32", "act_name": "silu", "_CLASS_": "olmo_core.nn.feed_forward.FeedForwardConfig" }, "name": "reordered_norm", "_CLASS_": "olmo_core.nn.transformer.config.TransformerBlockConfig" }, "lm_head": { "name": "default", "layer_norm": { "name": "rms", "eps": 1e-06, "bias": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig" }, "bias": false, "dtype": "float32", "loss_implementation": "default", "_CLASS_": "olmo_core.nn.lm_head.LMHeadConfig" }, "name": "bolmo_distill", "dtype": "float32", "init_method": "normal", "init_seed": 0, "init_std": 0.02, "freeze_params": [ "boundary_predictor.*", "teacher_embeddings.*" ], "local_encoder": { "sliding_window_size": 0, "d_model": 4096, "n_layers": 1, "block_config": { "attention": { "name": "default", "n_heads": 16, "dtype": "float32", "_CLASS_": "olmo_core.nn.attention.AttentionConfig" }, "layer_norm": { "name": "rms", "eps": 1e-06, "bias": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig" }, "feed_forward": { "hidden_size": 5504, "name": "default", "bias": false, "dtype": "float32", "act_name": "silu", "_CLASS_": "olmo_core.nn.feed_forward.FeedForwardConfig" }, "xlstm": { "num_heads": 16, "dtype": "float32", "_CLASS_": "olmo_core.nn.xlstm.XLSTMConfig" }, "name": "xlstm", "_CLASS_": "olmo_core.nn.transformer.config.TransformerBlockConfig" }, "cross_attn_n_heads": 0, "cross_attn_do_project": true, "cross_attn_init_pooling": "amax", "pooling": "hnet", "add_hash_embeddings": false, "add_expanded_embeddings": true, "hash_byte_group_size": [ 3, 4, 5, 6, 7, 8 ], "hash_byte_group_vocab": [ 1536, 3072, 6144, 12288, 24576, 49152 ], "hash_byte_group_nb_functions": 1, "add_norm_after_last_block": true, "add_norm_after_pool": false, "add_out_projection": true, "boundary_predictor": "hnet", "boundary_predictor_lookahead": 1, "represent_bytes_with_embeddings": false, "represent_bytes_with_last_mixed_out": false, "blt_compat": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.bolmo.config.LocalEncoderConfig" }, "local_decoder": { "sliding_window_size": 0, "d_model": 4096, "n_layers": 4, "cross_attn_n_heads": 0, "block_config": { "attention": { "name": "default", "n_heads": 16, "dtype": "float32", "_CLASS_": "olmo_core.nn.attention.AttentionConfig" }, "layer_norm": { "name": "rms", "eps": 1e-06, "bias": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig" }, "feed_forward": { "hidden_size": 5504, "name": "default", "bias": false, "dtype": "float32", "act_name": "silu", "_CLASS_": "olmo_core.nn.feed_forward.FeedForwardConfig" }, "xlstm": { "num_heads": 16, "dtype": "float32", "_CLASS_": "olmo_core.nn.xlstm.XLSTMConfig" }, "name": "xlstm", "_CLASS_": "olmo_core.nn.transformer.config.TransformerBlockConfig" }, "depooling": "hnet", "add_norm_before_first_block": true, "add_norm_onto_residual": false, "add_in_projection": true, "add_projected_patch_residuals": false, "hnet_smooth": false, "hnet_smooth_ste": false, "hnet_modulate": false, "blt_compat": false, "fuse_boundaries": true, "no_boundaries": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.bolmo.config.LocalDecoderConfig" }, "share_blocks_between_teacher_and_student": false, "_CLASS_": "olmo_core.nn.transformer.config.TransformerConfig" }, "dataset": { "tokenizer": { "vocab_size": 520, "eos_token_id": 1, "pad_token_id": 0, "bos_token_id": 1, "special_tokens": [ "", "", "", "" ], "special_tokens_first": true, "original_identifier": "allenai/dolma2-tokenizer", "bpe_token_end_id": 3, "_CLASS_": "olmo_core.data.tokenizer.ByteTokenizerConfig" }, "paths": [], "expand_glob": false, "include_instance_metadata": true, "work_dir": "", "ignore_fingerprint_mismatch": false, "sequence_length": 4096, "generate_doc_lengths": false, "byte_sequence_length": 24576, "_CLASS_": "olmo_core.data.numpy_dataset.NumpyByteFSLDatasetConfig" }, "data_loader": { "global_batch_size": 1572864, "seed": 1234, "num_workers": 24, "ignore_fingerprint_mismatch": false, "_CLASS_": "olmo_core.data.data_loader.NumpyDataLoaderConfig" }, "train_module": { "rank_microbatch_size": 49152, "max_sequence_length": 24576, "optim": { "group_overrides": [ { "params": [ "local_encoder.embedding.weight", "local_encoder.expanded_embeddings.weight" ], "opts": { "weight_decay": 0.0 }, "_CLASS_": "olmo_core.optim.config.OptimGroupOverride" }, { "params": [ "blocks.*" ], "opts": { "lr": 1.83e-05 }, "_CLASS_": "olmo_core.optim.config.OptimGroupOverride" } ], "compile": false, "fixed_fields": [ "initial_lr" ], "lr": 3.66e-05, "betas": [ 0.9, 0.95 ], "eps": 1e-08, "weight_decay": 0.1, "_CLASS_": "olmo_core.optim.adamw.AdamWConfig" }, "max_grad_norm": 0.5, "scheduler": { "lr_field": "lr", "initial_lr_field": "initial_lr", "units": "steps", "alpha_f": 0.0, "warmup_fraction": 0.1, "warmup_min_lr": 0.0, "_CLASS_": "olmo_core.optim.scheduler.LinearWithWarmup" }, "compile_model": true, "float8_config": { "enabled": false, "_CLASS_": "olmo_core.float8.Float8Config" }, "dp_config": { "name": "fsdp", "param_dtype": "bfloat16", "reduce_dtype": "float32", "wrapping_strategy": "full", "prefetch_factor": 0, "_CLASS_": "olmo_core.train.train_module.transformer.config.TransformerDataParallelConfig" }, "bolmo_config": { "tokenizer": { "vocab_size": 520, "eos_token_id": 1, "pad_token_id": 0, "bos_token_id": 1, "special_tokens": [ "", "", "", "" ], "special_tokens_first": true, "original_identifier": "allenai/dolma2-tokenizer", "bpe_token_end_id": 3, "_CLASS_": "olmo_core.data.tokenizer.ByteTokenizerConfig" }, "losses": [ "ce", "boundary" ], "loss_weights": [ 1.0, 4.0 ], "binarization_temp": 1.0, "temperature": 1.0, "div_fn": "tvd_temp_limit", "boundary_mode": "end", "merge_boundary_loss": false, "use_output_boundary_jsd": false, "eval_add_boundary_logp": false, "do_alm_debiasing": false, "rep_compare_fn": "l2", "start_ratio": 4.3, "target_ratio": 4.3, "gradual_boundary_compression_steps": 150000, "encoder_loss_lookahead": 0, "encoder_loss_no_lookahead_weight": 1.0, "encoder_loss_lookahead_weights": [], "patching": "dolma2", "epsilon": 1e-06, "skip_blocks": false, "skip_teacher_blocks": false, "skip_teacher": true, "compute_teacher_ce": false, "use_student_patch_reps_for_teacher": false, "use_oracle_patch_reps": false, "teacher_blocks_no_grad": true, "student_blocks_no_grad": false, "decoder_backprop_through_encoder": true, "decoder_backprop_through_boundary_predictor": true, "boundary_predictor_backprop_through_encoder": true, "teacher_force_boundaries": false, "boundary_threshold": "sample:0", "xlstm_igate_bias_init": -10.0, "skip_boundary_before_eos": true, "_CLASS_": "olmo_core.nn.bolmo.config.BolmoConfig" }, "label_ignore_index": -100, "_CLASS_": "olmo_core.train.train_module.transformer.config.TransformerTrainModuleConfig" }, "trainer": {}, "init_seed": 12536, "_CLASS_": "__main__.ExperimentConfig" }