{
    "model": {
        "d_model": 4096,
        "vocab_size": 640,
        "n_layers": 32,
        "block": {
            "attention": {
                "name": "default",
                "n_heads": 32,
                "bias": false,
                "rope": {
                    "name": "default",
                    "theta": 500000,
                    "full_precision": true,
                    "_CLASS_": "olmo_core.nn.rope.RoPEConfig"
                },
                "qk_norm": {
                    "name": "rms",
                    "eps": 1e-06,
                    "bias": false,
                    "dtype": "float32",
                    "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
                },
                "use_flash": true,
                "backend": "flash_2",
                "dtype": "float32",
                "sliding_window": {
                    "pattern": [
                        4096,
                        4096,
                        4096,
                        -1
                    ],
                    "force_full_attention_on_first_layer": false,
                    "force_full_attention_on_last_layer": true,
                    "_CLASS_": "olmo_core.nn.attention.SlidingWindowAttentionConfig"
                },
                "_CLASS_": "olmo_core.nn.attention.AttentionConfig"
            },
            "layer_norm": {
                "name": "rms",
                "eps": 1e-06,
                "bias": false,
                "dtype": "float32",
                "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
            },
            "feed_forward": {
                "hidden_size": 11008,
                "name": "default",
                "bias": false,
                "dtype": "float32",
                "act_name": "silu",
                "_CLASS_": "olmo_core.nn.feed_forward.FeedForwardConfig"
            },
            "name": "reordered_norm",
            "_CLASS_": "olmo_core.nn.transformer.config.TransformerBlockConfig"
        },
        "lm_head": {
            "name": "default",
            "layer_norm": {
                "name": "rms",
                "eps": 1e-06,
                "bias": false,
                "dtype": "float32",
                "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
            },
            "bias": false,
            "dtype": "float32",
            "loss_implementation": "default",
            "_CLASS_": "olmo_core.nn.lm_head.LMHeadConfig"
        },
        "name": "bolmo_distill",
        "dtype": "float32",
        "init_method": "normal",
        "init_seed": 0,
        "init_std": 0.02,
        "freeze_params": [
            "boundary_predictor.*",
            "teacher_embeddings.*"
        ],
        "local_encoder": {
            "sliding_window_size": 0,
            "d_model": 4096,
            "n_layers": 1,
            "block_config": {
                "attention": {
                    "name": "default",
                    "n_heads": 16,
                    "dtype": "float32",
                    "_CLASS_": "olmo_core.nn.attention.AttentionConfig"
                },
                "layer_norm": {
                    "name": "rms",
                    "eps": 1e-06,
                    "bias": false,
                    "dtype": "float32",
                    "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
                },
                "feed_forward": {
                    "hidden_size": 5504,
                    "name": "default",
                    "bias": false,
                    "dtype": "float32",
                    "act_name": "silu",
                    "_CLASS_": "olmo_core.nn.feed_forward.FeedForwardConfig"
                },
                "xlstm": {
                    "num_heads": 16,
                    "dtype": "float32",
                    "_CLASS_": "olmo_core.nn.xlstm.XLSTMConfig"
                },
                "name": "xlstm",
                "_CLASS_": "olmo_core.nn.transformer.config.TransformerBlockConfig"
            },
            "cross_attn_n_heads": 0,
            "cross_attn_do_project": true,
            "cross_attn_init_pooling": "amax",
            "pooling": "hnet",
            "add_hash_embeddings": false,
            "add_expanded_embeddings": true,
            "hash_byte_group_size": [
                3,
                4,
                5,
                6,
                7,
                8
            ],
            "hash_byte_group_vocab": [
                1536,
                3072,
                6144,
                12288,
                24576,
                49152
            ],
            "hash_byte_group_nb_functions": 1,
            "add_norm_after_last_block": true,
            "add_norm_after_pool": false,
            "add_out_projection": true,
            "boundary_predictor": "hnet",
            "boundary_predictor_lookahead": 1,
            "represent_bytes_with_embeddings": false,
            "represent_bytes_with_last_mixed_out": false,
            "blt_compat": false,
            "dtype": "float32",
            "_CLASS_": "olmo_core.nn.bolmo.config.LocalEncoderConfig"
        },
        "local_decoder": {
            "sliding_window_size": 0,
            "d_model": 4096,
            "n_layers": 4,
            "cross_attn_n_heads": 0,
            "block_config": {
                "attention": {
                    "name": "default",
                    "n_heads": 16,
                    "dtype": "float32",
                    "_CLASS_": "olmo_core.nn.attention.AttentionConfig"
                },
                "layer_norm": {
                    "name": "rms",
                    "eps": 1e-06,
                    "bias": false,
                    "dtype": "float32",
                    "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
                },
                "feed_forward": {
                    "hidden_size": 5504,
                    "name": "default",
                    "bias": false,
                    "dtype": "float32",
                    "act_name": "silu",
                    "_CLASS_": "olmo_core.nn.feed_forward.FeedForwardConfig"
                },
                "xlstm": {
                    "num_heads": 16,
                    "dtype": "float32",
                    "_CLASS_": "olmo_core.nn.xlstm.XLSTMConfig"
                },
                "name": "xlstm",
                "_CLASS_": "olmo_core.nn.transformer.config.TransformerBlockConfig"
            },
            "depooling": "hnet",
            "add_norm_before_first_block": true,
            "add_norm_onto_residual": false,
            "add_in_projection": true,
            "add_projected_patch_residuals": false,
            "hnet_smooth": false,
            "hnet_smooth_ste": false,
            "hnet_modulate": false,
            "blt_compat": false,
            "fuse_boundaries": true,
            "no_boundaries": false,
            "dtype": "float32",
            "_CLASS_": "olmo_core.nn.bolmo.config.LocalDecoderConfig"
        },
        "share_blocks_between_teacher_and_student": false,
        "_CLASS_": "olmo_core.nn.transformer.config.TransformerConfig"
    },
    "dataset": {
        "tokenizer": {
            "vocab_size": 520,
            "eos_token_id": 1,
            "pad_token_id": 0,
            "bos_token_id": 1,
            "special_tokens": [
                "<pad>",
                "<bos>",
                "<eos>",
                "<bpe_token_end>"
            ],
            "special_tokens_first": true,
            "original_identifier": "allenai/dolma2-tokenizer",
            "bpe_token_end_id": 3,
            "_CLASS_": "olmo_core.data.tokenizer.ByteTokenizerConfig"
        },
        "paths": [],
        "expand_glob": false,
        "include_instance_metadata": true,
        "work_dir": "",
        "ignore_fingerprint_mismatch": false,
        "sequence_length": 4096,
        "generate_doc_lengths": false,
        "byte_sequence_length": 24576,
        "_CLASS_": "olmo_core.data.numpy_dataset.NumpyByteFSLDatasetConfig"
    },
    "data_loader": {
        "global_batch_size": 1572864,
        "seed": 1234,
        "num_workers": 24,
        "ignore_fingerprint_mismatch": false,
        "_CLASS_": "olmo_core.data.data_loader.NumpyDataLoaderConfig"
    },
    "train_module": {
        "rank_microbatch_size": 49152,
        "max_sequence_length": 24576,
        "optim": {
            "group_overrides": [
                {
                    "params": [
                        "local_encoder.embedding.weight",
                        "local_encoder.expanded_embeddings.weight"
                    ],
                    "opts": {
                        "weight_decay": 0.0
                    },
                    "_CLASS_": "olmo_core.optim.config.OptimGroupOverride"
                },
                {
                    "params": [
                        "blocks.*"
                    ],
                    "opts": {
                        "lr": 1.83e-05
                    },
                    "_CLASS_": "olmo_core.optim.config.OptimGroupOverride"
                }
            ],
            "compile": false,
            "fixed_fields": [
                "initial_lr"
            ],
            "lr": 3.66e-05,
            "betas": [
                0.9,
                0.95
            ],
            "eps": 1e-08,
            "weight_decay": 0.1,
            "_CLASS_": "olmo_core.optim.adamw.AdamWConfig"
        },
        "max_grad_norm": 0.5,
        "scheduler": {
            "lr_field": "lr",
            "initial_lr_field": "initial_lr",
            "units": "steps",
            "alpha_f": 0.0,
            "warmup_fraction": 0.1,
            "warmup_min_lr": 0.0,
            "_CLASS_": "olmo_core.optim.scheduler.LinearWithWarmup"
        },
        "compile_model": true,
        "float8_config": {
            "enabled": false,
            "_CLASS_": "olmo_core.float8.Float8Config"
        },
        "dp_config": {
            "name": "fsdp",
            "param_dtype": "bfloat16",
            "reduce_dtype": "float32",
            "wrapping_strategy": "full",
            "prefetch_factor": 0,
            "_CLASS_": "olmo_core.train.train_module.transformer.config.TransformerDataParallelConfig"
        },
        "bolmo_config": {
            "tokenizer": {
                "vocab_size": 520,
                "eos_token_id": 1,
                "pad_token_id": 0,
                "bos_token_id": 1,
                "special_tokens": [
                    "<pad>",
                    "<bos>",
                    "<eos>",
                    "<bpe_token_end>"
                ],
                "special_tokens_first": true,
                "original_identifier": "allenai/dolma2-tokenizer",
                "bpe_token_end_id": 3,
                "_CLASS_": "olmo_core.data.tokenizer.ByteTokenizerConfig"
            },
            "losses": [
                "ce",
                "boundary"
            ],
            "loss_weights": [
                1.0,
                4.0
            ],
            "binarization_temp": 1.0,
            "temperature": 1.0,
            "div_fn": "tvd_temp_limit",
            "boundary_mode": "end",
            "merge_boundary_loss": false,
            "use_output_boundary_jsd": false,
            "eval_add_boundary_logp": false,
            "do_alm_debiasing": false,
            "rep_compare_fn": "l2",
            "start_ratio": 4.3,
            "target_ratio": 4.3,
            "gradual_boundary_compression_steps": 150000,
            "encoder_loss_lookahead": 0,
            "encoder_loss_no_lookahead_weight": 1.0,
            "encoder_loss_lookahead_weights": [],
            "patching": "dolma2",
            "epsilon": 1e-06,
            "skip_blocks": false,
            "skip_teacher_blocks": false,
            "skip_teacher": true,
            "compute_teacher_ce": false,
            "use_student_patch_reps_for_teacher": false,
            "use_oracle_patch_reps": false,
            "teacher_blocks_no_grad": true,
            "student_blocks_no_grad": false,
            "decoder_backprop_through_encoder": true,
            "decoder_backprop_through_boundary_predictor": true,
            "boundary_predictor_backprop_through_encoder": true,
            "teacher_force_boundaries": false,
            "boundary_threshold": "sample:0",
            "xlstm_igate_bias_init": -10.0,
            "skip_boundary_before_eos": true,
            "_CLASS_": "olmo_core.nn.bolmo.config.BolmoConfig"
        },
        "label_ignore_index": -100,
        "_CLASS_": "olmo_core.train.train_module.transformer.config.TransformerTrainModuleConfig"
    },
    "trainer": {},
    "init_seed": 12536,
    "_CLASS_": "__main__.ExperimentConfig"
}