Bolmo-7B / olmo_core / config.json
{
  "model": {
    "d_model": 4096,
    "vocab_size": 640,
    "n_layers": 32,
    "block": {
      "attention": {
        "name": "default",
        "n_heads": 32,
        "bias": false,
        "rope": {
          "name": "default",
          "theta": 500000,
          "full_precision": true,
          "_CLASS_": "olmo_core.nn.rope.RoPEConfig"
        },
        "qk_norm": {
          "name": "rms",
          "eps": 1e-06,
          "bias": false,
          "dtype": "float32",
          "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
        },
        "use_flash": true,
        "backend": "flash_2",
        "dtype": "float32",
        "sliding_window": {
          "pattern": [
            4096,
            4096,
            4096,
            -1
          ],
          "force_full_attention_on_first_layer": false,
          "force_full_attention_on_last_layer": true,
          "_CLASS_": "olmo_core.nn.attention.SlidingWindowAttentionConfig"
        },
        "_CLASS_": "olmo_core.nn.attention.AttentionConfig"
      },
      "layer_norm": {
        "name": "rms",
        "eps": 1e-06,
        "bias": false,
        "dtype": "float32",
        "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
      },
      "feed_forward": {
        "hidden_size": 11008,
        "name": "default",
        "bias": false,
        "dtype": "float32",
        "act_name": "silu",
        "_CLASS_": "olmo_core.nn.feed_forward.FeedForwardConfig"
      },
      "name": "reordered_norm",
      "_CLASS_": "olmo_core.nn.transformer.config.TransformerBlockConfig"
    },
    "lm_head": {
      "name": "default",
      "layer_norm": {
        "name": "rms",
        "eps": 1e-06,
        "bias": false,
        "dtype": "float32",
        "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
      },
      "bias": false,
      "dtype": "float32",
      "loss_implementation": "default",
      "_CLASS_": "olmo_core.nn.lm_head.LMHeadConfig"
    },
    "name": "bolmo_distill",
    "dtype": "float32",
    "init_method": "normal",
    "init_seed": 0,
    "init_std": 0.02,
    "freeze_params": [
      "boundary_predictor.*",
      "teacher_embeddings.*"
    ],
    "local_encoder": {
      "sliding_window_size": 0,
      "d_model": 4096,
      "n_layers": 1,
      "block_config": {
        "attention": {
          "name": "default",
          "n_heads": 16,
          "dtype": "float32",
          "_CLASS_": "olmo_core.nn.attention.AttentionConfig"
        },
        "layer_norm": {
          "name": "rms",
          "eps": 1e-06,
          "bias": false,
          "dtype": "float32",
          "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
        },
        "feed_forward": {
          "hidden_size": 5504,
          "name": "default",
          "bias": false,
          "dtype": "float32",
          "act_name": "silu",
          "_CLASS_": "olmo_core.nn.feed_forward.FeedForwardConfig"
        },
        "xlstm": {
          "num_heads": 16,
          "dtype": "float32",
          "_CLASS_": "olmo_core.nn.xlstm.XLSTMConfig"
        },
        "name": "xlstm",
        "_CLASS_": "olmo_core.nn.transformer.config.TransformerBlockConfig"
      },
      "cross_attn_n_heads": 0,
      "cross_attn_do_project": true,
      "cross_attn_init_pooling": "amax",
      "pooling": "hnet",
      "add_hash_embeddings": false,
      "add_expanded_embeddings": true,
      "hash_byte_group_size": [
        3,
        4,
        5,
        6,
        7,
        8
      ],
      "hash_byte_group_vocab": [
        1536,
        3072,
        6144,
        12288,
        24576,
        49152
      ],
      "hash_byte_group_nb_functions": 1,
      "add_norm_after_last_block": true,
      "add_norm_after_pool": false,
      "add_out_projection": true,
      "boundary_predictor": "hnet",
      "boundary_predictor_lookahead": 1,
      "represent_bytes_with_embeddings": false,
      "represent_bytes_with_last_mixed_out": false,
      "blt_compat": false,
      "dtype": "float32",
      "_CLASS_": "olmo_core.nn.bolmo.config.LocalEncoderConfig"
    },
    "local_decoder": {
      "sliding_window_size": 0,
      "d_model": 4096,
      "n_layers": 4,
      "cross_attn_n_heads": 0,
      "block_config": {
        "attention": {
          "name": "default",
          "n_heads": 16,
          "dtype": "float32",
          "_CLASS_": "olmo_core.nn.attention.AttentionConfig"
        },
        "layer_norm": {
          "name": "rms",
          "eps": 1e-06,
          "bias": false,
          "dtype": "float32",
          "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
        },
        "feed_forward": {
          "hidden_size": 5504,
          "name": "default",
          "bias": false,
          "dtype": "float32",
          "act_name": "silu",
          "_CLASS_": "olmo_core.nn.feed_forward.FeedForwardConfig"
        },
        "xlstm": {
          "num_heads": 16,
          "dtype": "float32",
          "_CLASS_": "olmo_core.nn.xlstm.XLSTMConfig"
        },
        "name": "xlstm",
        "_CLASS_": "olmo_core.nn.transformer.config.TransformerBlockConfig"
      },
      "depooling": "hnet",
      "add_norm_before_first_block": true,
      "add_norm_onto_residual": false,
      "add_in_projection": true,
      "add_projected_patch_residuals": false,
      "hnet_smooth": false,
      "hnet_smooth_ste": false,
      "hnet_modulate": false,
      "blt_compat": false,
      "fuse_boundaries": true,
      "no_boundaries": false,
      "dtype": "float32",
      "_CLASS_": "olmo_core.nn.bolmo.config.LocalDecoderConfig"
    },
    "share_blocks_between_teacher_and_student": false,
    "_CLASS_": "olmo_core.nn.transformer.config.TransformerConfig"
  },
  "dataset": {
    "tokenizer": {
      "vocab_size": 520,
      "eos_token_id": 1,
      "pad_token_id": 0,
      "bos_token_id": 1,
      "special_tokens": [
        "<pad>",
        "<bos>",
        "<eos>",
        "<bpe_token_end>"
      ],
      "special_tokens_first": true,
      "original_identifier": "allenai/dolma2-tokenizer",
      "bpe_token_end_id": 3,
      "_CLASS_": "olmo_core.data.tokenizer.ByteTokenizerConfig"
    },
    "paths": [],
    "expand_glob": false,
    "include_instance_metadata": true,
    "work_dir": "",
    "ignore_fingerprint_mismatch": false,
    "sequence_length": 4096,
    "generate_doc_lengths": false,
    "byte_sequence_length": 24576,
    "_CLASS_": "olmo_core.data.numpy_dataset.NumpyByteFSLDatasetConfig"
  },
  "data_loader": {
    "global_batch_size": 1572864,
    "seed": 1234,
    "num_workers": 24,
    "ignore_fingerprint_mismatch": false,
    "_CLASS_": "olmo_core.data.data_loader.NumpyDataLoaderConfig"
  },
  "train_module": {
    "rank_microbatch_size": 49152,
    "max_sequence_length": 24576,
    "optim": {
      "group_overrides": [
        {
          "params": [
            "local_encoder.embedding.weight",
            "local_encoder.expanded_embeddings.weight"
          ],
          "opts": {
            "weight_decay": 0.0
          },
          "_CLASS_": "olmo_core.optim.config.OptimGroupOverride"
        },
        {
          "params": [
            "blocks.*"
          ],
          "opts": {
            "lr": 1.83e-05
          },
          "_CLASS_": "olmo_core.optim.config.OptimGroupOverride"
        }
      ],
      "compile": false,
      "fixed_fields": [
        "initial_lr"
      ],
      "lr": 3.66e-05,
      "betas": [
        0.9,
        0.95
      ],
      "eps": 1e-08,
      "weight_decay": 0.1,
      "_CLASS_": "olmo_core.optim.adamw.AdamWConfig"
    },
    "max_grad_norm": 0.5,
    "scheduler": {
      "lr_field": "lr",
      "initial_lr_field": "initial_lr",
      "units": "steps",
      "alpha_f": 0.0,
      "warmup_fraction": 0.1,
      "warmup_min_lr": 0.0,
      "_CLASS_": "olmo_core.optim.scheduler.LinearWithWarmup"
    },
    "compile_model": true,
    "float8_config": {
      "enabled": false,
      "_CLASS_": "olmo_core.float8.Float8Config"
    },
    "dp_config": {
      "name": "fsdp",
      "param_dtype": "bfloat16",
      "reduce_dtype": "float32",
      "wrapping_strategy": "full",
      "prefetch_factor": 0,
      "_CLASS_": "olmo_core.train.train_module.transformer.config.TransformerDataParallelConfig"
    },
    "bolmo_config": {
      "tokenizer": {
        "vocab_size": 520,
        "eos_token_id": 1,
        "pad_token_id": 0,
        "bos_token_id": 1,
        "special_tokens": [
          "<pad>",
          "<bos>",
          "<eos>",
          "<bpe_token_end>"
        ],
        "special_tokens_first": true,
        "original_identifier": "allenai/dolma2-tokenizer",
        "bpe_token_end_id": 3,
        "_CLASS_": "olmo_core.data.tokenizer.ByteTokenizerConfig"
      },
      "losses": [
        "ce",
        "boundary"
      ],
      "loss_weights": [
        1.0,
        4.0
      ],
      "binarization_temp": 1.0,
      "temperature": 1.0,
      "div_fn": "tvd_temp_limit",
      "boundary_mode": "end",
      "merge_boundary_loss": false,
      "use_output_boundary_jsd": false,
      "eval_add_boundary_logp": false,
      "do_alm_debiasing": false,
      "rep_compare_fn": "l2",
      "start_ratio": 4.3,
      "target_ratio": 4.3,
      "gradual_boundary_compression_steps": 150000,
      "encoder_loss_lookahead": 0,
      "encoder_loss_no_lookahead_weight": 1.0,
      "encoder_loss_lookahead_weights": [],
      "patching": "dolma2",
      "epsilon": 1e-06,
      "skip_blocks": false,
      "skip_teacher_blocks": false,
      "skip_teacher": true,
      "compute_teacher_ce": false,
      "use_student_patch_reps_for_teacher": false,
      "use_oracle_patch_reps": false,
      "teacher_blocks_no_grad": true,
      "student_blocks_no_grad": false,
      "decoder_backprop_through_encoder": true,
      "decoder_backprop_through_boundary_predictor": true,
      "boundary_predictor_backprop_through_encoder": true,
      "teacher_force_boundaries": false,
      "boundary_threshold": "sample:0",
      "xlstm_igate_bias_init": -10.0,
      "skip_boundary_before_eos": true,
      "_CLASS_": "olmo_core.nn.bolmo.config.BolmoConfig"
    },
    "label_ignore_index": -100,
    "_CLASS_": "olmo_core.train.train_module.transformer.config.TransformerTrainModuleConfig"
  },
  "trainer": {},
  "init_seed": 12536,
  "_CLASS_": "__main__.ExperimentConfig"
}
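A minimal sketch for loading and spot-checking this file with the Python standard library only. The local filename "config.json" is an assumption (the file would first need to be downloaded from the repository), and reconstructing the actual olmo_core objects named in the "_CLASS_" entries is left to the olmo_core training code; this sketch only inspects the raw JSON.

import json

# Load the experiment config as plain JSON (assumes the file was saved locally as config.json).
with open("config.json") as f:
    cfg = json.load(f)

model = cfg["model"]
print(model["_CLASS_"])                     # olmo_core.nn.transformer.config.TransformerConfig
print(model["d_model"], model["n_layers"])  # 4096 32
print(model["local_encoder"]["n_layers"],   # 1 local encoder block,
      model["local_decoder"]["n_layers"])   # 4 local decoder blocks

# The byte tokenizer config appears twice (under "dataset" and under
# "train_module.bolmo_config"); both copies are identical in this file.
tok = cfg["dataset"]["tokenizer"]
assert tok == cfg["train_module"]["bolmo_config"]["tokenizer"]
print(tok["vocab_size"], tok["special_tokens"])  # 520 ['<pad>', '<bos>', '<eos>', '<bpe_token_end>']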