{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 565, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 240.77734375, "completions/mean_terminated_length": 240.77734375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.16800164990127087, "epoch": 0.0017699115044247787, "frac_reward_zero_std": 0.0, "grad_norm": 0.26899288624647755, "learning_rate": 0.0, "loss": -0.003, "num_tokens": 464071.0, "reward": 0.48417970538139343, "reward_std": 0.4741697609424591, "rewards/execution_accuracy_EX/mean": 0.45703125, "rewards/execution_accuracy_EX/std": 0.4991260766983032, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9883767366409302, "sampling/importance_sampling_ratio/min": 0.006748078390955925, "sampling/sampling_logp_difference/max": 4.998497486114502, "sampling/sampling_logp_difference/mean": 0.13759836554527283, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 196.4765625, "completions/mean_terminated_length": 196.4765625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.1572934128344059, "epoch": 0.0035398230088495575, "frac_reward_zero_std": 0.0, "grad_norm": 0.43014753967539887, "learning_rate": 1.7543859649122805e-08, "loss": 0.0014, "num_tokens": 977745.0, "reward": 0.517578125, "reward_std": 0.47587236762046814, "rewards/execution_accuracy_EX/mean": 0.4921875, "rewards/execution_accuracy_EX/std": 0.5009182691574097, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9887896776199341, "sampling/importance_sampling_ratio/min": 0.0009695081971585751, "sampling/sampling_logp_difference/max": 6.938721656799316, "sampling/sampling_logp_difference/mean": 0.13283054530620575, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 229.14453125, "completions/mean_terminated_length": 229.14453125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.1506793014705181, "epoch": 0.005309734513274336, "frac_reward_zero_std": 0.0, "grad_norm": 0.2460889331215385, "learning_rate": 3.508771929824561e-08, "loss": 0.0061, "num_tokens": 1405414.0, "reward": 0.5843750238418579, "reward_std": 0.47219762206077576, "rewards/execution_accuracy_EX/mean": 0.5625, "rewards/execution_accuracy_EX/std": 0.49705013632774353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9899247884750366, "sampling/importance_sampling_ratio/min": 0.011136534623801708, "sampling/sampling_logp_difference/max": 4.497524261474609, "sampling/sampling_logp_difference/mean": 0.12096783518791199, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 204.56640625, "completions/mean_terminated_length": 204.56640625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.17143337801098824, "epoch": 0.007079646017699115, "frac_reward_zero_std": 0.0, "grad_norm": 0.16121964031093983, "learning_rate": 5.2631578947368416e-08, "loss": 0.0004, "num_tokens": 2009431.0, "reward": 0.47304683923721313, "reward_std": 0.47307515144348145, "rewards/execution_accuracy_EX/mean": 0.4453125, "rewards/execution_accuracy_EX/std": 0.49797385931015015, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9882920980453491, "sampling/importance_sampling_ratio/min": 0.008775105699896812, "sampling/sampling_logp_difference/max": 4.735836505889893, "sampling/sampling_logp_difference/mean": 0.14046800136566162, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 222.2109375, "completions/mean_terminated_length": 222.2109375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.1771502736955881, "epoch": 0.008849557522123894, "frac_reward_zero_std": 0.0, "grad_norm": 0.4317320322048489, "learning_rate": 7.017543859649122e-08, "loss": 0.0139, "num_tokens": 2565437.0, "reward": 0.7253906726837158, "reward_std": 0.4315042495727539, "rewards/execution_accuracy_EX/mean": 0.7109375, "rewards/execution_accuracy_EX/std": 0.45421501994132996, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9947786331176758, "sampling/importance_sampling_ratio/min": 0.011136677116155624, "sampling/sampling_logp_difference/max": 4.497511386871338, "sampling/sampling_logp_difference/mean": 0.13672447204589844, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 212.046875, "completions/mean_terminated_length": 212.046875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.15770719200372696, "epoch": 0.010619469026548672, "frac_reward_zero_std": 0.0, "grad_norm": 0.36976823638973716, "learning_rate": 8.771929824561403e-08, "loss": 0.0003, "num_tokens": 3020473.0, "reward": 0.45820313692092896, "reward_std": 0.471201092004776, "rewards/execution_accuracy_EX/mean": 0.4296875, "rewards/execution_accuracy_EX/std": 0.4960011839866638, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9911689758300781, "sampling/importance_sampling_ratio/min": 0.014309810474514961, "sampling/sampling_logp_difference/max": 4.246809959411621, "sampling/sampling_logp_difference/mean": 0.1317376345396042, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 214.2734375, "completions/mean_terminated_length": 214.2734375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.16596073657274246, "epoch": 0.012389380530973451, "frac_reward_zero_std": 0.0, "grad_norm": 0.4258994461942591, "learning_rate": 1.0526315789473683e-07, "loss": 0.011, "num_tokens": 3576879.0, "reward": 0.6214843988418579, "reward_std": 0.46600866317749023, "rewards/execution_accuracy_EX/mean": 0.6015625, "rewards/execution_accuracy_EX/std": 0.4905354380607605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.991350531578064, "sampling/importance_sampling_ratio/min": 0.011154413223266602, "sampling/sampling_logp_difference/max": 4.495920181274414, "sampling/sampling_logp_difference/mean": 0.13485443592071533, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 215.6171875, "completions/mean_terminated_length": 215.6171875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.1521406676620245, "epoch": 0.01415929203539823, "frac_reward_zero_std": 0.0, "grad_norm": 0.32626688667132286, "learning_rate": 1.2280701754385964e-07, "loss": -0.0054, "num_tokens": 3908813.0, "reward": 0.502734363079071, "reward_std": 0.47540730237960815, "rewards/execution_accuracy_EX/mean": 0.4765625, "rewards/execution_accuracy_EX/std": 0.5004287362098694, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9880743026733398, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.13180431723594666, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 226.26953125, "completions/mean_terminated_length": 226.26953125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.15583448484539986, "epoch": 0.01592920353982301, "frac_reward_zero_std": 0.0, "grad_norm": 0.37532224908039835, "learning_rate": 1.4035087719298244e-07, "loss": 0.0151, "num_tokens": 4292898.0, "reward": 0.5843750238418579, "reward_std": 0.47219759225845337, "rewards/execution_accuracy_EX/mean": 0.5625, "rewards/execution_accuracy_EX/std": 0.49705013632774353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9915869235992432, "sampling/importance_sampling_ratio/min": 0.005292921327054501, "sampling/sampling_logp_difference/max": 5.241384983062744, "sampling/sampling_logp_difference/mean": 0.1297648698091507, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 219.03125, "completions/mean_terminated_length": 219.03125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.1605638675391674, "epoch": 0.017699115044247787, "frac_reward_zero_std": 0.0, "grad_norm": 0.32937675464045957, "learning_rate": 1.5789473684210525e-07, "loss": 0.0007, "num_tokens": 4669290.0, "reward": 0.4990234375, "reward_std": 0.4752182364463806, "rewards/execution_accuracy_EX/mean": 0.47265625, "rewards/execution_accuracy_EX/std": 0.5002297759056091, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9915338754653931, "sampling/importance_sampling_ratio/min": 0.011154407635331154, "sampling/sampling_logp_difference/max": 4.495920658111572, "sampling/sampling_logp_difference/mean": 0.1286957561969757, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 215.46484375, "completions/mean_terminated_length": 215.46484375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.1649212446063757, "epoch": 0.019469026548672566, "frac_reward_zero_std": 0.0, "grad_norm": 0.39580334353692825, "learning_rate": 1.7543859649122805e-07, "loss": -0.0117, "num_tokens": 5107425.0, "reward": 0.36542966961860657, "reward_std": 0.44827139377593994, "rewards/execution_accuracy_EX/mean": 0.33203125, "rewards/execution_accuracy_EX/std": 0.4718646705150604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9916530847549438, "sampling/importance_sampling_ratio/min": 0.011154402047395706, "sampling/sampling_logp_difference/max": 4.4959211349487305, "sampling/sampling_logp_difference/mean": 0.13201645016670227, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 209.59765625, "completions/mean_terminated_length": 209.59765625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.15810802951455116, "epoch": 0.021238938053097345, "frac_reward_zero_std": 0.0, "grad_norm": 0.19899075420567763, "learning_rate": 1.9298245614035086e-07, "loss": -0.0015, "num_tokens": 5582698.0, "reward": 0.5732421875, "reward_std": 0.47346949577331543, "rewards/execution_accuracy_EX/mean": 0.55078125, "rewards/execution_accuracy_EX/std": 0.49838894605636597, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9912006855010986, "sampling/importance_sampling_ratio/min": 0.003199489787220955, "sampling/sampling_logp_difference/max": 5.7447638511657715, "sampling/sampling_logp_difference/mean": 0.1252116858959198, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 238.0703125, "completions/mean_terminated_length": 238.0703125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.18869901075959206, "epoch": 0.023008849557522124, "frac_reward_zero_std": 0.0, "grad_norm": 0.3439687256876209, "learning_rate": 2.1052631578947366e-07, "loss": 0.0054, "num_tokens": 6027484.0, "reward": 0.3505859375, "reward_std": 0.44268524646759033, "rewards/execution_accuracy_EX/mean": 0.31640625, "rewards/execution_accuracy_EX/std": 0.4659844934940338, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9924343824386597, "sampling/importance_sampling_ratio/min": 0.008684132248163223, "sampling/sampling_logp_difference/max": 4.746257781982422, "sampling/sampling_logp_difference/mean": 0.1439538300037384, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 225.69921875, "completions/mean_terminated_length": 225.69921875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.15718093514442444, "epoch": 0.024778761061946902, "frac_reward_zero_std": 0.0, "grad_norm": 0.4053142697711595, "learning_rate": 2.2807017543859647e-07, "loss": -0.0031, "num_tokens": 6373727.0, "reward": 0.4916015565395355, "reward_std": 0.4747525453567505, "rewards/execution_accuracy_EX/mean": 0.46484375, "rewards/execution_accuracy_EX/std": 0.49973952770233154, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9912569522857666, "sampling/importance_sampling_ratio/min": 0.011183848604559898, "sampling/sampling_logp_difference/max": 4.493284702301025, "sampling/sampling_logp_difference/mean": 0.1230582520365715, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 190.36328125, "completions/mean_terminated_length": 190.36328125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.16634388826787472, "epoch": 0.02654867256637168, "frac_reward_zero_std": 0.0, "grad_norm": 0.4612921435624558, "learning_rate": 2.456140350877193e-07, "loss": 0.0042, "num_tokens": 6859052.0, "reward": 0.6363281011581421, "reward_std": 0.46267399191856384, "rewards/execution_accuracy_EX/mean": 0.6171875, "rewards/execution_accuracy_EX/std": 0.48702529072761536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9878740906715393, "sampling/importance_sampling_ratio/min": 0.014339085668325424, "sampling/sampling_logp_difference/max": 4.2447662353515625, "sampling/sampling_logp_difference/mean": 0.13844618201255798, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 203.71875, "completions/mean_terminated_length": 203.71875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.15448766387999058, "epoch": 0.02831858407079646, "frac_reward_zero_std": 0.0, "grad_norm": 0.28992343133582726, "learning_rate": 2.631578947368421e-07, "loss": -0.0047, "num_tokens": 7186676.0, "reward": 0.45820313692092896, "reward_std": 0.4712011218070984, "rewards/execution_accuracy_EX/mean": 0.4296875, "rewards/execution_accuracy_EX/std": 0.4960011839866638, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9857417941093445, "sampling/importance_sampling_ratio/min": 0.014740431681275368, "sampling/sampling_logp_difference/max": 4.217161178588867, "sampling/sampling_logp_difference/mean": 0.13474053144454956, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 216.94140625, "completions/mean_terminated_length": 216.94140625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.15913797728717327, "epoch": 0.03008849557522124, "frac_reward_zero_std": 0.0, "grad_norm": 0.22356548494377418, "learning_rate": 2.807017543859649e-07, "loss": -0.004, "num_tokens": 7617845.0, "reward": 0.6585937738418579, "reward_std": 0.4567192792892456, "rewards/execution_accuracy_EX/mean": 0.640625, "rewards/execution_accuracy_EX/std": 0.4807571768760681, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9925726056098938, "sampling/importance_sampling_ratio/min": 0.008900564163923264, "sampling/sampling_logp_difference/max": 4.721640586853027, "sampling/sampling_logp_difference/mean": 0.12318142503499985, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 245.546875, "completions/mean_terminated_length": 245.546875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.1641867607831955, "epoch": 0.03185840707964602, "frac_reward_zero_std": 0.0, "grad_norm": 0.3618873588349014, "learning_rate": 2.982456140350877e-07, "loss": 0.0014, "num_tokens": 8243361.0, "reward": 0.37285155057907104, "reward_std": 0.4508545994758606, "rewards/execution_accuracy_EX/mean": 0.33984375, "rewards/execution_accuracy_EX/std": 0.47458380460739136, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.991119921207428, "sampling/importance_sampling_ratio/min": 0.0025241519324481487, "sampling/sampling_logp_difference/max": 5.9818501472473145, "sampling/sampling_logp_difference/mean": 0.12859690189361572, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 210.50390625, "completions/mean_terminated_length": 210.50390625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.15631975419819355, "epoch": 0.033628318584070796, "frac_reward_zero_std": 0.0, "grad_norm": 0.37756396523373564, "learning_rate": 3.157894736842105e-07, "loss": 0.0031, "num_tokens": 8973794.0, "reward": 0.6437499523162842, "reward_std": 0.4608176648616791, "rewards/execution_accuracy_EX/mean": 0.625, "rewards/execution_accuracy_EX/std": 0.4850712716579437, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.988776445388794, "sampling/importance_sampling_ratio/min": 0.0143876438960433, "sampling/sampling_logp_difference/max": 4.241385459899902, "sampling/sampling_logp_difference/mean": 0.1299668550491333, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 221.0625, "completions/mean_terminated_length": 221.0625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.16394091956317425, "epoch": 0.035398230088495575, "frac_reward_zero_std": 0.0, "grad_norm": 0.330010003255172, "learning_rate": 3.333333333333333e-07, "loss": 0.0028, "num_tokens": 9432866.0, "reward": 0.3246093690395355, "reward_std": 0.4315042495727539, "rewards/execution_accuracy_EX/mean": 0.2890625, "rewards/execution_accuracy_EX/std": 0.45421501994132996, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9900156259536743, "sampling/importance_sampling_ratio/min": 0.014467723667621613, "sampling/sampling_logp_difference/max": 4.235835075378418, "sampling/sampling_logp_difference/mean": 0.13131310045719147, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 228.7734375, "completions/mean_terminated_length": 228.7734375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.1680222861468792, "epoch": 0.03716814159292035, "frac_reward_zero_std": 0.0, "grad_norm": 0.28002706033303626, "learning_rate": 3.508771929824561e-07, "loss": 0.0017, "num_tokens": 9927112.0, "reward": 0.44707030057907104, "reward_std": 0.46948158740997314, "rewards/execution_accuracy_EX/mean": 0.41796875, "rewards/execution_accuracy_EX/std": 0.49419113993644714, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.991767406463623, "sampling/importance_sampling_ratio/min": 0.014291780069470406, "sampling/sampling_logp_difference/max": 4.24807071685791, "sampling/sampling_logp_difference/mean": 0.13199591636657715, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 232.265625, "completions/mean_terminated_length": 232.265625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.16237134858965874, "epoch": 0.03893805309734513, "frac_reward_zero_std": 0.0, "grad_norm": 0.4130597941325235, "learning_rate": 3.684210526315789e-07, "loss": -0.0038, "num_tokens": 10399820.0, "reward": 0.4136718809604645, "reward_std": 0.46267399191856384, "rewards/execution_accuracy_EX/mean": 0.3828125, "rewards/execution_accuracy_EX/std": 0.48702529072761536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.993659257888794, "sampling/importance_sampling_ratio/min": 0.008661825209856033, "sampling/sampling_logp_difference/max": 4.7488298416137695, "sampling/sampling_logp_difference/mean": 0.1262757033109665, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 200.19140625, "completions/mean_terminated_length": 200.19140625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.15257065370678902, "epoch": 0.04070796460176991, "frac_reward_zero_std": 0.0, "grad_norm": 0.2539925648836394, "learning_rate": 3.859649122807017e-07, "loss": -0.0034, "num_tokens": 10959933.0, "reward": 0.6029297113418579, "reward_std": 0.46948155760765076, "rewards/execution_accuracy_EX/mean": 0.58203125, "rewards/execution_accuracy_EX/std": 0.49419113993644714, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9871996641159058, "sampling/importance_sampling_ratio/min": 0.01430963259190321, "sampling/sampling_logp_difference/max": 4.246822357177734, "sampling/sampling_logp_difference/mean": 0.13107885420322418, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 198.0703125, "completions/mean_terminated_length": 198.0703125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.15924336947500706, "epoch": 0.04247787610619469, "frac_reward_zero_std": 0.0, "grad_norm": 0.2149158834516334, "learning_rate": 4.035087719298245e-07, "loss": -0.002, "num_tokens": 11359039.0, "reward": 0.4693359434604645, "reward_std": 0.4726512134075165, "rewards/execution_accuracy_EX/mean": 0.44140625, "rewards/execution_accuracy_EX/std": 0.4975275993347168, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.988457977771759, "sampling/importance_sampling_ratio/min": 0.00676548620685935, "sampling/sampling_logp_difference/max": 4.9959211349487305, "sampling/sampling_logp_difference/mean": 0.13254688680171967, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 229.98046875, "completions/mean_terminated_length": 229.98046875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.16456064768135548, "epoch": 0.04424778761061947, "frac_reward_zero_std": 0.0, "grad_norm": 0.4167611685117223, "learning_rate": 4.2105263157894733e-07, "loss": -0.012, "num_tokens": 11779002.0, "reward": 0.369140625, "reward_std": 0.44958022236824036, "rewards/execution_accuracy_EX/mean": 0.3359375, "rewards/execution_accuracy_EX/std": 0.4732423722743988, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9894763231277466, "sampling/importance_sampling_ratio/min": 0.018519118428230286, "sampling/sampling_logp_difference/max": 3.9889516830444336, "sampling/sampling_logp_difference/mean": 0.13387247920036316, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 234.1796875, "completions/mean_terminated_length": 234.1796875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.15004423819482327, "epoch": 0.04601769911504425, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "learning_rate": 4.3859649122807013e-07, "loss": 0.0, "num_tokens": 12292088.0, "reward": 0.6437499523162842, "reward_std": 0.4608176648616791, "rewards/execution_accuracy_EX/mean": 0.625, "rewards/execution_accuracy_EX/std": 0.4850712716579437, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9890333414077759, "sampling/importance_sampling_ratio/min": 0.0052689663134515285, "sampling/sampling_logp_difference/max": 5.2459211349487305, "sampling/sampling_logp_difference/mean": 0.1237257719039917, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 219.54296875, "completions/mean_terminated_length": 219.54296875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.15362016297876835, "epoch": 0.047787610619469026, "frac_reward_zero_std": 0.0, "grad_norm": 0.26895058438899755, "learning_rate": 4.5614035087719294e-07, "loss": 0.0031, "num_tokens": 12736963.0, "reward": 0.6957031488418579, "reward_std": 0.44413506984710693, "rewards/execution_accuracy_EX/mean": 0.6796875, "rewards/execution_accuracy_EX/std": 0.4675106406211853, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9897770285606384, "sampling/importance_sampling_ratio/min": 0.004099072422832251, "sampling/sampling_logp_difference/max": 5.496994495391846, "sampling/sampling_logp_difference/mean": 0.13121642172336578, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 208.80078125, "completions/mean_terminated_length": 208.80078125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.16322246752679348, "epoch": 0.049557522123893805, "frac_reward_zero_std": 0.0, "grad_norm": 0.4795479621446814, "learning_rate": 4.7368421052631574e-07, "loss": 0.0137, "num_tokens": 13086768.0, "reward": 0.5064452886581421, "reward_std": 0.4755672216415405, "rewards/execution_accuracy_EX/mean": 0.48046875, "rewards/execution_accuracy_EX/std": 0.5005971193313599, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9900233149528503, "sampling/importance_sampling_ratio/min": 0.0067833466455340385, "sampling/sampling_logp_difference/max": 4.993284702301025, "sampling/sampling_logp_difference/mean": 0.13302835822105408, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 225.50390625, "completions/mean_terminated_length": 225.50390625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.15761182643473148, "epoch": 0.05132743362831858, "frac_reward_zero_std": 0.0, "grad_norm": 0.24046164282219176, "learning_rate": 4.912280701754385e-07, "loss": -0.0056, "num_tokens": 13506401.0, "reward": 0.632617175579071, "reward_std": 0.46355465054512024, "rewards/execution_accuracy_EX/mean": 0.61328125, "rewards/execution_accuracy_EX/std": 0.4879522919654846, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9904501438140869, "sampling/importance_sampling_ratio/min": 0.014360358938574791, "sampling/sampling_logp_difference/max": 4.243283748626709, "sampling/sampling_logp_difference/mean": 0.12762247025966644, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 217.421875, "completions/mean_terminated_length": 217.421875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.15651825070381165, "epoch": 0.05309734513274336, "frac_reward_zero_std": 0.0, "grad_norm": 0.3764552993060396, "learning_rate": 5.087719298245614e-07, "loss": 0.0011, "num_tokens": 13861437.0, "reward": 0.44707030057907104, "reward_std": 0.46948155760765076, "rewards/execution_accuracy_EX/mean": 0.41796875, "rewards/execution_accuracy_EX/std": 0.49419113993644714, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9907035827636719, "sampling/importance_sampling_ratio/min": 0.008661828935146332, "sampling/sampling_logp_difference/max": 4.748829364776611, "sampling/sampling_logp_difference/mean": 0.1288561224937439, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 257.89453125, "completions/mean_terminated_length": 257.89453125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.1710120104253292, "epoch": 0.05486725663716814, "frac_reward_zero_std": 0.0, "grad_norm": 0.26678307296231335, "learning_rate": 5.263157894736842e-07, "loss": -0.0016, "num_tokens": 14277938.0, "reward": 0.32832032442092896, "reward_std": 0.4332149624824524, "rewards/execution_accuracy_EX/mean": 0.29296875, "rewards/execution_accuracy_EX/std": 0.45601576566696167, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9946006536483765, "sampling/importance_sampling_ratio/min": 0.01841953955590725, "sampling/sampling_logp_difference/max": 3.9943432807922363, "sampling/sampling_logp_difference/mean": 0.12613239884376526, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 221.19140625, "completions/mean_terminated_length": 221.19140625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.1595402956008911, "epoch": 0.05663716814159292, "frac_reward_zero_std": 0.0, "grad_norm": 0.19593070618767436, "learning_rate": 5.43859649122807e-07, "loss": -0.0014, "num_tokens": 14805619.0, "reward": 0.4878906011581421, "reward_std": 0.47447580099105835, "rewards/execution_accuracy_EX/mean": 0.4609375, "rewards/execution_accuracy_EX/std": 0.4994482398033142, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9907799959182739, "sampling/importance_sampling_ratio/min": 0.014339092187583447, "sampling/sampling_logp_difference/max": 4.244765758514404, "sampling/sampling_logp_difference/mean": 0.1260884702205658, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 235.86328125, "completions/mean_terminated_length": 235.86328125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.16799591109156609, "epoch": 0.0584070796460177, "frac_reward_zero_std": 0.0, "grad_norm": 0.22532606473149772, "learning_rate": 5.614035087719298e-07, "loss": 0.0039, "num_tokens": 15278592.0, "reward": 0.5843749642372131, "reward_std": 0.47219759225845337, "rewards/execution_accuracy_EX/mean": 0.5625, "rewards/execution_accuracy_EX/std": 0.49705013632774353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9923399686813354, "sampling/importance_sampling_ratio/min": 0.008697102777659893, "sampling/sampling_logp_difference/max": 4.744765281677246, "sampling/sampling_logp_difference/mean": 0.13059130311012268, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 213.27734375, "completions/mean_terminated_length": 213.27734375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.15989402122795582, "epoch": 0.06017699115044248, "frac_reward_zero_std": 0.0, "grad_norm": 0.27799309147935464, "learning_rate": 5.789473684210526e-07, "loss": -0.0052, "num_tokens": 15746663.0, "reward": 0.6732421517372131, "reward_std": 0.4523758888244629, "rewards/execution_accuracy_EX/mean": 0.65625, "rewards/execution_accuracy_EX/std": 0.47588926553726196, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9851380586624146, "sampling/importance_sampling_ratio/min": 0.011232429184019566, "sampling/sampling_logp_difference/max": 4.488950252532959, "sampling/sampling_logp_difference/mean": 0.13754978775978088, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 207.76953125, "completions/mean_terminated_length": 207.76953125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.153695460408926, "epoch": 0.061946902654867256, "frac_reward_zero_std": 0.0, "grad_norm": 0.29407859317224894, "learning_rate": 5.964912280701754e-07, "loss": 0.0029, "num_tokens": 16104268.0, "reward": 0.521289050579071, "reward_std": 0.47591590881347656, "rewards/execution_accuracy_EX/mean": 0.49609375, "rewards/execution_accuracy_EX/std": 0.5009641647338867, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.990626871585846, "sampling/importance_sampling_ratio/min": 0.011154396459460258, "sampling/sampling_logp_difference/max": 4.495921611785889, "sampling/sampling_logp_difference/mean": 0.12581679224967957, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 229.19140625, "completions/mean_terminated_length": 229.19140625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.1576980296522379, "epoch": 0.06371681415929203, "frac_reward_zero_std": 0.0, "grad_norm": 0.29319992058837874, "learning_rate": 6.140350877192982e-07, "loss": -0.0031, "num_tokens": 16453741.0, "reward": 0.3951171934604645, "reward_std": 0.45779263973236084, "rewards/execution_accuracy_EX/mean": 0.36328125, "rewards/execution_accuracy_EX/std": 0.48188701272010803, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9867162108421326, "sampling/importance_sampling_ratio/min": 0.011136544868350029, "sampling/sampling_logp_difference/max": 4.497523307800293, "sampling/sampling_logp_difference/mean": 0.13592344522476196, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 214.5390625, "completions/mean_terminated_length": 214.5390625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.1740215141326189, "epoch": 0.06548672566371681, "frac_reward_zero_std": 0.0, "grad_norm": 0.34071497092016156, "learning_rate": 6.31578947368421e-07, "loss": -0.0136, "num_tokens": 16972215.0, "reward": 0.49531248211860657, "reward_std": 0.47499996423721313, "rewards/execution_accuracy_EX/mean": 0.46875, "rewards/execution_accuracy_EX/std": 0.5, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9905092120170593, "sampling/importance_sampling_ratio/min": 0.005264220293611288, "sampling/sampling_logp_difference/max": 5.246822357177734, "sampling/sampling_logp_difference/mean": 0.1363052874803543, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 236.515625, "completions/mean_terminated_length": 236.515625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.16431527957320213, "epoch": 0.06725663716814159, "frac_reward_zero_std": 0.0, "grad_norm": 0.17369171883723106, "learning_rate": 6.491228070175438e-07, "loss": -0.0015, "num_tokens": 17443371.0, "reward": 0.5732421875, "reward_std": 0.4734695255756378, "rewards/execution_accuracy_EX/mean": 0.55078125, "rewards/execution_accuracy_EX/std": 0.49838894605636597, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9886480569839478, "sampling/importance_sampling_ratio/min": 0.008687056601047516, "sampling/sampling_logp_difference/max": 4.7459211349487305, "sampling/sampling_logp_difference/mean": 0.1330389678478241, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 209.31640625, "completions/mean_terminated_length": 209.31640625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.15421190671622753, "epoch": 0.06902654867256637, "frac_reward_zero_std": 0.0, "grad_norm": 0.3589419087860088, "learning_rate": 6.666666666666666e-07, "loss": 0.0042, "num_tokens": 17979468.0, "reward": 0.5806640386581421, "reward_std": 0.4726512134075165, "rewards/execution_accuracy_EX/mean": 0.55859375, "rewards/execution_accuracy_EX/std": 0.4975275993347168, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.990709662437439, "sampling/importance_sampling_ratio/min": 0.011155104264616966, "sampling/sampling_logp_difference/max": 4.495858192443848, "sampling/sampling_logp_difference/mean": 0.1269502341747284, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 210.71484375, "completions/mean_terminated_length": 210.71484375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.16004453226923943, "epoch": 0.07079646017699115, "frac_reward_zero_std": 0.0, "grad_norm": 0.49120329671078605, "learning_rate": 6.842105263157895e-07, "loss": 0.0094, "num_tokens": 18319395.0, "reward": 0.6585937738418579, "reward_std": 0.456719309091568, "rewards/execution_accuracy_EX/mean": 0.640625, "rewards/execution_accuracy_EX/std": 0.4807571768760681, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9913509488105774, "sampling/importance_sampling_ratio/min": 0.011126003228127956, "sampling/sampling_logp_difference/max": 4.498470306396484, "sampling/sampling_logp_difference/mean": 0.12969903647899628, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 206.0234375, "completions/mean_terminated_length": 206.0234375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.16033608093857765, "epoch": 0.07256637168141593, "frac_reward_zero_std": 0.0, "grad_norm": 0.4078587228416583, "learning_rate": 7.017543859649122e-07, "loss": -0.013, "num_tokens": 18998921.0, "reward": 0.6509765386581421, "reward_std": 0.4586948752403259, "rewards/execution_accuracy_EX/mean": 0.6328125, "rewards/execution_accuracy_EX/std": 0.48298248648643494, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.990849494934082, "sampling/importance_sampling_ratio/min": 0.0024824803695082664, "sampling/sampling_logp_difference/max": 5.998497009277344, "sampling/sampling_logp_difference/mean": 0.1328970044851303, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 213.39453125, "completions/mean_terminated_length": 213.39453125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.15308724157512188, "epoch": 0.0743362831858407, "frac_reward_zero_std": 0.0, "grad_norm": 0.4876723398817694, "learning_rate": 7.192982456140351e-07, "loss": 0.0108, "num_tokens": 19372830.0, "reward": 0.4916015565395355, "reward_std": 0.4747525453567505, "rewards/execution_accuracy_EX/mean": 0.46484375, "rewards/execution_accuracy_EX/std": 0.49973952770233154, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.98912513256073, "sampling/importance_sampling_ratio/min": 0.014310465194284916, "sampling/sampling_logp_difference/max": 4.246764183044434, "sampling/sampling_logp_difference/mean": 0.1300293505191803, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 235.95703125, "completions/mean_terminated_length": 235.95703125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.1614510864019394, "epoch": 0.07610619469026549, "frac_reward_zero_std": 0.0, "grad_norm": 0.2473094768090876, "learning_rate": 7.368421052631578e-07, "loss": 0.0057, "num_tokens": 19849571.0, "reward": 0.32832029461860657, "reward_std": 0.4332149624824524, "rewards/execution_accuracy_EX/mean": 0.29296875, "rewards/execution_accuracy_EX/std": 0.45601576566696167, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9908735752105713, "sampling/importance_sampling_ratio/min": 0.011183848604559898, "sampling/sampling_logp_difference/max": 4.493284702301025, "sampling/sampling_logp_difference/mean": 0.1290094405412674, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 209.10546875, "completions/mean_terminated_length": 209.10546875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.16792303882539272, "epoch": 0.07787610619469026, "frac_reward_zero_std": 0.0, "grad_norm": 0.6490977517753318, "learning_rate": 7.543859649122807e-07, "loss": 0.0234, "num_tokens": 20362238.0, "reward": 0.7365233898162842, "reward_std": 0.4261363446712494, "rewards/execution_accuracy_EX/mean": 0.72265625, "rewards/execution_accuracy_EX/std": 0.4485645890235901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9866508841514587, "sampling/importance_sampling_ratio/min": 0.008679235354065895, "sampling/sampling_logp_difference/max": 4.746821880340576, "sampling/sampling_logp_difference/mean": 0.1376384198665619, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 229.12890625, "completions/mean_terminated_length": 229.12890625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.17156759835779667, "epoch": 0.07964601769911504, "frac_reward_zero_std": 0.0, "grad_norm": 0.2624399039112141, "learning_rate": 7.719298245614034e-07, "loss": 0.0067, "num_tokens": 20861359.0, "reward": 0.47675782442092896, "reward_std": 0.47346949577331543, "rewards/execution_accuracy_EX/mean": 0.44921875, "rewards/execution_accuracy_EX/std": 0.49838894605636597, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.994779109954834, "sampling/importance_sampling_ratio/min": 0.014779280871152878, "sampling/sampling_logp_difference/max": 4.214529037475586, "sampling/sampling_logp_difference/mean": 0.13185223937034607, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 212.03515625, "completions/mean_terminated_length": 212.03515625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.15802641212940216, "epoch": 0.08141592920353982, "frac_reward_zero_std": 0.0, "grad_norm": 0.3114338856066691, "learning_rate": 7.894736842105263e-07, "loss": 0.0052, "num_tokens": 21430072.0, "reward": 0.49531251192092896, "reward_std": 0.4749999940395355, "rewards/execution_accuracy_EX/mean": 0.46875, "rewards/execution_accuracy_EX/std": 0.5, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9872924089431763, "sampling/importance_sampling_ratio/min": 0.005292925983667374, "sampling/sampling_logp_difference/max": 5.241384029388428, "sampling/sampling_logp_difference/mean": 0.13467440009117126, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 195.765625, "completions/mean_terminated_length": 195.765625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.15841412916779518, "epoch": 0.0831858407079646, "frac_reward_zero_std": 0.0, "grad_norm": 0.2544691175388164, "learning_rate": 8.07017543859649e-07, "loss": 0.0084, "num_tokens": 21939036.0, "reward": 0.49531251192092896, "reward_std": 0.4749999940395355, "rewards/execution_accuracy_EX/mean": 0.46875, "rewards/execution_accuracy_EX/std": 0.5, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9892590641975403, "sampling/importance_sampling_ratio/min": 0.011136534623801708, "sampling/sampling_logp_difference/max": 4.497524261474609, "sampling/sampling_logp_difference/mean": 0.13591071963310242, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 222.29296875, "completions/mean_terminated_length": 222.29296875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.16205408424139023, "epoch": 0.08495575221238938, "frac_reward_zero_std": 0.0, "grad_norm": 0.4450917013702582, "learning_rate": 8.245614035087719e-07, "loss": 0.0135, "num_tokens": 22428375.0, "reward": 0.569531261920929, "reward_std": 0.4738343358039856, "rewards/execution_accuracy_EX/mean": 0.546875, "rewards/execution_accuracy_EX/std": 0.4987730085849762, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9925086498260498, "sampling/importance_sampling_ratio/min": 0.018390489742159843, "sampling/sampling_logp_difference/max": 3.9959216117858887, "sampling/sampling_logp_difference/mean": 0.1286633014678955, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 218.953125, "completions/mean_terminated_length": 218.953125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.14652677439153194, "epoch": 0.08672566371681416, "frac_reward_zero_std": 0.0, "grad_norm": 0.36398969040701207, "learning_rate": 8.421052631578947e-07, "loss": 0.0007, "num_tokens": 22773851.0, "reward": 0.45820313692092896, "reward_std": 0.471201092004776, "rewards/execution_accuracy_EX/mean": 0.4296875, "rewards/execution_accuracy_EX/std": 0.4960011839866638, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9920421838760376, "sampling/importance_sampling_ratio/min": 0.00559335108846426, "sampling/sampling_logp_difference/max": 5.186176776885986, "sampling/sampling_logp_difference/mean": 0.12167568504810333, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 228.1875, "completions/mean_terminated_length": 228.1875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.16296331398189068, "epoch": 0.08849557522123894, "frac_reward_zero_std": 0.0, "grad_norm": 0.3706959212541897, "learning_rate": 8.596491228070175e-07, "loss": -0.0071, "num_tokens": 23245995.0, "reward": 0.5138671398162842, "reward_std": 0.47579970955848694, "rewards/execution_accuracy_EX/mean": 0.48828125, "rewards/execution_accuracy_EX/std": 0.5008418560028076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.990206241607666, "sampling/importance_sampling_ratio/min": 0.011125700548291206, "sampling/sampling_logp_difference/max": 4.498497486114502, "sampling/sampling_logp_difference/mean": 0.1312432885169983, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 193.80078125, "completions/mean_terminated_length": 193.80078125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.14984596893191338, "epoch": 0.09026548672566372, "frac_reward_zero_std": 0.0, "grad_norm": 0.4595076261204974, "learning_rate": 8.771929824561403e-07, "loss": 0.013, "num_tokens": 23594776.0, "reward": 0.755078136920929, "reward_std": 0.41637277603149414, "rewards/execution_accuracy_EX/mean": 0.7421875, "rewards/execution_accuracy_EX/std": 0.4382871091365814, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.987525224685669, "sampling/importance_sampling_ratio/min": 0.018390489742159843, "sampling/sampling_logp_difference/max": 3.9959216117858887, "sampling/sampling_logp_difference/mean": 0.12903794646263123, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 228.0703125, "completions/mean_terminated_length": 228.0703125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.16216460056602955, "epoch": 0.0920353982300885, "frac_reward_zero_std": 0.0, "grad_norm": 0.352096192636124, "learning_rate": 8.947368421052631e-07, "loss": 0.0045, "num_tokens": 24127690.0, "reward": 0.6585937738418579, "reward_std": 0.456719309091568, "rewards/execution_accuracy_EX/mean": 0.640625, "rewards/execution_accuracy_EX/std": 0.4807571768760681, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.989894449710846, "sampling/importance_sampling_ratio/min": 0.006812799721956253, "sampling/sampling_logp_difference/max": 4.988952159881592, "sampling/sampling_logp_difference/mean": 0.13485217094421387, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 209.2734375, "completions/mean_terminated_length": 209.2734375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.1585012786090374, "epoch": 0.09380530973451327, "frac_reward_zero_std": 0.0, "grad_norm": 0.2009889434955974, "learning_rate": 9.122807017543859e-07, "loss": -0.008, "num_tokens": 24534224.0, "reward": 0.6548827886581421, "reward_std": 0.45779263973236084, "rewards/execution_accuracy_EX/mean": 0.63671875, "rewards/execution_accuracy_EX/std": 0.48188701272010803, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9906299710273743, "sampling/importance_sampling_ratio/min": 0.008700824342668056, "sampling/sampling_logp_difference/max": 4.744337558746338, "sampling/sampling_logp_difference/mean": 0.13042978942394257, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 204.09765625, "completions/mean_terminated_length": 204.09765625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.14675415121018887, "epoch": 0.09557522123893805, "frac_reward_zero_std": 0.0, "grad_norm": 0.34401304041307923, "learning_rate": 9.298245614035087e-07, "loss": 0.0045, "num_tokens": 24958953.0, "reward": 0.5992187261581421, "reward_std": 0.47008487582206726, "rewards/execution_accuracy_EX/mean": 0.578125, "rewards/execution_accuracy_EX/std": 0.49482619762420654, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9833692312240601, "sampling/importance_sampling_ratio/min": 0.008987164124846458, "sampling/sampling_logp_difference/max": 4.711957931518555, "sampling/sampling_logp_difference/mean": 0.13392765820026398, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 207.64453125, "completions/mean_terminated_length": 207.64453125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.15594125725328922, "epoch": 0.09734513274336283, "frac_reward_zero_std": 0.0, "grad_norm": 0.3232663547794188, "learning_rate": 9.473684210526315e-07, "loss": -0.0031, "num_tokens": 25571086.0, "reward": 0.5361328125, "reward_std": 0.47579970955848694, "rewards/execution_accuracy_EX/mean": 0.51171875, "rewards/execution_accuracy_EX/std": 0.5008418560028076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9889358878135681, "sampling/importance_sampling_ratio/min": 0.014340179972350597, "sampling/sampling_logp_difference/max": 4.24468994140625, "sampling/sampling_logp_difference/mean": 0.12645173072814941, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 216.69140625, "completions/mean_terminated_length": 216.69140625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.16085772216320038, "epoch": 0.09911504424778761, "frac_reward_zero_std": 0.0, "grad_norm": 0.3271752358225103, "learning_rate": 9.649122807017545e-07, "loss": 0.0044, "num_tokens": 25971743.0, "reward": 0.47675782442092896, "reward_std": 0.47346949577331543, "rewards/execution_accuracy_EX/mean": 0.44921875, "rewards/execution_accuracy_EX/std": 0.49838894605636597, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9919253587722778, "sampling/importance_sampling_ratio/min": 0.006748078390955925, "sampling/sampling_logp_difference/max": 4.998497486114502, "sampling/sampling_logp_difference/mean": 0.128218412399292, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 243.22265625, "completions/mean_terminated_length": 243.22265625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.15728727541863918, "epoch": 0.10088495575221239, "frac_reward_zero_std": 0.0, "grad_norm": 0.29179080050168243, "learning_rate": 9.82456140350877e-07, "loss": 0.0059, "num_tokens": 26556936.0, "reward": 0.4507812559604645, "reward_std": 0.47008487582206726, "rewards/execution_accuracy_EX/mean": 0.421875, "rewards/execution_accuracy_EX/std": 0.49482619762420654, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9931042194366455, "sampling/importance_sampling_ratio/min": 0.014340603724122047, "sampling/sampling_logp_difference/max": 4.244660377502441, "sampling/sampling_logp_difference/mean": 0.12253376841545105, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 217.87109375, "completions/mean_terminated_length": 217.87109375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.15230992063879967, "epoch": 0.10265486725663717, "frac_reward_zero_std": 0.0, "grad_norm": 0.5333505943398746, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 27050215.0, "reward": 0.6734374761581421, "reward_std": 0.45209482312202454, "rewards/execution_accuracy_EX/mean": 0.65625, "rewards/execution_accuracy_EX/std": 0.47588926553726196, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9868412017822266, "sampling/importance_sampling_ratio/min": 0.011183848604559898, "sampling/sampling_logp_difference/max": 4.493284702301025, "sampling/sampling_logp_difference/mean": 0.1287211775779724, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 222.07421875, "completions/mean_terminated_length": 222.07421875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.15885837003588676, "epoch": 0.10442477876106195, "frac_reward_zero_std": 0.0, "grad_norm": 0.3224764308639901, "learning_rate": 1e-06, "loss": 0.0099, "num_tokens": 27697418.0, "reward": 0.48417967557907104, "reward_std": 0.4741697311401367, "rewards/execution_accuracy_EX/mean": 0.45703125, "rewards/execution_accuracy_EX/std": 0.4991260766983032, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9875861406326294, "sampling/importance_sampling_ratio/min": 0.011184130795300007, "sampling/sampling_logp_difference/max": 4.493259429931641, "sampling/sampling_logp_difference/mean": 0.13573023676872253, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 201.859375, "completions/mean_terminated_length": 201.859375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.15865694545209408, "epoch": 0.10619469026548672, "frac_reward_zero_std": 0.0, "grad_norm": 0.339729340208444, "learning_rate": 1e-06, "loss": -0.0034, "num_tokens": 28109862.0, "reward": 0.8070312738418579, "reward_std": 0.38295724987983704, "rewards/execution_accuracy_EX/mean": 0.796875, "rewards/execution_accuracy_EX/std": 0.40311288833618164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9855418801307678, "sampling/importance_sampling_ratio/min": 0.004103474784642458, "sampling/sampling_logp_difference/max": 5.4959211349487305, "sampling/sampling_logp_difference/mean": 0.1409788727760315, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 206.37109375, "completions/mean_terminated_length": 206.37109375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.14121604897081852, "epoch": 0.1079646017699115, "frac_reward_zero_std": 0.0, "grad_norm": 0.2794506948692285, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 28482789.0, "reward": 0.40996092557907104, "reward_std": 0.4617617130279541, "rewards/execution_accuracy_EX/mean": 0.37890625, "rewards/execution_accuracy_EX/std": 0.4860650300979614, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9912207126617432, "sampling/importance_sampling_ratio/min": 0.008697095327079296, "sampling/sampling_logp_difference/max": 4.7447662353515625, "sampling/sampling_logp_difference/mean": 0.11767181754112244, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 201.08984375, "completions/mean_terminated_length": 201.08984375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.1559954546391964, "epoch": 0.10973451327433628, "frac_reward_zero_std": 0.0, "grad_norm": 0.6063657467020576, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 28949948.0, "reward": 0.294921875, "reward_std": 0.41637277603149414, "rewards/execution_accuracy_EX/mean": 0.2578125, "rewards/execution_accuracy_EX/std": 0.4382871091365814, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9887830018997192, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.13012146949768066, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 211.2734375, "completions/mean_terminated_length": 211.2734375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.15071019157767296, "epoch": 0.11150442477876106, "frac_reward_zero_std": 0.0, "grad_norm": 0.27922915476043525, "learning_rate": 1e-06, "loss": -0.004, "num_tokens": 29324226.0, "reward": 0.6808593273162842, "reward_std": 0.44958025217056274, "rewards/execution_accuracy_EX/mean": 0.6640625, "rewards/execution_accuracy_EX/std": 0.4732423722743988, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9868204593658447, "sampling/importance_sampling_ratio/min": 0.014291773550212383, "sampling/sampling_logp_difference/max": 4.248071193695068, "sampling/sampling_logp_difference/mean": 0.1329112946987152, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 211.91015625, "completions/mean_terminated_length": 211.91015625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.15028476156294346, "epoch": 0.11327433628318584, "frac_reward_zero_std": 0.0, "grad_norm": 0.21032008894745782, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 29642747.0, "reward": 0.6845703125, "reward_std": 0.44827139377593994, "rewards/execution_accuracy_EX/mean": 0.66796875, "rewards/execution_accuracy_EX/std": 0.4718646705150604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9902558922767639, "sampling/importance_sampling_ratio/min": 0.011159200221300125, "sampling/sampling_logp_difference/max": 4.495491027832031, "sampling/sampling_logp_difference/mean": 0.12290841341018677, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 209.05078125, "completions/mean_terminated_length": 209.05078125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.15721244923770428, "epoch": 0.11504424778761062, "frac_reward_zero_std": 0.0, "grad_norm": 0.536090475913442, "learning_rate": 1e-06, "loss": -0.0127, "num_tokens": 30274040.0, "reward": 0.539843738079071, "reward_std": 0.47569799423217773, "rewards/execution_accuracy_EX/mean": 0.515625, "rewards/execution_accuracy_EX/std": 0.5007347464561462, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9889378547668457, "sampling/importance_sampling_ratio/min": 0.0143876438960433, "sampling/sampling_logp_difference/max": 4.241385459899902, "sampling/sampling_logp_difference/mean": 0.1261480450630188, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 200.1640625, "completions/mean_terminated_length": 200.1640625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.16187943145632744, "epoch": 0.1168141592920354, "frac_reward_zero_std": 0.0, "grad_norm": 0.46705033131972595, "learning_rate": 1e-06, "loss": -0.0057, "num_tokens": 30624178.0, "reward": 0.866406261920929, "reward_std": 0.33089950680732727, "rewards/execution_accuracy_EX/mean": 0.859375, "rewards/execution_accuracy_EX/std": 0.3483152687549591, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9888778924942017, "sampling/importance_sampling_ratio/min": 0.0052929287776350975, "sampling/sampling_logp_difference/max": 5.2413835525512695, "sampling/sampling_logp_difference/mean": 0.13253706693649292, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 218.359375, "completions/mean_terminated_length": 218.359375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.15289377607405186, "epoch": 0.11858407079646018, "frac_reward_zero_std": 0.0, "grad_norm": 0.3026216260708256, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 30960318.0, "reward": 0.7291015386581421, "reward_std": 0.4297545850276947, "rewards/execution_accuracy_EX/mean": 0.71484375, "rewards/execution_accuracy_EX/std": 0.4523732364177704, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9879290461540222, "sampling/importance_sampling_ratio/min": 0.008726546540856361, "sampling/sampling_logp_difference/max": 4.741385459899902, "sampling/sampling_logp_difference/mean": 0.12670288980007172, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 208.44921875, "completions/mean_terminated_length": 208.44921875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.15477639250457287, "epoch": 0.12035398230088495, "frac_reward_zero_std": 0.0, "grad_norm": 0.4698306933028935, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 31364753.0, "reward": 0.6919921636581421, "reward_std": 0.4455491304397583, "rewards/execution_accuracy_EX/mean": 0.67578125, "rewards/execution_accuracy_EX/std": 0.46899911761283875, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9852240085601807, "sampling/importance_sampling_ratio/min": 0.006765482947230339, "sampling/sampling_logp_difference/max": 4.995921611785889, "sampling/sampling_logp_difference/mean": 0.1278304159641266, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 241.53515625, "completions/mean_terminated_length": 241.53515625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.15711761824786663, "epoch": 0.12212389380530973, "frac_reward_zero_std": 0.0, "grad_norm": 0.5749773994372718, "learning_rate": 1e-06, "loss": -0.005, "num_tokens": 32008762.0, "reward": 0.517578125, "reward_std": 0.47587236762046814, "rewards/execution_accuracy_EX/mean": 0.4921875, "rewards/execution_accuracy_EX/std": 0.5009182691574097, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.989099383354187, "sampling/importance_sampling_ratio/min": 0.011136534623801708, "sampling/sampling_logp_difference/max": 4.497524261474609, "sampling/sampling_logp_difference/mean": 0.12306369096040726, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 200.6640625, "completions/mean_terminated_length": 200.6640625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.15642414800822735, "epoch": 0.12389380530973451, "frac_reward_zero_std": 0.0, "grad_norm": 0.41019204934910464, "learning_rate": 1e-06, "loss": 0.0038, "num_tokens": 32363556.0, "reward": 0.699414074420929, "reward_std": 0.44268524646759033, "rewards/execution_accuracy_EX/mean": 0.68359375, "rewards/execution_accuracy_EX/std": 0.4659844934940338, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9870081543922424, "sampling/importance_sampling_ratio/min": 0.018361039459705353, "sampling/sampling_logp_difference/max": 3.9975242614746094, "sampling/sampling_logp_difference/mean": 0.12989136576652527, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 228.4140625, "completions/mean_terminated_length": 228.4140625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.153860317543149, "epoch": 0.1256637168141593, "frac_reward_zero_std": 0.0, "grad_norm": 0.2613845000041792, "learning_rate": 1e-06, "loss": 0.0017, "num_tokens": 32825038.0, "reward": 0.49531251192092896, "reward_std": 0.47499996423721313, "rewards/execution_accuracy_EX/mean": 0.46875, "rewards/execution_accuracy_EX/std": 0.5, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9857495427131653, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.1292147934436798, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 233.3125, "completions/mean_terminated_length": 233.3125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.17075590044260025, "epoch": 0.12743362831858407, "frac_reward_zero_std": 0.0, "grad_norm": 0.46115708269394196, "learning_rate": 1e-06, "loss": -0.0077, "num_tokens": 33636990.0, "reward": 0.6957031488418579, "reward_std": 0.4441350996494293, "rewards/execution_accuracy_EX/mean": 0.6796875, "rewards/execution_accuracy_EX/std": 0.4675106406211853, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9881327152252197, "sampling/importance_sampling_ratio/min": 0.011183848604559898, "sampling/sampling_logp_difference/max": 4.493284702301025, "sampling/sampling_logp_difference/mean": 0.13605903089046478, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 220.0703125, "completions/mean_terminated_length": 220.0703125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.15813160501420498, "epoch": 0.12920353982300886, "frac_reward_zero_std": 0.0, "grad_norm": 0.27069713662807277, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 34190176.0, "reward": 0.550976574420929, "reward_std": 0.4752182364463806, "rewards/execution_accuracy_EX/mean": 0.52734375, "rewards/execution_accuracy_EX/std": 0.5002297759056091, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.986320972442627, "sampling/importance_sampling_ratio/min": 0.01118975318968296, "sampling/sampling_logp_difference/max": 4.4927568435668945, "sampling/sampling_logp_difference/mean": 0.13204938173294067, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 252.2890625, "completions/mean_terminated_length": 252.2890625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.18862844444811344, "epoch": 0.13097345132743363, "frac_reward_zero_std": 0.0, "grad_norm": 0.2288468255994566, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 34728666.0, "reward": 0.643750011920929, "reward_std": 0.46081769466400146, "rewards/execution_accuracy_EX/mean": 0.625, "rewards/execution_accuracy_EX/std": 0.4850712716579437, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.995506763458252, "sampling/importance_sampling_ratio/min": 0.011125700548291206, "sampling/sampling_logp_difference/max": 4.498497486114502, "sampling/sampling_logp_difference/mean": 0.13722500205039978, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 247.48046875, "completions/mean_terminated_length": 247.48046875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.18717484176158905, "epoch": 0.13274336283185842, "frac_reward_zero_std": 0.0, "grad_norm": 0.40398866495893176, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 35264965.0, "reward": 0.5658203363418579, "reward_std": 0.4741697609424591, "rewards/execution_accuracy_EX/mean": 0.54296875, "rewards/execution_accuracy_EX/std": 0.4991260766983032, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9933390021324158, "sampling/importance_sampling_ratio/min": 0.02364342100918293, "sampling/sampling_logp_difference/max": 3.7446703910827637, "sampling/sampling_logp_difference/mean": 0.13772422075271606, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 249.34765625, "completions/mean_terminated_length": 249.34765625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.168232761323452, "epoch": 0.13451327433628318, "frac_reward_zero_std": 0.0, "grad_norm": 0.3183141585051623, "learning_rate": 1e-06, "loss": 0.0049, "num_tokens": 35725918.0, "reward": 0.502734363079071, "reward_std": 0.47540730237960815, "rewards/execution_accuracy_EX/mean": 0.4765625, "rewards/execution_accuracy_EX/std": 0.5004287362098694, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9941282868385315, "sampling/importance_sampling_ratio/min": 0.012834830209612846, "sampling/sampling_logp_difference/max": 4.355592727661133, "sampling/sampling_logp_difference/mean": 0.12626288831233978, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 244.9140625, "completions/mean_terminated_length": 244.9140625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.17300679720938206, "epoch": 0.13628318584070798, "frac_reward_zero_std": 0.0, "grad_norm": 0.2539974690397416, "learning_rate": 1e-06, "loss": 0.0036, "num_tokens": 36173432.0, "reward": 0.6437499523162842, "reward_std": 0.46081769466400146, "rewards/execution_accuracy_EX/mean": 0.625, "rewards/execution_accuracy_EX/std": 0.4850712716579437, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9919909834861755, "sampling/importance_sampling_ratio/min": 0.017717216163873672, "sampling/sampling_logp_difference/max": 4.0332183837890625, "sampling/sampling_logp_difference/mean": 0.1292758584022522, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 243.56640625, "completions/mean_terminated_length": 243.56640625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.1690126322209835, "epoch": 0.13805309734513274, "frac_reward_zero_std": 0.0, "grad_norm": 0.2874219749877832, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 36734777.0, "reward": 0.3505859375, "reward_std": 0.44268524646759033, "rewards/execution_accuracy_EX/mean": 0.31640625, "rewards/execution_accuracy_EX/std": 0.4659844934940338, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9782031774520874, "sampling/importance_sampling_ratio/mean": 0.9859957098960876, "sampling/importance_sampling_ratio/min": 0.011125700548291206, "sampling/sampling_logp_difference/max": 4.498497486114502, "sampling/sampling_logp_difference/mean": 0.13325968384742737, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 254.6875, "completions/mean_terminated_length": 254.6875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.18058686703443527, "epoch": 0.13982300884955753, "frac_reward_zero_std": 0.0, "grad_norm": 0.3546576154835264, "learning_rate": 1e-06, "loss": -0.0083, "num_tokens": 37203609.0, "reward": 0.6029297113418579, "reward_std": 0.46948158740997314, "rewards/execution_accuracy_EX/mean": 0.58203125, "rewards/execution_accuracy_EX/std": 0.49419113993644714, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9901555776596069, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.13888263702392578, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 249.5078125, "completions/mean_terminated_length": 249.5078125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.16634603776037693, "epoch": 0.1415929203539823, "frac_reward_zero_std": 0.0, "grad_norm": 0.3452826158835721, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 37865579.0, "reward": 0.5361328125, "reward_std": 0.47579970955848694, "rewards/execution_accuracy_EX/mean": 0.51171875, "rewards/execution_accuracy_EX/std": 0.5008418560028076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9880659580230713, "sampling/importance_sampling_ratio/min": 0.011136550456285477, "sampling/sampling_logp_difference/max": 4.497522830963135, "sampling/sampling_logp_difference/mean": 0.133858323097229, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 243.84765625, "completions/mean_terminated_length": 243.84765625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.17802287265658379, "epoch": 0.1433628318584071, "frac_reward_zero_std": 0.0, "grad_norm": 0.09885274663321054, "learning_rate": 1e-06, "loss": 0.004, "num_tokens": 38417524.0, "reward": 0.5806640386581421, "reward_std": 0.4726512134075165, "rewards/execution_accuracy_EX/mean": 0.55859375, "rewards/execution_accuracy_EX/std": 0.4975275993347168, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.991807222366333, "sampling/importance_sampling_ratio/min": 0.014511375688016415, "sampling/sampling_logp_difference/max": 4.232822418212891, "sampling/sampling_logp_difference/mean": 0.13529834151268005, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 231.14453125, "completions/mean_terminated_length": 231.14453125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.17939144186675549, "epoch": 0.14513274336283186, "frac_reward_zero_std": 0.0, "grad_norm": 0.30764387537599147, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 38935769.0, "reward": 0.443359375, "reward_std": 0.46884801983833313, "rewards/execution_accuracy_EX/mean": 0.4140625, "rewards/execution_accuracy_EX/std": 0.4935242533683777, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9884452819824219, "sampling/importance_sampling_ratio/min": 0.014291773550212383, "sampling/sampling_logp_difference/max": 4.248071193695068, "sampling/sampling_logp_difference/mean": 0.1392521858215332, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 225.58984375, "completions/mean_terminated_length": 225.58984375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.1662676576524973, "epoch": 0.14690265486725665, "frac_reward_zero_std": 0.0, "grad_norm": 0.159489195026553, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 39425600.0, "reward": 0.576953113079071, "reward_std": 0.47307515144348145, "rewards/execution_accuracy_EX/mean": 0.5546875, "rewards/execution_accuracy_EX/std": 0.49797385931015015, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9720790386199951, "sampling/importance_sampling_ratio/mean": 0.9862806797027588, "sampling/importance_sampling_ratio/min": 0.023676205426454544, "sampling/sampling_logp_difference/max": 3.7432847023010254, "sampling/sampling_logp_difference/mean": 0.13640713691711426, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 246.4921875, "completions/mean_terminated_length": 246.4921875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.16606100276112556, "epoch": 0.1486725663716814, "frac_reward_zero_std": 0.0, "grad_norm": 0.27365668606557403, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 40151502.0, "reward": 0.6326172351837158, "reward_std": 0.46355465054512024, "rewards/execution_accuracy_EX/mean": 0.61328125, "rewards/execution_accuracy_EX/std": 0.4879522919654846, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9730298519134521, "sampling/importance_sampling_ratio/mean": 0.9869570732116699, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.13253940641880035, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 254.39453125, "completions/mean_terminated_length": 254.39453125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.16300375014543533, "epoch": 0.1504424778761062, "frac_reward_zero_std": 0.0, "grad_norm": 0.39185776025031027, "learning_rate": 1e-06, "loss": 0.0057, "num_tokens": 40474675.0, "reward": 0.6957031488418579, "reward_std": 0.4441350996494293, "rewards/execution_accuracy_EX/mean": 0.6796875, "rewards/execution_accuracy_EX/std": 0.4675106406211853, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9899999499320984, "sampling/importance_sampling_ratio/min": 0.0053865727968513966, "sampling/sampling_logp_difference/max": 5.223845958709717, "sampling/sampling_logp_difference/mean": 0.13284769654273987, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 242.703125, "completions/mean_terminated_length": 242.703125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.1711716093122959, "epoch": 0.15221238938053097, "frac_reward_zero_std": 0.0, "grad_norm": 0.3204005745160462, "learning_rate": 1e-06, "loss": 0.0063, "num_tokens": 40898775.0, "reward": 0.5621093511581421, "reward_std": 0.47447580099105835, "rewards/execution_accuracy_EX/mean": 0.5390625, "rewards/execution_accuracy_EX/std": 0.4994482398033142, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9313511848449707, "sampling/importance_sampling_ratio/mean": 0.9848341941833496, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.14207801222801208, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 228.296875, "completions/mean_terminated_length": 228.296875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.16796389780938625, "epoch": 0.15398230088495576, "frac_reward_zero_std": 0.0, "grad_norm": 0.20056636467959543, "learning_rate": 1e-06, "loss": -0.0041, "num_tokens": 41385699.0, "reward": 0.632617175579071, "reward_std": 0.46355465054512024, "rewards/execution_accuracy_EX/mean": 0.61328125, "rewards/execution_accuracy_EX/std": 0.4879522919654846, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9863157868385315, "sampling/importance_sampling_ratio/min": 0.014360584318637848, "sampling/sampling_logp_difference/max": 4.243268013000488, "sampling/sampling_logp_difference/mean": 0.1338033676147461, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 259.87890625, "completions/mean_terminated_length": 259.87890625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.16970933973789215, "epoch": 0.15575221238938053, "frac_reward_zero_std": 0.0, "grad_norm": 0.4007174339861282, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 41922164.0, "reward": 0.5658203363418579, "reward_std": 0.4741697609424591, "rewards/execution_accuracy_EX/mean": 0.54296875, "rewards/execution_accuracy_EX/std": 0.4991260766983032, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9893125295639038, "sampling/importance_sampling_ratio/min": 0.015252375043928623, "sampling/sampling_logp_difference/max": 4.183020114898682, "sampling/sampling_logp_difference/mean": 0.13815277814865112, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 268.3828125, "completions/mean_terminated_length": 268.3828125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.1632604207843542, "epoch": 0.15752212389380532, "frac_reward_zero_std": 0.0, "grad_norm": 0.270997162343305, "learning_rate": 1e-06, "loss": -0.0061, "num_tokens": 42477190.0, "reward": 0.5843750238418579, "reward_std": 0.47219759225845337, "rewards/execution_accuracy_EX/mean": 0.5625, "rewards/execution_accuracy_EX/std": 0.49705013632774353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9690579175949097, "sampling/importance_sampling_ratio/mean": 0.9899737238883972, "sampling/importance_sampling_ratio/min": 0.0024888834450393915, "sampling/sampling_logp_difference/max": 5.9959211349487305, "sampling/sampling_logp_difference/mean": 0.1313299685716629, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 275.8828125, "completions/mean_terminated_length": 275.8828125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.16058760695159435, "epoch": 0.1592920353982301, "frac_reward_zero_std": 0.0, "grad_norm": 0.27833649905689195, "learning_rate": 1e-06, "loss": 0.0032, "num_tokens": 42977768.0, "reward": 0.4990234375, "reward_std": 0.475218266248703, "rewards/execution_accuracy_EX/mean": 0.47265625, "rewards/execution_accuracy_EX/std": 0.5002297759056091, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9903367757797241, "sampling/importance_sampling_ratio/min": 0.008687051944434643, "sampling/sampling_logp_difference/max": 4.745921611785889, "sampling/sampling_logp_difference/mean": 0.12509432435035706, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 247.38671875, "completions/mean_terminated_length": 247.38671875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.16586022078990936, "epoch": 0.16106194690265488, "frac_reward_zero_std": 0.0, "grad_norm": 0.30564575311167513, "learning_rate": 1e-06, "loss": -0.018, "num_tokens": 43354875.0, "reward": 0.614062488079071, "reward_std": 0.46748965978622437, "rewards/execution_accuracy_EX/mean": 0.59375, "rewards/execution_accuracy_EX/std": 0.49209436774253845, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9854615330696106, "sampling/importance_sampling_ratio/min": 0.004103472921997309, "sampling/sampling_logp_difference/max": 5.495921611785889, "sampling/sampling_logp_difference/mean": 0.13805580139160156, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 306.890625, "completions/mean_terminated_length": 306.890625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.16652163863182068, "epoch": 0.16283185840707964, "frac_reward_zero_std": 0.0, "grad_norm": 0.32796255150826914, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 43919183.0, "reward": 0.5509765148162842, "reward_std": 0.4752182066440582, "rewards/execution_accuracy_EX/mean": 0.52734375, "rewards/execution_accuracy_EX/std": 0.5002297759056091, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9934643507003784, "sampling/importance_sampling_ratio/min": 0.0143876438960433, "sampling/sampling_logp_difference/max": 4.241385459899902, "sampling/sampling_logp_difference/mean": 0.12311001121997833, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 233.2265625, "completions/mean_terminated_length": 233.2265625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.16081063263118267, "epoch": 0.16460176991150444, "frac_reward_zero_std": 0.0, "grad_norm": 0.3731038031078675, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 44322409.0, "reward": 0.5546875, "reward_std": 0.47499996423721313, "rewards/execution_accuracy_EX/mean": 0.53125, "rewards/execution_accuracy_EX/std": 0.5, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9858866930007935, "sampling/importance_sampling_ratio/min": 0.008679230697453022, "sampling/sampling_logp_difference/max": 4.746822357177734, "sampling/sampling_logp_difference/mean": 0.13660240173339844, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 276.6484375, "completions/mean_terminated_length": 276.6484375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.1680069100111723, "epoch": 0.1663716814159292, "frac_reward_zero_std": 0.0, "grad_norm": 0.2638305265804409, "learning_rate": 1e-06, "loss": 0.004, "num_tokens": 44854543.0, "reward": 0.6400390863418579, "reward_std": 0.4617617726325989, "rewards/execution_accuracy_EX/mean": 0.62109375, "rewards/execution_accuracy_EX/std": 0.4860650300979614, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9871376752853394, "sampling/importance_sampling_ratio/min": 0.011312469840049744, "sampling/sampling_logp_difference/max": 4.481849670410156, "sampling/sampling_logp_difference/mean": 0.1372930109500885, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 259.74609375, "completions/mean_terminated_length": 259.74609375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.15809837356209755, "epoch": 0.168141592920354, "frac_reward_zero_std": 0.0, "grad_norm": 0.4702417167635799, "learning_rate": 1e-06, "loss": 0.0171, "num_tokens": 45445278.0, "reward": 0.643750011920929, "reward_std": 0.46081769466400146, "rewards/execution_accuracy_EX/mean": 0.625, "rewards/execution_accuracy_EX/std": 0.4850712716579437, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9890620708465576, "sampling/importance_sampling_ratio/min": 0.014322527684271336, "sampling/sampling_logp_difference/max": 4.245921611785889, "sampling/sampling_logp_difference/mean": 0.12532582879066467, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 279.64453125, "completions/mean_terminated_length": 279.64453125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.17670874670147896, "epoch": 0.16991150442477876, "frac_reward_zero_std": 0.0, "grad_norm": 0.2779130915503557, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 45965299.0, "reward": 0.591796875, "reward_std": 0.4712011218070984, "rewards/execution_accuracy_EX/mean": 0.5703125, "rewards/execution_accuracy_EX/std": 0.4960011839866638, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9900187253952026, "sampling/importance_sampling_ratio/min": 0.011125700548291206, "sampling/sampling_logp_difference/max": 4.498497486114502, "sampling/sampling_logp_difference/mean": 0.135993093252182, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 247.79296875, "completions/mean_terminated_length": 247.79296875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.1574590727686882, "epoch": 0.17168141592920355, "frac_reward_zero_std": 0.0, "grad_norm": 0.4356211480436373, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 46372766.0, "reward": 0.699414074420929, "reward_std": 0.44268524646759033, "rewards/execution_accuracy_EX/mean": 0.68359375, "rewards/execution_accuracy_EX/std": 0.4659844934940338, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9880787134170532, "sampling/importance_sampling_ratio/min": 0.014309640042483807, "sampling/sampling_logp_difference/max": 4.246821880340576, "sampling/sampling_logp_difference/mean": 0.12860193848609924, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 280.41796875, "completions/mean_terminated_length": 280.41796875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.16018960252404213, "epoch": 0.17345132743362832, "frac_reward_zero_std": 0.0, "grad_norm": 0.23488044158580396, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 46908217.0, "reward": 0.369140625, "reward_std": 0.44958022236824036, "rewards/execution_accuracy_EX/mean": 0.3359375, "rewards/execution_accuracy_EX/std": 0.4732423722743988, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9887105226516724, "sampling/importance_sampling_ratio/min": 0.011126094497740269, "sampling/sampling_logp_difference/max": 4.498462200164795, "sampling/sampling_logp_difference/mean": 0.1292063593864441, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 265.7734375, "completions/mean_terminated_length": 265.7734375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.1783987581729889, "epoch": 0.1752212389380531, "frac_reward_zero_std": 0.0, "grad_norm": 0.43843478273889336, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 47390831.0, "reward": 0.614062488079071, "reward_std": 0.46748965978622437, "rewards/execution_accuracy_EX/mean": 0.59375, "rewards/execution_accuracy_EX/std": 0.49209436774253845, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9901736974716187, "sampling/importance_sampling_ratio/min": 0.011154396459460258, "sampling/sampling_logp_difference/max": 4.495921611785889, "sampling/sampling_logp_difference/mean": 0.1345309317111969, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 268.265625, "completions/mean_terminated_length": 268.265625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.16882420890033245, "epoch": 0.17699115044247787, "frac_reward_zero_std": 0.0, "grad_norm": 0.2611995951717259, "learning_rate": 1e-06, "loss": -0.0073, "num_tokens": 47884451.0, "reward": 0.6994140148162842, "reward_std": 0.44268524646759033, "rewards/execution_accuracy_EX/mean": 0.68359375, "rewards/execution_accuracy_EX/std": 0.4659844934940338, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9882525205612183, "sampling/importance_sampling_ratio/min": 0.008661825209856033, "sampling/sampling_logp_difference/max": 4.7488298416137695, "sampling/sampling_logp_difference/mean": 0.13454368710517883, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 249.390625, "completions/mean_terminated_length": 249.390625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.1629339773207903, "epoch": 0.17876106194690267, "frac_reward_zero_std": 0.0, "grad_norm": 0.30815959071529064, "learning_rate": 1e-06, "loss": 0.0038, "num_tokens": 48590135.0, "reward": 0.5138671398162842, "reward_std": 0.4757997393608093, "rewards/execution_accuracy_EX/mean": 0.48828125, "rewards/execution_accuracy_EX/std": 0.5008418560028076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9862339496612549, "sampling/importance_sampling_ratio/min": 0.0143876438960433, "sampling/sampling_logp_difference/max": 4.241385459899902, "sampling/sampling_logp_difference/mean": 0.1340867280960083, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 268.53515625, "completions/mean_terminated_length": 268.53515625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.17842296697199345, "epoch": 0.18053097345132743, "frac_reward_zero_std": 0.0, "grad_norm": 0.2630277587397829, "learning_rate": 1e-06, "loss": -0.0062, "num_tokens": 49049536.0, "reward": 0.6326172351837158, "reward_std": 0.46355465054512024, "rewards/execution_accuracy_EX/mean": 0.61328125, "rewards/execution_accuracy_EX/std": 0.4879522919654846, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9884659647941589, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.14042013883590698, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 283.96875, "completions/mean_terminated_length": 283.96875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.1600775383412838, "epoch": 0.18230088495575222, "frac_reward_zero_std": 0.0, "grad_norm": 0.1793625261389528, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 49641960.0, "reward": 0.6177734136581421, "reward_std": 0.46676453948020935, "rewards/execution_accuracy_EX/mean": 0.59765625, "rewards/execution_accuracy_EX/std": 0.4913311004638672, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9879857897758484, "sampling/importance_sampling_ratio/min": 0.011313592083752155, "sampling/sampling_logp_difference/max": 4.48175048828125, "sampling/sampling_logp_difference/mean": 0.12989619374275208, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 239.52734375, "completions/mean_terminated_length": 239.52734375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.16701813600957394, "epoch": 0.184070796460177, "frac_reward_zero_std": 0.0, "grad_norm": 0.2907646272223414, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 50205615.0, "reward": 0.651171863079071, "reward_std": 0.45883333683013916, "rewards/execution_accuracy_EX/mean": 0.6328125, "rewards/execution_accuracy_EX/std": 0.48298248648643494, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9849092960357666, "sampling/importance_sampling_ratio/min": 0.011142690666019917, "sampling/sampling_logp_difference/max": 4.496971607208252, "sampling/sampling_logp_difference/mean": 0.13860410451889038, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 275.90625, "completions/mean_terminated_length": 275.90625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.16702409833669662, "epoch": 0.18584070796460178, "frac_reward_zero_std": 0.0, "grad_norm": 0.09613632547729432, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 50781831.0, "reward": 0.6957030892372131, "reward_std": 0.4441350996494293, "rewards/execution_accuracy_EX/mean": 0.6796875, "rewards/execution_accuracy_EX/std": 0.4675106406211853, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9908092021942139, "sampling/importance_sampling_ratio/min": 0.014347114600241184, "sampling/sampling_logp_difference/max": 4.244206428527832, "sampling/sampling_logp_difference/mean": 0.1251516193151474, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 239.59375, "completions/mean_terminated_length": 239.59375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.16419068723917007, "epoch": 0.18761061946902655, "frac_reward_zero_std": 0.0, "grad_norm": 0.39530890701734583, "learning_rate": 1e-06, "loss": 0.0111, "num_tokens": 51211711.0, "reward": 0.680859386920929, "reward_std": 0.44958025217056274, "rewards/execution_accuracy_EX/mean": 0.6640625, "rewards/execution_accuracy_EX/std": 0.4732423722743988, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9864935278892517, "sampling/importance_sampling_ratio/min": 0.014291773550212383, "sampling/sampling_logp_difference/max": 4.248071193695068, "sampling/sampling_logp_difference/mean": 0.13334545493125916, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 243.6328125, "completions/mean_terminated_length": 243.6328125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.1734424475580454, "epoch": 0.18938053097345134, "frac_reward_zero_std": 0.0, "grad_norm": 0.30370856163159327, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 51704449.0, "reward": 0.7884765863418579, "reward_std": 0.3960021138191223, "rewards/execution_accuracy_EX/mean": 0.77734375, "rewards/execution_accuracy_EX/std": 0.41684433817863464, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9841630458831787, "sampling/importance_sampling_ratio/min": 0.005370927508920431, "sampling/sampling_logp_difference/max": 5.226754665374756, "sampling/sampling_logp_difference/mean": 0.14041289687156677, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 243.91796875, "completions/mean_terminated_length": 243.91796875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.1818134058266878, "epoch": 0.1911504424778761, "frac_reward_zero_std": 0.0, "grad_norm": 0.3495723097955485, "learning_rate": 1e-06, "loss": -0.0063, "num_tokens": 52143356.0, "reward": 0.669726550579071, "reward_std": 0.4533010721206665, "rewards/execution_accuracy_EX/mean": 0.65234375, "rewards/execution_accuracy_EX/std": 0.4771590530872345, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9897716045379639, "sampling/importance_sampling_ratio/min": 0.011125700548291206, "sampling/sampling_logp_difference/max": 4.498497486114502, "sampling/sampling_logp_difference/mean": 0.13901342451572418, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 277.91015625, "completions/mean_terminated_length": 277.91015625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.16938803531229496, "epoch": 0.1929203539823009, "frac_reward_zero_std": 0.0, "grad_norm": 0.3644169906885502, "learning_rate": 1e-06, "loss": -0.0032, "num_tokens": 52718741.0, "reward": 0.6400390863418579, "reward_std": 0.4617617428302765, "rewards/execution_accuracy_EX/mean": 0.62109375, "rewards/execution_accuracy_EX/std": 0.4860650300979614, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.988355815410614, "sampling/importance_sampling_ratio/min": 0.006754652131348848, "sampling/sampling_logp_difference/max": 4.997523784637451, "sampling/sampling_logp_difference/mean": 0.13667330145835876, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 227.5, "completions/mean_terminated_length": 227.5, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.16376989893615246, "epoch": 0.19469026548672566, "frac_reward_zero_std": 0.0, "grad_norm": 0.45963110454363654, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 53235269.0, "reward": 0.7699218988418579, "reward_std": 0.40778404474258423, "rewards/execution_accuracy_EX/mean": 0.7578125, "rewards/execution_accuracy_EX/std": 0.4292463958263397, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9892630577087402, "sampling/importance_sampling_ratio/min": 0.008661825209856033, "sampling/sampling_logp_difference/max": 4.7488298416137695, "sampling/sampling_logp_difference/mean": 0.13272030651569366, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 298.20703125, "completions/mean_terminated_length": 298.20703125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.18415713869035244, "epoch": 0.19646017699115045, "frac_reward_zero_std": 0.0, "grad_norm": 0.4609769686434164, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 53829322.0, "reward": 0.502734363079071, "reward_std": 0.47540730237960815, "rewards/execution_accuracy_EX/mean": 0.4765625, "rewards/execution_accuracy_EX/std": 0.5004287362098694, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9882371425628662, "sampling/importance_sampling_ratio/min": 0.011154396459460258, "sampling/sampling_logp_difference/max": 4.495921611785889, "sampling/sampling_logp_difference/mean": 0.1410694718360901, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 261.19921875, "completions/mean_terminated_length": 261.19921875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.16982518509030342, "epoch": 0.19823008849557522, "frac_reward_zero_std": 0.0, "grad_norm": 0.3012565908690922, "learning_rate": 1e-06, "loss": -0.0053, "num_tokens": 54329517.0, "reward": 0.5992187261581421, "reward_std": 0.47008487582206726, "rewards/execution_accuracy_EX/mean": 0.578125, "rewards/execution_accuracy_EX/std": 0.49482619762420654, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9867724776268005, "sampling/importance_sampling_ratio/min": 0.009282294660806656, "sampling/sampling_logp_difference/max": 4.6796464920043945, "sampling/sampling_logp_difference/mean": 0.13735702633857727, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 241.89453125, "completions/mean_terminated_length": 241.89453125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.1724399346858263, "epoch": 0.2, "frac_reward_zero_std": 0.0, "grad_norm": 0.3421487665342455, "learning_rate": 1e-06, "loss": 0.01, "num_tokens": 54818786.0, "reward": 0.6957030892372131, "reward_std": 0.44413506984710693, "rewards/execution_accuracy_EX/mean": 0.6796875, "rewards/execution_accuracy_EX/std": 0.4675106406211853, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9555773735046387, "sampling/importance_sampling_ratio/mean": 0.988462507724762, "sampling/importance_sampling_ratio/min": 0.014339153654873371, "sampling/sampling_logp_difference/max": 4.2447614669799805, "sampling/sampling_logp_difference/mean": 0.13636521995067596, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 259.01171875, "completions/mean_terminated_length": 259.01171875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.1605844870209694, "epoch": 0.20176991150442478, "frac_reward_zero_std": 0.0, "grad_norm": 0.2657140773286703, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 55275397.0, "reward": 0.48046875, "reward_std": 0.4738343358039856, "rewards/execution_accuracy_EX/mean": 0.453125, "rewards/execution_accuracy_EX/std": 0.4987730085849762, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9896270036697388, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.12312790751457214, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 271.6015625, "completions/mean_terminated_length": 271.6015625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.17996527440845966, "epoch": 0.20353982300884957, "frac_reward_zero_std": 0.0, "grad_norm": 0.3559506881222393, "learning_rate": 1e-06, "loss": -0.0155, "num_tokens": 55794847.0, "reward": 0.6029297113418579, "reward_std": 0.46948158740997314, "rewards/execution_accuracy_EX/mean": 0.58203125, "rewards/execution_accuracy_EX/std": 0.49419113993644714, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9921287894248962, "sampling/importance_sampling_ratio/min": 0.01839187555015087, "sampling/sampling_logp_difference/max": 3.9958462715148926, "sampling/sampling_logp_difference/mean": 0.1332182139158249, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 259.36328125, "completions/mean_terminated_length": 259.36328125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.17369364015758038, "epoch": 0.20530973451327433, "frac_reward_zero_std": 0.0, "grad_norm": 0.32450382246623954, "learning_rate": 1e-06, "loss": 0.0043, "num_tokens": 56206572.0, "reward": 0.5843750238418579, "reward_std": 0.47219759225845337, "rewards/execution_accuracy_EX/mean": 0.5625, "rewards/execution_accuracy_EX/std": 0.49705013632774353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9917175769805908, "sampling/importance_sampling_ratio/mean": 0.9929153919219971, "sampling/importance_sampling_ratio/min": 0.014437063597142696, "sampling/sampling_logp_difference/max": 4.237956523895264, "sampling/sampling_logp_difference/mean": 0.1287001669406891, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 262.88671875, "completions/mean_terminated_length": 262.88671875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.17147137969732285, "epoch": 0.20707964601769913, "frac_reward_zero_std": 0.0, "grad_norm": 0.2168605423484229, "learning_rate": 1e-06, "loss": 0.0088, "num_tokens": 56696175.0, "reward": 0.6734374761581421, "reward_std": 0.45209482312202454, "rewards/execution_accuracy_EX/mean": 0.65625, "rewards/execution_accuracy_EX/std": 0.47588926553726196, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9896504282951355, "sampling/importance_sampling_ratio/min": 0.011144374497234821, "sampling/sampling_logp_difference/max": 4.496820449829102, "sampling/sampling_logp_difference/mean": 0.1343401074409485, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 269.41796875, "completions/mean_terminated_length": 269.41796875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.19181371852755547, "epoch": 0.2088495575221239, "frac_reward_zero_std": 0.0, "grad_norm": 0.2932413456666399, "learning_rate": 1e-06, "loss": 0.0017, "num_tokens": 57192138.0, "reward": 0.5101562738418579, "reward_std": 0.4756980240345001, "rewards/execution_accuracy_EX/mean": 0.484375, "rewards/execution_accuracy_EX/std": 0.5007347464561462, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9915359020233154, "sampling/importance_sampling_ratio/min": 0.018446944653987885, "sampling/sampling_logp_difference/max": 3.992856502532959, "sampling/sampling_logp_difference/mean": 0.14277532696723938, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 256.64453125, "completions/mean_terminated_length": 256.64453125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.1774155180901289, "epoch": 0.21061946902654868, "frac_reward_zero_std": 0.0, "grad_norm": 0.12749049864679354, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 57596959.0, "reward": 0.576953113079071, "reward_std": 0.47307512164115906, "rewards/execution_accuracy_EX/mean": 0.5546875, "rewards/execution_accuracy_EX/std": 0.49797385931015015, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9892342686653137, "sampling/importance_sampling_ratio/min": 0.008668428286910057, "sampling/sampling_logp_difference/max": 4.748067855834961, "sampling/sampling_logp_difference/mean": 0.13478384912014008, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 267.2890625, "completions/mean_terminated_length": 267.2890625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.18676412478089333, "epoch": 0.21238938053097345, "frac_reward_zero_std": 0.0, "grad_norm": 0.36605313572657927, "learning_rate": 1e-06, "loss": -0.0062, "num_tokens": 58133417.0, "reward": 0.6251952648162842, "reward_std": 0.4652217924594879, "rewards/execution_accuracy_EX/mean": 0.60546875, "rewards/execution_accuracy_EX/std": 0.48970720171928406, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9920075535774231, "sampling/importance_sampling_ratio/min": 0.014339085668325424, "sampling/sampling_logp_difference/max": 4.2447662353515625, "sampling/sampling_logp_difference/mean": 0.13781046867370605, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 255.11328125, "completions/mean_terminated_length": 255.11328125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.1788367796689272, "epoch": 0.21415929203539824, "frac_reward_zero_std": 0.0, "grad_norm": 0.3330431759383045, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 58559350.0, "reward": 0.6363281011581421, "reward_std": 0.46267399191856384, "rewards/execution_accuracy_EX/mean": 0.6171875, "rewards/execution_accuracy_EX/std": 0.48702529072761536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9892991781234741, "sampling/importance_sampling_ratio/min": 0.010825454257428646, "sampling/sampling_logp_difference/max": 4.52585506439209, "sampling/sampling_logp_difference/mean": 0.13784188032150269, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 240.33984375, "completions/mean_terminated_length": 240.33984375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.17649833858013153, "epoch": 0.215929203539823, "frac_reward_zero_std": 0.0, "grad_norm": 0.15017924825176865, "learning_rate": 1e-06, "loss": 0.0058, "num_tokens": 59040813.0, "reward": 0.6474609375, "reward_std": 0.45984160900115967, "rewards/execution_accuracy_EX/mean": 0.62890625, "rewards/execution_accuracy_EX/std": 0.48404383659362793, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9871264696121216, "sampling/importance_sampling_ratio/min": 0.008697095327079296, "sampling/sampling_logp_difference/max": 4.7447662353515625, "sampling/sampling_logp_difference/mean": 0.13799268007278442, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 271.26953125, "completions/mean_terminated_length": 271.26953125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.18275177665054798, "epoch": 0.2176991150442478, "frac_reward_zero_std": 0.0, "grad_norm": 0.3987088809267594, "learning_rate": 1e-06, "loss": -0.0059, "num_tokens": 59434258.0, "reward": 0.5658203363418579, "reward_std": 0.4741697609424591, "rewards/execution_accuracy_EX/mean": 0.54296875, "rewards/execution_accuracy_EX/std": 0.4991260766983032, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9922703504562378, "sampling/importance_sampling_ratio/min": 0.011370888911187649, "sampling/sampling_logp_difference/max": 4.476698875427246, "sampling/sampling_logp_difference/mean": 0.132550448179245, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 237.4140625, "completions/mean_terminated_length": 237.4140625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.17214470729231834, "epoch": 0.21946902654867256, "frac_reward_zero_std": 0.0, "grad_norm": 0.38100763019028927, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 60053116.0, "reward": 0.47675779461860657, "reward_std": 0.47346949577331543, "rewards/execution_accuracy_EX/mean": 0.44921875, "rewards/execution_accuracy_EX/std": 0.49838894605636597, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9862518310546875, "sampling/importance_sampling_ratio/min": 0.018390560522675514, "sampling/sampling_logp_difference/max": 3.995917797088623, "sampling/sampling_logp_difference/mean": 0.13303159177303314, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 248.16796875, "completions/mean_terminated_length": 248.16796875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.17250299267470837, "epoch": 0.22123893805309736, "frac_reward_zero_std": 0.0, "grad_norm": 0.28478565039191006, "learning_rate": 1e-06, "loss": 0.0141, "num_tokens": 60342135.0, "reward": 0.7142578363418579, "reward_std": 0.4365212619304657, "rewards/execution_accuracy_EX/mean": 0.69921875, "rewards/execution_accuracy_EX/std": 0.45949608087539673, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9911350607872009, "sampling/importance_sampling_ratio/min": 0.023641232401132584, "sampling/sampling_logp_difference/max": 3.744762897491455, "sampling/sampling_logp_difference/mean": 0.13086634874343872, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 263.34765625, "completions/mean_terminated_length": 263.34765625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.17334261536598206, "epoch": 0.22300884955752212, "frac_reward_zero_std": 0.0, "grad_norm": 0.3397032515251031, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 60843104.0, "reward": 0.5435546636581421, "reward_std": 0.4755672216415405, "rewards/execution_accuracy_EX/mean": 0.51953125, "rewards/execution_accuracy_EX/std": 0.5005971193313599, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.969807744026184, "sampling/importance_sampling_ratio/mean": 0.9911377429962158, "sampling/importance_sampling_ratio/min": 0.01113718282431364, "sampling/sampling_logp_difference/max": 4.497466087341309, "sampling/sampling_logp_difference/mean": 0.13102008402347565, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 233.08203125, "completions/mean_terminated_length": 233.08203125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.1644600834697485, "epoch": 0.2247787610619469, "frac_reward_zero_std": 0.0, "grad_norm": 0.19006932124665715, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 61219861.0, "reward": 0.6808593273162842, "reward_std": 0.44958022236824036, "rewards/execution_accuracy_EX/mean": 0.6640625, "rewards/execution_accuracy_EX/std": 0.4732423722743988, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9873524308204651, "sampling/importance_sampling_ratio/min": 0.014340453781187534, "sampling/sampling_logp_difference/max": 4.244670867919922, "sampling/sampling_logp_difference/mean": 0.1292169690132141, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 243.5234375, "completions/mean_terminated_length": 243.5234375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.1923501305282116, "epoch": 0.22654867256637168, "frac_reward_zero_std": 0.0, "grad_norm": 0.3107724202625511, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 61738171.0, "reward": 0.5064452886581421, "reward_std": 0.47556719183921814, "rewards/execution_accuracy_EX/mean": 0.48046875, "rewards/execution_accuracy_EX/std": 0.5005971193313599, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9904196262359619, "sampling/importance_sampling_ratio/min": 0.011125722900032997, "sampling/sampling_logp_difference/max": 4.498495578765869, "sampling/sampling_logp_difference/mean": 0.14383336901664734, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 253.171875, "completions/mean_terminated_length": 253.171875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.17130203545093536, "epoch": 0.22831858407079647, "frac_reward_zero_std": 0.0, "grad_norm": 0.24078201406903801, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 62106407.0, "reward": 0.5658203363418579, "reward_std": 0.4741697609424591, "rewards/execution_accuracy_EX/mean": 0.54296875, "rewards/execution_accuracy_EX/std": 0.4991260766983032, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9352301359176636, "sampling/importance_sampling_ratio/mean": 0.9913920164108276, "sampling/importance_sampling_ratio/min": 0.011312858201563358, "sampling/sampling_logp_difference/max": 4.481815338134766, "sampling/sampling_logp_difference/mean": 0.13263952732086182, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 269.734375, "completions/mean_terminated_length": 269.734375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.1678872276097536, "epoch": 0.23008849557522124, "frac_reward_zero_std": 0.0, "grad_norm": 0.42323639693398374, "learning_rate": 1e-06, "loss": 0.0095, "num_tokens": 62519971.0, "reward": 0.606640636920929, "reward_std": 0.46884801983833313, "rewards/execution_accuracy_EX/mean": 0.5859375, "rewards/execution_accuracy_EX/std": 0.4935242533683777, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9896361827850342, "sampling/importance_sampling_ratio/min": 0.0031865073833614588, "sampling/sampling_logp_difference/max": 5.7488298416137695, "sampling/sampling_logp_difference/mean": 0.12853194773197174, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 254.296875, "completions/mean_terminated_length": 254.296875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.174601873382926, "epoch": 0.23185840707964603, "frac_reward_zero_std": 0.0, "grad_norm": 0.4026612194940439, "learning_rate": 1e-06, "loss": -0.0024, "num_tokens": 63299167.0, "reward": 0.6214843988418579, "reward_std": 0.46600866317749023, "rewards/execution_accuracy_EX/mean": 0.6015625, "rewards/execution_accuracy_EX/std": 0.4905354380607605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9881101250648499, "sampling/importance_sampling_ratio/min": 0.008679230697453022, "sampling/sampling_logp_difference/max": 4.746822357177734, "sampling/sampling_logp_difference/mean": 0.13689565658569336, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 243.44921875, "completions/mean_terminated_length": 243.44921875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.17167786695063114, "epoch": 0.2336283185840708, "frac_reward_zero_std": 0.0, "grad_norm": 0.3223429311177653, "learning_rate": 1e-06, "loss": 0.0173, "num_tokens": 63750914.0, "reward": 0.6957030892372131, "reward_std": 0.44413506984710693, "rewards/execution_accuracy_EX/mean": 0.6796875, "rewards/execution_accuracy_EX/std": 0.4675106406211853, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9898079633712769, "sampling/importance_sampling_ratio/min": 0.016445204615592957, "sampling/sampling_logp_difference/max": 4.107721328735352, "sampling/sampling_logp_difference/mean": 0.13387587666511536, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 256.203125, "completions/mean_terminated_length": 256.203125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.16661197133362293, "epoch": 0.23539823008849559, "frac_reward_zero_std": 0.0, "grad_norm": 0.18602760671024346, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 64151302.0, "reward": 0.6363281011581421, "reward_std": 0.46267402172088623, "rewards/execution_accuracy_EX/mean": 0.6171875, "rewards/execution_accuracy_EX/std": 0.48702529072761536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9915856122970581, "sampling/importance_sampling_ratio/min": 0.008679230697453022, "sampling/sampling_logp_difference/max": 4.746822357177734, "sampling/sampling_logp_difference/mean": 0.1272910237312317, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 276.57421875, "completions/mean_terminated_length": 276.57421875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.16904133558273315, "epoch": 0.23716814159292035, "frac_reward_zero_std": 0.0, "grad_norm": 0.07880297011259281, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 64605833.0, "reward": 0.6474609375, "reward_std": 0.45984160900115967, "rewards/execution_accuracy_EX/mean": 0.62890625, "rewards/execution_accuracy_EX/std": 0.48404383659362793, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9918218851089478, "sampling/importance_sampling_ratio/min": 0.01118436548858881, "sampling/sampling_logp_difference/max": 4.49323844909668, "sampling/sampling_logp_difference/mean": 0.13078564405441284, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 272.41796875, "completions/mean_terminated_length": 272.41796875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.16213511303067207, "epoch": 0.23893805309734514, "frac_reward_zero_std": 0.0, "grad_norm": 0.2582515824201466, "learning_rate": 1e-06, "loss": -0.0068, "num_tokens": 64966548.0, "reward": 0.651171863079071, "reward_std": 0.45883336663246155, "rewards/execution_accuracy_EX/mean": 0.6328125, "rewards/execution_accuracy_EX/std": 0.48298248648643494, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9916924834251404, "sampling/importance_sampling_ratio/min": 0.008679230697453022, "sampling/sampling_logp_difference/max": 4.746822357177734, "sampling/sampling_logp_difference/mean": 0.12518557906150818, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 265.6015625, "completions/mean_terminated_length": 265.6015625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.17535862512886524, "epoch": 0.2407079646017699, "frac_reward_zero_std": 0.0, "grad_norm": 0.3891175004639357, "learning_rate": 1e-06, "loss": -0.0067, "num_tokens": 65456558.0, "reward": 0.40625, "reward_std": 0.4608176648616791, "rewards/execution_accuracy_EX/mean": 0.375, "rewards/execution_accuracy_EX/std": 0.4850712716579437, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9876947999000549, "sampling/importance_sampling_ratio/min": 0.006748078390955925, "sampling/sampling_logp_difference/max": 4.998497486114502, "sampling/sampling_logp_difference/mean": 0.136422261595726, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 277.9375, "completions/mean_terminated_length": 277.9375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.1768400277942419, "epoch": 0.2424778761061947, "frac_reward_zero_std": 0.0, "grad_norm": 0.29718461934694185, "learning_rate": 1e-06, "loss": 0.0062, "num_tokens": 65937470.0, "reward": 0.550976574420929, "reward_std": 0.4752182364463806, "rewards/execution_accuracy_EX/mean": 0.52734375, "rewards/execution_accuracy_EX/std": 0.5002297759056091, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9903590083122253, "sampling/importance_sampling_ratio/min": 0.004103484563529491, "sampling/sampling_logp_difference/max": 5.4959187507629395, "sampling/sampling_logp_difference/mean": 0.13497892022132874, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 283.0234375, "completions/mean_terminated_length": 283.0234375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.18021181970834732, "epoch": 0.24424778761061947, "frac_reward_zero_std": 0.0, "grad_norm": 0.3753146574870807, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 66447268.0, "reward": 0.5658202767372131, "reward_std": 0.4741697609424591, "rewards/execution_accuracy_EX/mean": 0.54296875, "rewards/execution_accuracy_EX/std": 0.4991260766983032, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9930406808853149, "sampling/importance_sampling_ratio/min": 0.0183610487729311, "sampling/sampling_logp_difference/max": 3.997523784637451, "sampling/sampling_logp_difference/mean": 0.13213272392749786, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 273.5859375, "completions/mean_terminated_length": 273.5859375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.17577782832086086, "epoch": 0.24601769911504426, "frac_reward_zero_std": 0.0, "grad_norm": 0.32820941355287014, "learning_rate": 1e-06, "loss": 0.0069, "num_tokens": 66945450.0, "reward": 0.576953113079071, "reward_std": 0.47307515144348145, "rewards/execution_accuracy_EX/mean": 0.5546875, "rewards/execution_accuracy_EX/std": 0.49797385931015015, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9882457256317139, "sampling/importance_sampling_ratio/min": 0.01118464209139347, "sampling/sampling_logp_difference/max": 4.493213653564453, "sampling/sampling_logp_difference/mean": 0.13495786488056183, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 259.58203125, "completions/mean_terminated_length": 259.58203125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.1591511070728302, "epoch": 0.24778761061946902, "frac_reward_zero_std": 0.0, "grad_norm": 0.36036338433285736, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 67327455.0, "reward": 0.7513672113418579, "reward_std": 0.4184097945690155, "rewards/execution_accuracy_EX/mean": 0.73828125, "rewards/execution_accuracy_EX/std": 0.4404313564300537, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9880623817443848, "sampling/importance_sampling_ratio/min": 0.0031929106917232275, "sampling/sampling_logp_difference/max": 5.746822357177734, "sampling/sampling_logp_difference/mean": 0.12579825520515442, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 256.45703125, "completions/mean_terminated_length": 256.45703125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.15968062169849873, "epoch": 0.24955752212389382, "frac_reward_zero_std": 0.0, "grad_norm": 0.06441388540722605, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 67878948.0, "reward": 0.7587890625, "reward_std": 0.4142923355102539, "rewards/execution_accuracy_EX/mean": 0.74609375, "rewards/execution_accuracy_EX/std": 0.4360972046852112, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9889984130859375, "sampling/importance_sampling_ratio/min": 0.0148173151537776, "sampling/sampling_logp_difference/max": 4.211958885192871, "sampling/sampling_logp_difference/mean": 0.12597008049488068, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 276.77734375, "completions/mean_terminated_length": 276.77734375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.1671825349330902, "epoch": 0.2513274336283186, "frac_reward_zero_std": 0.0, "grad_norm": 0.34538557229592415, "learning_rate": 1e-06, "loss": 0.0205, "num_tokens": 68440971.0, "reward": 0.7105468511581421, "reward_std": 0.43811774253845215, "rewards/execution_accuracy_EX/mean": 0.6953125, "rewards/execution_accuracy_EX/std": 0.4611765742301941, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.988913893699646, "sampling/importance_sampling_ratio/min": 0.011154407635331154, "sampling/sampling_logp_difference/max": 4.495920658111572, "sampling/sampling_logp_difference/mean": 0.12981252372264862, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 236.92578125, "completions/mean_terminated_length": 236.92578125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.16795746237039566, "epoch": 0.25309734513274335, "frac_reward_zero_std": 0.0, "grad_norm": 0.2575532753923275, "learning_rate": 1e-06, "loss": 0.0066, "num_tokens": 68819368.0, "reward": 0.6177734136581421, "reward_std": 0.46676456928253174, "rewards/execution_accuracy_EX/mean": 0.59765625, "rewards/execution_accuracy_EX/std": 0.4913311004638672, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9547626972198486, "sampling/importance_sampling_ratio/mean": 0.98448246717453, "sampling/importance_sampling_ratio/min": 0.014282699674367905, "sampling/sampling_logp_difference/max": 4.248706340789795, "sampling/sampling_logp_difference/mean": 0.13988810777664185, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 233.71875, "completions/mean_terminated_length": 233.71875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.1615072637796402, "epoch": 0.25486725663716814, "frac_reward_zero_std": 0.0, "grad_norm": 0.2598507678217811, "learning_rate": 1e-06, "loss": 0.0063, "num_tokens": 69187008.0, "reward": 0.799609363079071, "reward_std": 0.38833457231521606, "rewards/execution_accuracy_EX/mean": 0.7890625, "rewards/execution_accuracy_EX/std": 0.4087733030319214, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.98479163646698, "sampling/importance_sampling_ratio/min": 0.018343178555369377, "sampling/sampling_logp_difference/max": 3.998497486114502, "sampling/sampling_logp_difference/mean": 0.13842977583408356, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 269.00390625, "completions/mean_terminated_length": 269.00390625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.1655492577701807, "epoch": 0.25663716814159293, "frac_reward_zero_std": 0.0, "grad_norm": 0.328328829482272, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 69518417.0, "reward": 0.743945300579071, "reward_std": 0.42235618829727173, "rewards/execution_accuracy_EX/mean": 0.73046875, "rewards/execution_accuracy_EX/std": 0.44458550214767456, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9880072474479675, "sampling/importance_sampling_ratio/min": 0.01430963259190321, "sampling/sampling_logp_difference/max": 4.246822357177734, "sampling/sampling_logp_difference/mean": 0.13215236365795135, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 245.9609375, "completions/mean_terminated_length": 245.9609375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.15269946679472923, "epoch": 0.2584070796460177, "frac_reward_zero_std": 0.0, "grad_norm": 0.32541737697058953, "learning_rate": 1e-06, "loss": 0.0036, "num_tokens": 70031431.0, "reward": 0.8330078125, "reward_std": 0.36231058835983276, "rewards/execution_accuracy_EX/mean": 0.82421875, "rewards/execution_accuracy_EX/std": 0.3813795745372772, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9864132404327393, "sampling/importance_sampling_ratio/min": 0.023737918585538864, "sampling/sampling_logp_difference/max": 3.7406816482543945, "sampling/sampling_logp_difference/mean": 0.1274140328168869, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 273.55078125, "completions/mean_terminated_length": 273.55078125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.16360680013895035, "epoch": 0.26017699115044246, "frac_reward_zero_std": 0.0, "grad_norm": 0.2289894642119028, "learning_rate": 1e-06, "loss": -0.0083, "num_tokens": 70396868.0, "reward": 0.666015625, "reward_std": 0.4544737637042999, "rewards/execution_accuracy_EX/mean": 0.6484375, "rewards/execution_accuracy_EX/std": 0.47839346528053284, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9864295721054077, "sampling/importance_sampling_ratio/min": 0.00015113348490558565, "sampling/sampling_logp_difference/max": 8.797347068786621, "sampling/sampling_logp_difference/mean": 0.13306456804275513, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 287.44140625, "completions/mean_terminated_length": 287.44140625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.17404436320066452, "epoch": 0.26194690265486725, "frac_reward_zero_std": 0.0, "grad_norm": 0.3945935746219545, "learning_rate": 1e-06, "loss": -0.0087, "num_tokens": 70870485.0, "reward": 0.6214843392372131, "reward_std": 0.46600863337516785, "rewards/execution_accuracy_EX/mean": 0.6015625, "rewards/execution_accuracy_EX/std": 0.4905354380607605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9925150871276855, "sampling/importance_sampling_ratio/min": 0.016433831304311752, "sampling/sampling_logp_difference/max": 4.108413219451904, "sampling/sampling_logp_difference/mean": 0.1313704252243042, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 264.265625, "completions/mean_terminated_length": 264.265625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.1564411912113428, "epoch": 0.26371681415929205, "frac_reward_zero_std": 0.0, "grad_norm": 0.20620659707112282, "learning_rate": 1e-06, "loss": -0.0049, "num_tokens": 71283801.0, "reward": 0.595507800579071, "reward_std": 0.47065800428390503, "rewards/execution_accuracy_EX/mean": 0.57421875, "rewards/execution_accuracy_EX/std": 0.49542948603630066, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9856451749801636, "sampling/importance_sampling_ratio/min": 0.0024849013425409794, "sampling/sampling_logp_difference/max": 5.997522354125977, "sampling/sampling_logp_difference/mean": 0.13136112689971924, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 267.58984375, "completions/mean_terminated_length": 267.58984375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.15700889751315117, "epoch": 0.26548672566371684, "frac_reward_zero_std": 0.0, "grad_norm": 0.13476004624860974, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 71625792.0, "reward": 0.6363281011581421, "reward_std": 0.46267399191856384, "rewards/execution_accuracy_EX/mean": 0.6171875, "rewards/execution_accuracy_EX/std": 0.48702529072761536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9884984493255615, "sampling/importance_sampling_ratio/mean": 0.9865537881851196, "sampling/importance_sampling_ratio/min": 0.01430963259190321, "sampling/sampling_logp_difference/max": 4.246822357177734, "sampling/sampling_logp_difference/mean": 0.13188934326171875, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 244.03515625, "completions/mean_terminated_length": 244.03515625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.16816776245832443, "epoch": 0.2672566371681416, "frac_reward_zero_std": 0.0, "grad_norm": 0.2702862154438436, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 72173017.0, "reward": 0.6697266101837158, "reward_std": 0.4533010721206665, "rewards/execution_accuracy_EX/mean": 0.65234375, "rewards/execution_accuracy_EX/std": 0.4771590530872345, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9873220920562744, "sampling/importance_sampling_ratio/min": 0.011136534623801708, "sampling/sampling_logp_difference/max": 4.497524261474609, "sampling/sampling_logp_difference/mean": 0.13630010187625885, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 256.57421875, "completions/mean_terminated_length": 256.57421875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.16193786077201366, "epoch": 0.26902654867256637, "frac_reward_zero_std": 0.0, "grad_norm": 0.3475885697485575, "learning_rate": 1e-06, "loss": 0.003, "num_tokens": 72721228.0, "reward": 0.62890625, "reward_std": 0.46440389752388, "rewards/execution_accuracy_EX/mean": 0.609375, "rewards/execution_accuracy_EX/std": 0.48884621262550354, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9524896144866943, "sampling/importance_sampling_ratio/mean": 0.9891369938850403, "sampling/importance_sampling_ratio/min": 0.011183853261172771, "sampling/sampling_logp_difference/max": 4.493284225463867, "sampling/sampling_logp_difference/mean": 0.1292227804660797, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 282.72265625, "completions/mean_terminated_length": 282.72265625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.16410168260335922, "epoch": 0.27079646017699116, "frac_reward_zero_std": 0.0, "grad_norm": 0.2165331236348191, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 73422693.0, "reward": 0.6066405773162842, "reward_std": 0.46884801983833313, "rewards/execution_accuracy_EX/mean": 0.5859375, "rewards/execution_accuracy_EX/std": 0.4935242533683777, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9891854524612427, "sampling/importance_sampling_ratio/min": 0.008661878295242786, "sampling/sampling_logp_difference/max": 4.748823642730713, "sampling/sampling_logp_difference/mean": 0.13202348351478577, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 311.43359375, "completions/mean_terminated_length": 311.43359375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.16593420505523682, "epoch": 0.27256637168141595, "frac_reward_zero_std": 0.0, "grad_norm": 0.3299510582742183, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 74046020.0, "reward": 0.680859386920929, "reward_std": 0.44958025217056274, "rewards/execution_accuracy_EX/mean": 0.6640625, "rewards/execution_accuracy_EX/std": 0.4732423722743988, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9876483678817749, "sampling/importance_sampling_ratio/min": 0.018390489742159843, "sampling/sampling_logp_difference/max": 3.9959216117858887, "sampling/sampling_logp_difference/mean": 0.1317213922739029, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 268.265625, "completions/mean_terminated_length": 268.265625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.17522499524056911, "epoch": 0.2743362831858407, "frac_reward_zero_std": 0.0, "grad_norm": 0.12994389119765543, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 74565416.0, "reward": 0.5138671398162842, "reward_std": 0.4757997393608093, "rewards/execution_accuracy_EX/mean": 0.48828125, "rewards/execution_accuracy_EX/std": 0.5008418560028076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9848809242248535, "sampling/importance_sampling_ratio/min": 0.0183610487729311, "sampling/sampling_logp_difference/max": 3.997523784637451, "sampling/sampling_logp_difference/mean": 0.1431981474161148, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 287.70703125, "completions/mean_terminated_length": 287.70703125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.177983570843935, "epoch": 0.2761061946902655, "frac_reward_zero_std": 0.0, "grad_norm": 0.2242206823726338, "learning_rate": 1e-06, "loss": -0.0046, "num_tokens": 75214829.0, "reward": 0.6845703125, "reward_std": 0.44827142357826233, "rewards/execution_accuracy_EX/mean": 0.66796875, "rewards/execution_accuracy_EX/std": 0.4718646705150604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9876867532730103, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.1371629387140274, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 298.37109375, "completions/mean_terminated_length": 298.37109375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.18536010943353176, "epoch": 0.2778761061946903, "frac_reward_zero_std": 0.0, "grad_norm": 0.3347005196171514, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 75730348.0, "reward": 0.5287109613418579, "reward_std": 0.47591593861579895, "rewards/execution_accuracy_EX/mean": 0.50390625, "rewards/execution_accuracy_EX/std": 0.5009641647338867, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9898300170898438, "sampling/importance_sampling_ratio/min": 0.011232499033212662, "sampling/sampling_logp_difference/max": 4.488944053649902, "sampling/sampling_logp_difference/mean": 0.141252338886261, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 274.41015625, "completions/mean_terminated_length": 274.41015625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.1710018888115883, "epoch": 0.27964601769911507, "frac_reward_zero_std": 0.0, "grad_norm": 0.2685527765456495, "learning_rate": 1e-06, "loss": -0.0047, "num_tokens": 76249333.0, "reward": 0.703125, "reward_std": 0.4411993622779846, "rewards/execution_accuracy_EX/mean": 0.6875, "rewards/execution_accuracy_EX/std": 0.4644203782081604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.989202618598938, "sampling/importance_sampling_ratio/min": 0.0067833466455340385, "sampling/sampling_logp_difference/max": 4.993284702301025, "sampling/sampling_logp_difference/mean": 0.13519498705863953, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 282.72265625, "completions/mean_terminated_length": 282.72265625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.16912869177758694, "epoch": 0.2814159292035398, "frac_reward_zero_std": 0.0, "grad_norm": 0.27739637030277553, "learning_rate": 1e-06, "loss": 0.0047, "num_tokens": 76804670.0, "reward": 0.7921874523162842, "reward_std": 0.393498033285141, "rewards/execution_accuracy_EX/mean": 0.78125, "rewards/execution_accuracy_EX/std": 0.41420844197273254, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9852451682090759, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.13660919666290283, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 292.8046875, "completions/mean_terminated_length": 292.8046875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.1675608493387699, "epoch": 0.2831858407079646, "frac_reward_zero_std": 0.0, "grad_norm": 0.13822705311395195, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 77297004.0, "reward": 0.517578125, "reward_std": 0.47587236762046814, "rewards/execution_accuracy_EX/mean": 0.4921875, "rewards/execution_accuracy_EX/std": 0.5009182691574097, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9882014989852905, "sampling/importance_sampling_ratio/min": 0.011154396459460258, "sampling/sampling_logp_difference/max": 4.495921611785889, "sampling/sampling_logp_difference/mean": 0.13478179275989532, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 321.77734375, "completions/mean_terminated_length": 321.77734375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.18110738322138786, "epoch": 0.2849557522123894, "frac_reward_zero_std": 0.0, "grad_norm": 0.30001543636542144, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 77913779.0, "reward": 0.6326172351837158, "reward_std": 0.46355465054512024, "rewards/execution_accuracy_EX/mean": 0.61328125, "rewards/execution_accuracy_EX/std": 0.4879522919654846, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9877785444259644, "sampling/importance_sampling_ratio/min": 0.011312620714306831, "sampling/sampling_logp_difference/max": 4.481836318969727, "sampling/sampling_logp_difference/mean": 0.13775095343589783, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 319.28515625, "completions/mean_terminated_length": 319.28515625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.17902282439172268, "epoch": 0.2867256637168142, "frac_reward_zero_std": 0.0, "grad_norm": 0.28813047849773493, "learning_rate": 1e-06, "loss": 0.0106, "num_tokens": 78299196.0, "reward": 0.5435546636581421, "reward_std": 0.4755672216415405, "rewards/execution_accuracy_EX/mean": 0.51953125, "rewards/execution_accuracy_EX/std": 0.5005971193313599, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9866389036178589, "sampling/importance_sampling_ratio/min": 0.011125700548291206, "sampling/sampling_logp_difference/max": 4.498497486114502, "sampling/sampling_logp_difference/mean": 0.14283457398414612, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 317.91796875, "completions/mean_terminated_length": 317.91796875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.19017246179282665, "epoch": 0.2884955752212389, "frac_reward_zero_std": 0.0, "grad_norm": 0.3032423743448608, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 78834887.0, "reward": 0.595507800579071, "reward_std": 0.4706580340862274, "rewards/execution_accuracy_EX/mean": 0.57421875, "rewards/execution_accuracy_EX/std": 0.49542948603630066, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9934765696525574, "sampling/importance_sampling_ratio/min": 0.017504168674349785, "sampling/sampling_logp_difference/max": 4.045316219329834, "sampling/sampling_logp_difference/mean": 0.13656078279018402, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 298.24609375, "completions/mean_terminated_length": 298.24609375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.17399132438004017, "epoch": 0.2902654867256637, "frac_reward_zero_std": 0.0, "grad_norm": 0.2715622490557467, "learning_rate": 1e-06, "loss": -0.0032, "num_tokens": 79380054.0, "reward": 0.6585937142372131, "reward_std": 0.4567192792892456, "rewards/execution_accuracy_EX/mean": 0.640625, "rewards/execution_accuracy_EX/std": 0.4807571768760681, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9891414642333984, "sampling/importance_sampling_ratio/min": 0.008668584749102592, "sampling/sampling_logp_difference/max": 4.748049736022949, "sampling/sampling_logp_difference/mean": 0.13266518712043762, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 255.90625, "completions/mean_terminated_length": 255.90625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.16244931519031525, "epoch": 0.2920353982300885, "frac_reward_zero_std": 0.0, "grad_norm": 0.14016964939870763, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 79895086.0, "reward": 0.7699218988418579, "reward_std": 0.4077840745449066, "rewards/execution_accuracy_EX/mean": 0.7578125, "rewards/execution_accuracy_EX/std": 0.4292463958263397, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9870266914367676, "sampling/importance_sampling_ratio/min": 0.011138302274048328, "sampling/sampling_logp_difference/max": 4.497365474700928, "sampling/sampling_logp_difference/mean": 0.12949419021606445, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 285.9609375, "completions/mean_terminated_length": 285.9609375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.19027838110923767, "epoch": 0.2938053097345133, "frac_reward_zero_std": 0.0, "grad_norm": 0.30834547888307096, "learning_rate": 1e-06, "loss": 0.0218, "num_tokens": 80428116.0, "reward": 0.699414074420929, "reward_std": 0.4426852762699127, "rewards/execution_accuracy_EX/mean": 0.68359375, "rewards/execution_accuracy_EX/std": 0.4659844934940338, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9892711639404297, "sampling/importance_sampling_ratio/min": 0.011172729544341564, "sampling/sampling_logp_difference/max": 4.494279384613037, "sampling/sampling_logp_difference/mean": 0.14073659479618073, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 309.46875, "completions/mean_terminated_length": 309.46875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.20458186231553555, "epoch": 0.29557522123893804, "frac_reward_zero_std": 0.0, "grad_norm": 0.27987359383856286, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 80961324.0, "reward": 0.4136718809604645, "reward_std": 0.46267399191856384, "rewards/execution_accuracy_EX/mean": 0.3828125, "rewards/execution_accuracy_EX/std": 0.48702529072761536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9867178201675415, "sampling/importance_sampling_ratio/min": 0.014291773550212383, "sampling/sampling_logp_difference/max": 4.248071193695068, "sampling/sampling_logp_difference/mean": 0.15584495663642883, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 277.1640625, "completions/mean_terminated_length": 277.1640625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.17713839747011662, "epoch": 0.2973451327433628, "frac_reward_zero_std": 0.0, "grad_norm": 0.2985499406165058, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 81407158.0, "reward": 0.632617175579071, "reward_std": 0.46355465054512024, "rewards/execution_accuracy_EX/mean": 0.61328125, "rewards/execution_accuracy_EX/std": 0.4879522919654846, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9890908002853394, "sampling/importance_sampling_ratio/min": 0.011154396459460258, "sampling/sampling_logp_difference/max": 4.495921611785889, "sampling/sampling_logp_difference/mean": 0.13786716759204865, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 297.62109375, "completions/mean_terminated_length": 297.62109375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.17823628522455692, "epoch": 0.2991150442477876, "frac_reward_zero_std": 0.0, "grad_norm": 0.14719772917799118, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 81898245.0, "reward": 0.4619140625, "reward_std": 0.4717142879962921, "rewards/execution_accuracy_EX/mean": 0.43359375, "rewards/execution_accuracy_EX/std": 0.4965413510799408, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.98805832862854, "sampling/importance_sampling_ratio/min": 0.008699355646967888, "sampling/sampling_logp_difference/max": 4.744506359100342, "sampling/sampling_logp_difference/mean": 0.1376134157180786, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 290.34375, "completions/mean_terminated_length": 290.34375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.18065193854272366, "epoch": 0.3008849557522124, "frac_reward_zero_std": 0.0, "grad_norm": 0.45248351948725557, "learning_rate": 1e-06, "loss": -0.0029, "num_tokens": 82287421.0, "reward": 0.7328125238418579, "reward_std": 0.4279654324054718, "rewards/execution_accuracy_EX/mean": 0.71875, "rewards/execution_accuracy_EX/std": 0.45048993825912476, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9880305528640747, "sampling/importance_sampling_ratio/min": 0.006766550708562136, "sampling/sampling_logp_difference/max": 4.995763778686523, "sampling/sampling_logp_difference/mean": 0.13845156133174896, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 305.90625, "completions/mean_terminated_length": 305.90625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.19709263555705547, "epoch": 0.30265486725663715, "frac_reward_zero_std": 0.0, "grad_norm": 0.3640781475914555, "learning_rate": 1e-06, "loss": 0.0195, "num_tokens": 82663061.0, "reward": 0.7216796875, "reward_std": 0.4332149624824524, "rewards/execution_accuracy_EX/mean": 0.70703125, "rewards/execution_accuracy_EX/std": 0.45601576566696167, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9894841313362122, "sampling/importance_sampling_ratio/min": 0.008661825209856033, "sampling/sampling_logp_difference/max": 4.7488298416137695, "sampling/sampling_logp_difference/mean": 0.14712056517601013, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1188.0, "completions/max_terminated_length": 1188.0, "completions/mean_length": 307.359375, "completions/mean_terminated_length": 307.359375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.17272526398301125, "epoch": 0.30442477876106194, "frac_reward_zero_std": 0.0, "grad_norm": 0.4270821242863743, "learning_rate": 1e-06, "loss": 0.0233, "num_tokens": 83177265.0, "reward": 0.8404296636581421, "reward_std": 0.3558422923088074, "rewards/execution_accuracy_EX/mean": 0.83203125, "rewards/execution_accuracy_EX/std": 0.3745708465576172, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9897280335426331, "sampling/importance_sampling_ratio/min": 0.014291773550212383, "sampling/sampling_logp_difference/max": 4.248071193695068, "sampling/sampling_logp_difference/mean": 0.1292266994714737, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 300.59765625, "completions/mean_terminated_length": 300.59765625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.18985807336866856, "epoch": 0.30619469026548674, "frac_reward_zero_std": 0.0, "grad_norm": 0.2663472458377981, "learning_rate": 1e-06, "loss": 0.0043, "num_tokens": 83831338.0, "reward": 0.651171863079071, "reward_std": 0.45883333683013916, "rewards/execution_accuracy_EX/mean": 0.6328125, "rewards/execution_accuracy_EX/std": 0.48298248648643494, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9870967268943787, "sampling/importance_sampling_ratio/min": 0.023641685023903847, "sampling/sampling_logp_difference/max": 3.744743824005127, "sampling/sampling_logp_difference/mean": 0.14291070401668549, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 307.375, "completions/mean_terminated_length": 307.375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.18669777922332287, "epoch": 0.30796460176991153, "frac_reward_zero_std": 0.0, "grad_norm": 0.20000621607629301, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 84321114.0, "reward": 0.8033202886581421, "reward_std": 0.3856732249259949, "rewards/execution_accuracy_EX/mean": 0.79296875, "rewards/execution_accuracy_EX/std": 0.40597182512283325, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9896946549415588, "sampling/importance_sampling_ratio/min": 0.014280935749411583, "sampling/sampling_logp_difference/max": 4.2488298416137695, "sampling/sampling_logp_difference/mean": 0.1368214637041092, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 320.828125, "completions/mean_terminated_length": 320.828125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.18010174669325352, "epoch": 0.30973451327433627, "frac_reward_zero_std": 0.0, "grad_norm": 0.248299608727396, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 84835598.0, "reward": 0.6845703125, "reward_std": 0.44827142357826233, "rewards/execution_accuracy_EX/mean": 0.66796875, "rewards/execution_accuracy_EX/std": 0.4718646705150604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.988377571105957, "sampling/importance_sampling_ratio/min": 0.014339106157422066, "sampling/sampling_logp_difference/max": 4.244764804840088, "sampling/sampling_logp_difference/mean": 0.1342880129814148, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 325.0625, "completions/mean_terminated_length": 325.0625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.20411342196166515, "epoch": 0.31150442477876106, "frac_reward_zero_std": 0.0, "grad_norm": 0.29122051272085764, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 85382494.0, "reward": 0.591796875, "reward_std": 0.471201092004776, "rewards/execution_accuracy_EX/mean": 0.5703125, "rewards/execution_accuracy_EX/std": 0.4960011839866638, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9897286891937256, "sampling/importance_sampling_ratio/min": 0.01123241800814867, "sampling/sampling_logp_difference/max": 4.488951206207275, "sampling/sampling_logp_difference/mean": 0.1492016464471817, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 332.25390625, "completions/mean_terminated_length": 332.25390625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.19850089587271214, "epoch": 0.31327433628318585, "frac_reward_zero_std": 0.0, "grad_norm": 0.42684242218641294, "learning_rate": 1e-06, "loss": -0.0164, "num_tokens": 86069583.0, "reward": 0.5249999761581421, "reward_std": 0.47593045234680176, "rewards/execution_accuracy_EX/mean": 0.5, "rewards/execution_accuracy_EX/std": 0.5009794235229492, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9907925128936768, "sampling/importance_sampling_ratio/min": 0.00017096343799494207, "sampling/sampling_logp_difference/max": 8.674060821533203, "sampling/sampling_logp_difference/mean": 0.14654508233070374, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 320.79296875, "completions/mean_terminated_length": 320.79296875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.19999371655285358, "epoch": 0.31504424778761064, "frac_reward_zero_std": 0.0, "grad_norm": 0.14002915813151406, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 86605770.0, "reward": 0.632617175579071, "reward_std": 0.46355465054512024, "rewards/execution_accuracy_EX/mean": 0.61328125, "rewards/execution_accuracy_EX/std": 0.4879522919654846, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9907433986663818, "sampling/importance_sampling_ratio/min": 0.004132173955440521, "sampling/sampling_logp_difference/max": 5.488951683044434, "sampling/sampling_logp_difference/mean": 0.14724189043045044, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 335.09375, "completions/mean_terminated_length": 335.09375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.1902732029557228, "epoch": 0.3168141592920354, "frac_reward_zero_std": 0.0, "grad_norm": 0.21232421123556744, "learning_rate": 1e-06, "loss": -0.0085, "num_tokens": 87193362.0, "reward": 0.550976574420929, "reward_std": 0.4752182364463806, "rewards/execution_accuracy_EX/mean": 0.52734375, "rewards/execution_accuracy_EX/std": 0.5002297759056091, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9927315711975098, "sampling/importance_sampling_ratio/min": 0.0025241784751415253, "sampling/sampling_logp_difference/max": 5.981839656829834, "sampling/sampling_logp_difference/mean": 0.13706091046333313, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 281.734375, "completions/mean_terminated_length": 281.734375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.1805352121591568, "epoch": 0.3185840707964602, "frac_reward_zero_std": 0.0, "grad_norm": 0.2798660271881851, "learning_rate": 1e-06, "loss": -0.0041, "num_tokens": 87866670.0, "reward": 0.7142578363418579, "reward_std": 0.4365212917327881, "rewards/execution_accuracy_EX/mean": 0.69921875, "rewards/execution_accuracy_EX/std": 0.45949608087539673, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9869506359100342, "sampling/importance_sampling_ratio/min": 0.018529091030359268, "sampling/sampling_logp_difference/max": 3.9884133338928223, "sampling/sampling_logp_difference/mean": 0.1430865377187729, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 273.640625, "completions/mean_terminated_length": 273.640625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.17408888787031174, "epoch": 0.32035398230088497, "frac_reward_zero_std": 0.0, "grad_norm": 0.24584557137817806, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 88212738.0, "reward": 0.550976574420929, "reward_std": 0.4752182364463806, "rewards/execution_accuracy_EX/mean": 0.52734375, "rewards/execution_accuracy_EX/std": 0.5002297759056091, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.916461706161499, "sampling/importance_sampling_ratio/mean": 0.9846568703651428, "sampling/importance_sampling_ratio/min": 0.008661825209856033, "sampling/sampling_logp_difference/max": 4.7488298416137695, "sampling/sampling_logp_difference/mean": 0.14325553178787231, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 324.453125, "completions/mean_terminated_length": 324.453125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.18714475817978382, "epoch": 0.32212389380530976, "frac_reward_zero_std": 0.0, "grad_norm": 0.25074309060021294, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 88849622.0, "reward": 0.666015625, "reward_std": 0.4544737935066223, "rewards/execution_accuracy_EX/mean": 0.6484375, "rewards/execution_accuracy_EX/std": 0.47839346528053284, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9900903701782227, "sampling/importance_sampling_ratio/min": 0.011154396459460258, "sampling/sampling_logp_difference/max": 4.495921611785889, "sampling/sampling_logp_difference/mean": 0.13839155435562134, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 295.3515625, "completions/mean_terminated_length": 295.3515625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.17940758354961872, "epoch": 0.3238938053097345, "frac_reward_zero_std": 0.0, "grad_norm": 0.3033041423804513, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 89208944.0, "reward": 0.8070312738418579, "reward_std": 0.38295724987983704, "rewards/execution_accuracy_EX/mean": 0.796875, "rewards/execution_accuracy_EX/std": 0.40311288833618164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9876235723495483, "sampling/importance_sampling_ratio/min": 0.011136556044220924, "sampling/sampling_logp_difference/max": 4.497522354125977, "sampling/sampling_logp_difference/mean": 0.140605166554451, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 349.61328125, "completions/mean_terminated_length": 349.61328125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.18564481288194656, "epoch": 0.3256637168141593, "frac_reward_zero_std": 0.0, "grad_norm": 0.13190575739360055, "learning_rate": 1e-06, "loss": 0.003, "num_tokens": 89806845.0, "reward": 0.48417967557907104, "reward_std": 0.4741697609424591, "rewards/execution_accuracy_EX/mean": 0.45703125, "rewards/execution_accuracy_EX/std": 0.4991260766983032, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9901508092880249, "sampling/importance_sampling_ratio/min": 0.008705098181962967, "sampling/sampling_logp_difference/max": 4.743846416473389, "sampling/sampling_logp_difference/mean": 0.13562259078025818, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2023.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 327.1015625, "completions/mean_terminated_length": 327.1015625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.18473662436008453, "epoch": 0.3274336283185841, "frac_reward_zero_std": 0.0, "grad_norm": 0.17540688305886065, "learning_rate": 1e-06, "loss": 0.002, "num_tokens": 90277463.0, "reward": 0.6103515625, "reward_std": 0.46818408370018005, "rewards/execution_accuracy_EX/mean": 0.58984375, "rewards/execution_accuracy_EX/std": 0.49282538890838623, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9885948300361633, "sampling/importance_sampling_ratio/min": 0.011154396459460258, "sampling/sampling_logp_difference/max": 4.495921611785889, "sampling/sampling_logp_difference/mean": 0.13864898681640625, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 321.01953125, "completions/mean_terminated_length": 321.01953125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.1843369174748659, "epoch": 0.3292035398230089, "frac_reward_zero_std": 0.0, "grad_norm": 0.25954323312459404, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 90832780.0, "reward": 0.5101562738418579, "reward_std": 0.4756980240345001, "rewards/execution_accuracy_EX/mean": 0.484375, "rewards/execution_accuracy_EX/std": 0.5007347464561462, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9888851642608643, "sampling/importance_sampling_ratio/min": 0.008679230697453022, "sampling/sampling_logp_difference/max": 4.746822357177734, "sampling/sampling_logp_difference/mean": 0.13978828489780426, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 315.40234375, "completions/mean_terminated_length": 315.40234375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.18486985377967358, "epoch": 0.3309734513274336, "frac_reward_zero_std": 0.0, "grad_norm": 0.4176601204881519, "learning_rate": 1e-06, "loss": -0.0059, "num_tokens": 91302851.0, "reward": 0.5101562738418579, "reward_std": 0.4756980240345001, "rewards/execution_accuracy_EX/mean": 0.484375, "rewards/execution_accuracy_EX/std": 0.5007347464561462, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9875400066375732, "sampling/importance_sampling_ratio/min": 0.014315781183540821, "sampling/sampling_logp_difference/max": 4.246392726898193, "sampling/sampling_logp_difference/mean": 0.14399956166744232, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 279.3984375, "completions/mean_terminated_length": 279.3984375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.17375775426626205, "epoch": 0.3327433628318584, "frac_reward_zero_std": 0.0, "grad_norm": 0.3199652743452319, "learning_rate": 1e-06, "loss": 0.005, "num_tokens": 92033257.0, "reward": 0.7291015386581421, "reward_std": 0.4297545552253723, "rewards/execution_accuracy_EX/mean": 0.71484375, "rewards/execution_accuracy_EX/std": 0.4523732364177704, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9862713813781738, "sampling/importance_sampling_ratio/min": 0.014322636649012566, "sampling/sampling_logp_difference/max": 4.245913982391357, "sampling/sampling_logp_difference/mean": 0.13989365100860596, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 316.79296875, "completions/mean_terminated_length": 316.79296875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.18547409027814865, "epoch": 0.3345132743362832, "frac_reward_zero_std": 0.0, "grad_norm": 0.25144295439565795, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 92600740.0, "reward": 0.6214843392372131, "reward_std": 0.46600863337516785, "rewards/execution_accuracy_EX/mean": 0.6015625, "rewards/execution_accuracy_EX/std": 0.4905354380607605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.988899290561676, "sampling/importance_sampling_ratio/min": 0.005253662820905447, "sampling/sampling_logp_difference/max": 5.2488298416137695, "sampling/sampling_logp_difference/mean": 0.14020931720733643, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 318.97265625, "completions/mean_terminated_length": 318.97265625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.1898004561662674, "epoch": 0.336283185840708, "frac_reward_zero_std": 0.0, "grad_norm": 0.23477650025521635, "learning_rate": 1e-06, "loss": 0.005, "num_tokens": 92979277.0, "reward": 0.4990234375, "reward_std": 0.4752182364463806, "rewards/execution_accuracy_EX/mean": 0.47265625, "rewards/execution_accuracy_EX/std": 0.5002297759056091, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9899483919143677, "sampling/importance_sampling_ratio/min": 0.011142679490149021, "sampling/sampling_logp_difference/max": 4.496972560882568, "sampling/sampling_logp_difference/mean": 0.13974857330322266, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 332.5703125, "completions/mean_terminated_length": 332.5703125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.1874419655650854, "epoch": 0.3380530973451327, "frac_reward_zero_std": 0.0, "grad_norm": 0.3249122649226148, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 93469487.0, "reward": 0.40253907442092896, "reward_std": 0.4598415791988373, "rewards/execution_accuracy_EX/mean": 0.37109375, "rewards/execution_accuracy_EX/std": 0.48404383659362793, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9894042015075684, "sampling/importance_sampling_ratio/min": 0.01430966705083847, "sampling/sampling_logp_difference/max": 4.246819972991943, "sampling/sampling_logp_difference/mean": 0.1404990702867508, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 298.58203125, "completions/mean_terminated_length": 298.58203125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.17528344877064228, "epoch": 0.3398230088495575, "frac_reward_zero_std": 0.0, "grad_norm": 0.3052954277647955, "learning_rate": 1e-06, "loss": 0.0036, "num_tokens": 93908388.0, "reward": 0.532421886920929, "reward_std": 0.47587236762046814, "rewards/execution_accuracy_EX/mean": 0.5078125, "rewards/execution_accuracy_EX/std": 0.5009182691574097, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9889025688171387, "sampling/importance_sampling_ratio/min": 0.018411943688988686, "sampling/sampling_logp_difference/max": 3.994755744934082, "sampling/sampling_logp_difference/mean": 0.1326378434896469, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 303.515625, "completions/mean_terminated_length": 303.515625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.18384411185979843, "epoch": 0.3415929203539823, "frac_reward_zero_std": 0.0, "grad_norm": 0.21229100105606544, "learning_rate": 1e-06, "loss": 0.0036, "num_tokens": 94443736.0, "reward": 0.5249999761581421, "reward_std": 0.47593042254447937, "rewards/execution_accuracy_EX/mean": 0.5, "rewards/execution_accuracy_EX/std": 0.5009794235229492, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9866931438446045, "sampling/importance_sampling_ratio/min": 0.006754668429493904, "sampling/sampling_logp_difference/max": 4.99752140045166, "sampling/sampling_logp_difference/mean": 0.14097186923027039, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 293.19921875, "completions/mean_terminated_length": 293.19921875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.17807519063353539, "epoch": 0.3433628318584071, "frac_reward_zero_std": 0.0, "grad_norm": 0.22216409705678244, "learning_rate": 1e-06, "loss": 0.0067, "num_tokens": 94991131.0, "reward": 0.8255859613418579, "reward_std": 0.36851534247398376, "rewards/execution_accuracy_EX/mean": 0.81640625, "rewards/execution_accuracy_EX/std": 0.387910932302475, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9888792037963867, "sampling/importance_sampling_ratio/min": 0.014794101007282734, "sampling/sampling_logp_difference/max": 4.213526725769043, "sampling/sampling_logp_difference/mean": 0.1355728954076767, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 309.8984375, "completions/mean_terminated_length": 309.8984375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.18850590474903584, "epoch": 0.34513274336283184, "frac_reward_zero_std": 0.0, "grad_norm": 0.31659579865543813, "learning_rate": 1e-06, "loss": 0.0034, "num_tokens": 95424417.0, "reward": 0.588085949420929, "reward_std": 0.4717142879962921, "rewards/execution_accuracy_EX/mean": 0.56640625, "rewards/execution_accuracy_EX/std": 0.4965413510799408, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9890519380569458, "sampling/importance_sampling_ratio/min": 0.009062101133167744, "sampling/sampling_logp_difference/max": 4.7036542892456055, "sampling/sampling_logp_difference/mean": 0.1400558054447174, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 304.72265625, "completions/mean_terminated_length": 304.72265625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.1978746559470892, "epoch": 0.34690265486725663, "frac_reward_zero_std": 0.0, "grad_norm": 0.19081941531426047, "learning_rate": 1e-06, "loss": -0.0045, "num_tokens": 95891546.0, "reward": 0.5732421875, "reward_std": 0.47346949577331543, "rewards/execution_accuracy_EX/mean": 0.55078125, "rewards/execution_accuracy_EX/std": 0.49838894605636597, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9867796301841736, "sampling/importance_sampling_ratio/min": 0.014291773550212383, "sampling/sampling_logp_difference/max": 4.248071193695068, "sampling/sampling_logp_difference/mean": 0.14825952053070068, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 331.51953125, "completions/mean_terminated_length": 331.51953125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.18945630080997944, "epoch": 0.3486725663716814, "frac_reward_zero_std": 0.0, "grad_norm": 0.2720554752858141, "learning_rate": 1e-06, "loss": 0.0063, "num_tokens": 96555071.0, "reward": 0.49531248211860657, "reward_std": 0.47499996423721313, "rewards/execution_accuracy_EX/mean": 0.46875, "rewards/execution_accuracy_EX/std": 0.5, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9908668994903564, "sampling/importance_sampling_ratio/min": 0.01839049905538559, "sampling/sampling_logp_difference/max": 3.9959211349487305, "sampling/sampling_logp_difference/mean": 0.1396811604499817, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 299.14453125, "completions/mean_terminated_length": 299.14453125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.16827655397355556, "epoch": 0.3504424778761062, "frac_reward_zero_std": 0.0, "grad_norm": 0.15794168883137946, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 97219860.0, "reward": 0.569531261920929, "reward_std": 0.473834365606308, "rewards/execution_accuracy_EX/mean": 0.546875, "rewards/execution_accuracy_EX/std": 0.4987730085849762, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9841498136520386, "sampling/importance_sampling_ratio/min": 0.011208107694983482, "sampling/sampling_logp_difference/max": 4.49111795425415, "sampling/sampling_logp_difference/mean": 0.13843360543251038, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 315.87890625, "completions/mean_terminated_length": 315.87890625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.19387279450893402, "epoch": 0.35221238938053095, "frac_reward_zero_std": 0.0, "grad_norm": 0.3254534248620124, "learning_rate": 1e-06, "loss": -0.0079, "num_tokens": 97785973.0, "reward": 0.5064452886581421, "reward_std": 0.4755672216415405, "rewards/execution_accuracy_EX/mean": 0.48046875, "rewards/execution_accuracy_EX/std": 0.5005971193313599, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.989777147769928, "sampling/importance_sampling_ratio/min": 0.006762932054698467, "sampling/sampling_logp_difference/max": 4.996298789978027, "sampling/sampling_logp_difference/mean": 0.14550894498825073, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 308.40234375, "completions/mean_terminated_length": 308.40234375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.17666900530457497, "epoch": 0.35398230088495575, "frac_reward_zero_std": 0.0, "grad_norm": 0.23838401560311998, "learning_rate": 1e-06, "loss": 0.0072, "num_tokens": 98297340.0, "reward": 0.666015625, "reward_std": 0.4544737637042999, "rewards/execution_accuracy_EX/mean": 0.6484375, "rewards/execution_accuracy_EX/std": 0.47839346528053284, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9885034561157227, "sampling/importance_sampling_ratio/min": 0.005275059957057238, "sampling/sampling_logp_difference/max": 5.244765281677246, "sampling/sampling_logp_difference/mean": 0.13679823279380798, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 293.91796875, "completions/mean_terminated_length": 293.91796875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.17220030166208744, "epoch": 0.35575221238938054, "frac_reward_zero_std": 0.0, "grad_norm": 0.2635345589288893, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 98771959.0, "reward": 0.6177734136581421, "reward_std": 0.46676450967788696, "rewards/execution_accuracy_EX/mean": 0.59765625, "rewards/execution_accuracy_EX/std": 0.4913311004638672, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9877549409866333, "sampling/importance_sampling_ratio/min": 0.011232408694922924, "sampling/sampling_logp_difference/max": 4.488952159881592, "sampling/sampling_logp_difference/mean": 0.13670043647289276, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 284.97265625, "completions/mean_terminated_length": 284.97265625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.1810233872383833, "epoch": 0.35752212389380533, "frac_reward_zero_std": 0.0, "grad_norm": 0.27506573550641483, "learning_rate": 1e-06, "loss": -0.0029, "num_tokens": 99245024.0, "reward": 0.6623046398162842, "reward_std": 0.4556131064891815, "rewards/execution_accuracy_EX/mean": 0.64453125, "rewards/execution_accuracy_EX/std": 0.4795927405357361, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9841594099998474, "sampling/importance_sampling_ratio/min": 0.011136534623801708, "sampling/sampling_logp_difference/max": 4.497524261474609, "sampling/sampling_logp_difference/mean": 0.14774857461452484, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 290.62109375, "completions/mean_terminated_length": 290.62109375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.16862306743860245, "epoch": 0.35929203539823007, "frac_reward_zero_std": 0.0, "grad_norm": 0.22912455934549136, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 99713983.0, "reward": 0.62890625, "reward_std": 0.46440389752388, "rewards/execution_accuracy_EX/mean": 0.609375, "rewards/execution_accuracy_EX/std": 0.48884621262550354, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.956079363822937, "sampling/importance_sampling_ratio/mean": 0.9820801019668579, "sampling/importance_sampling_ratio/min": 0.014339372515678406, "sampling/sampling_logp_difference/max": 4.244746208190918, "sampling/sampling_logp_difference/mean": 0.14202579855918884, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 283.74609375, "completions/mean_terminated_length": 283.74609375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.17279032245278358, "epoch": 0.36106194690265486, "frac_reward_zero_std": 0.0, "grad_norm": 0.2651987711798711, "learning_rate": 1e-06, "loss": 0.0044, "num_tokens": 100210862.0, "reward": 0.7105468511581421, "reward_std": 0.43811774253845215, "rewards/execution_accuracy_EX/mean": 0.6953125, "rewards/execution_accuracy_EX/std": 0.4611765742301941, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9854060411453247, "sampling/importance_sampling_ratio/min": 0.011183853261172771, "sampling/sampling_logp_difference/max": 4.493284225463867, "sampling/sampling_logp_difference/mean": 0.1415708065032959, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 286.84375, "completions/mean_terminated_length": 286.84375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.1592701580375433, "epoch": 0.36283185840707965, "frac_reward_zero_std": 0.0, "grad_norm": 0.3158506886213264, "learning_rate": 1e-06, "loss": -0.0053, "num_tokens": 100524550.0, "reward": 0.7810547351837158, "reward_std": 0.40085992217063904, "rewards/execution_accuracy_EX/mean": 0.76953125, "rewards/execution_accuracy_EX/std": 0.4219578504562378, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9852101802825928, "sampling/importance_sampling_ratio/min": 0.014291773550212383, "sampling/sampling_logp_difference/max": 4.248071193695068, "sampling/sampling_logp_difference/mean": 0.13460731506347656, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 305.68359375, "completions/mean_terminated_length": 305.68359375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.16670113801956177, "epoch": 0.36460176991150445, "frac_reward_zero_std": 0.0, "grad_norm": 0.25008339032931437, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 100935333.0, "reward": 0.6474609375, "reward_std": 0.4598415791988373, "rewards/execution_accuracy_EX/mean": 0.62890625, "rewards/execution_accuracy_EX/std": 0.48404383659362793, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9877009391784668, "sampling/importance_sampling_ratio/min": 0.008668432012200356, "sampling/sampling_logp_difference/max": 4.748067378997803, "sampling/sampling_logp_difference/mean": 0.13257518410682678, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 280.5390625, "completions/mean_terminated_length": 280.5390625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.16295744851231575, "epoch": 0.3663716814159292, "frac_reward_zero_std": 0.0, "grad_norm": 0.1806119419910666, "learning_rate": 1e-06, "loss": -0.0038, "num_tokens": 101529167.0, "reward": 0.5806640386581421, "reward_std": 0.4726512134075165, "rewards/execution_accuracy_EX/mean": 0.55859375, "rewards/execution_accuracy_EX/std": 0.4975275993347168, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.982230544090271, "sampling/importance_sampling_ratio/min": 0.011154789477586746, "sampling/sampling_logp_difference/max": 4.495886325836182, "sampling/sampling_logp_difference/mean": 0.13882498443126678, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 297.73046875, "completions/mean_terminated_length": 297.73046875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.15727944858372211, "epoch": 0.368141592920354, "frac_reward_zero_std": 0.0, "grad_norm": 0.22748012679787208, "learning_rate": 1e-06, "loss": -0.0041, "num_tokens": 101903258.0, "reward": 0.5064452886581421, "reward_std": 0.47556719183921814, "rewards/execution_accuracy_EX/mean": 0.48046875, "rewards/execution_accuracy_EX/std": 0.5005971193313599, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9867826700210571, "sampling/importance_sampling_ratio/min": 0.011182126589119434, "sampling/sampling_logp_difference/max": 4.493438720703125, "sampling/sampling_logp_difference/mean": 0.12806008756160736, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 295.12890625, "completions/mean_terminated_length": 295.12890625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.16728811338543892, "epoch": 0.36991150442477877, "frac_reward_zero_std": 0.0, "grad_norm": 0.36207057041573587, "learning_rate": 1e-06, "loss": 0.0088, "num_tokens": 102396491.0, "reward": 0.5546875, "reward_std": 0.4749999940395355, "rewards/execution_accuracy_EX/mean": 0.53125, "rewards/execution_accuracy_EX/std": 0.5, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9869174957275391, "sampling/importance_sampling_ratio/min": 1.6849011444719508e-05, "sampling/sampling_logp_difference/max": 10.991218566894531, "sampling/sampling_logp_difference/mean": 0.13170340657234192, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 311.76953125, "completions/mean_terminated_length": 311.76953125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.16339107044041157, "epoch": 0.37168141592920356, "frac_reward_zero_std": 0.0, "grad_norm": 0.24713288062062294, "learning_rate": 1e-06, "loss": 0.0034, "num_tokens": 102701504.0, "reward": 0.5992187261581421, "reward_std": 0.47008490562438965, "rewards/execution_accuracy_EX/mean": 0.578125, "rewards/execution_accuracy_EX/std": 0.49482619762420654, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9879264831542969, "sampling/importance_sampling_ratio/min": 0.008668403141200542, "sampling/sampling_logp_difference/max": 4.74807071685791, "sampling/sampling_logp_difference/mean": 0.13112956285476685, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 309.16015625, "completions/mean_terminated_length": 309.16015625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.1704132743179798, "epoch": 0.3734513274336283, "frac_reward_zero_std": 0.0, "grad_norm": 0.20578594883163423, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 103171033.0, "reward": 0.6400390863418579, "reward_std": 0.4617617428302765, "rewards/execution_accuracy_EX/mean": 0.62109375, "rewards/execution_accuracy_EX/std": 0.4860650300979614, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9840466976165771, "sampling/importance_sampling_ratio/min": 0.011193334124982357, "sampling/sampling_logp_difference/max": 4.49243688583374, "sampling/sampling_logp_difference/mean": 0.13917119801044464, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 293.46484375, "completions/mean_terminated_length": 293.46484375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.16136296652257442, "epoch": 0.3752212389380531, "frac_reward_zero_std": 0.0, "grad_norm": 0.37329204569315916, "learning_rate": 1e-06, "loss": -0.0038, "num_tokens": 103767616.0, "reward": 0.699414074420929, "reward_std": 0.4426852762699127, "rewards/execution_accuracy_EX/mean": 0.68359375, "rewards/execution_accuracy_EX/std": 0.4659844934940338, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9859615564346313, "sampling/importance_sampling_ratio/min": 0.005257649812847376, "sampling/sampling_logp_difference/max": 5.248071193695068, "sampling/sampling_logp_difference/mean": 0.1347169727087021, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 293.265625, "completions/mean_terminated_length": 293.265625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.1644913423806429, "epoch": 0.3769911504424779, "frac_reward_zero_std": 0.0, "grad_norm": 0.2170555057496567, "learning_rate": 1e-06, "loss": -0.0055, "num_tokens": 104458020.0, "reward": 0.36542969942092896, "reward_std": 0.44827139377593994, "rewards/execution_accuracy_EX/mean": 0.33203125, "rewards/execution_accuracy_EX/std": 0.4718646705150604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9585728645324707, "sampling/importance_sampling_ratio/mean": 0.9880084991455078, "sampling/importance_sampling_ratio/min": 0.01430963259190321, "sampling/sampling_logp_difference/max": 4.246822357177734, "sampling/sampling_logp_difference/mean": 0.1330907642841339, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 306.6484375, "completions/mean_terminated_length": 306.6484375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.15919774770736694, "epoch": 0.3787610619469027, "frac_reward_zero_std": 0.0, "grad_norm": 0.30691417661315207, "learning_rate": 1e-06, "loss": -0.0017, "num_tokens": 104781370.0, "reward": 0.606640636920929, "reward_std": 0.46884801983833313, "rewards/execution_accuracy_EX/mean": 0.5859375, "rewards/execution_accuracy_EX/std": 0.4935242533683777, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9755750894546509, "sampling/importance_sampling_ratio/mean": 0.9810018539428711, "sampling/importance_sampling_ratio/min": 0.01113677304238081, "sampling/sampling_logp_difference/max": 4.49750280380249, "sampling/sampling_logp_difference/mean": 0.1426813304424286, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 292.5, "completions/mean_terminated_length": 292.5, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.15499872714281082, "epoch": 0.3805309734513274, "frac_reward_zero_std": 0.0, "grad_norm": 0.19549322050060405, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 105198698.0, "reward": 0.643750011920929, "reward_std": 0.46081769466400146, "rewards/execution_accuracy_EX/mean": 0.625, "rewards/execution_accuracy_EX/std": 0.4850712716579437, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.982332170009613, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.1391640603542328, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 288.47265625, "completions/mean_terminated_length": 288.47265625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.15948248468339443, "epoch": 0.3823008849557522, "frac_reward_zero_std": 0.0, "grad_norm": 0.3898832423180779, "learning_rate": 1e-06, "loss": -0.0117, "num_tokens": 105588195.0, "reward": 0.5658203363418579, "reward_std": 0.4741697609424591, "rewards/execution_accuracy_EX/mean": 0.54296875, "rewards/execution_accuracy_EX/std": 0.4991260766983032, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9816546440124512, "sampling/importance_sampling_ratio/min": 0.01118500530719757, "sampling/sampling_logp_difference/max": 4.493181228637695, "sampling/sampling_logp_difference/mean": 0.13936170935630798, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 300.71484375, "completions/mean_terminated_length": 300.71484375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.14708868972957134, "epoch": 0.384070796460177, "frac_reward_zero_std": 0.0, "grad_norm": 0.21553840642650957, "learning_rate": 1e-06, "loss": 0.002, "num_tokens": 106114490.0, "reward": 0.725390613079071, "reward_std": 0.4315042495727539, "rewards/execution_accuracy_EX/mean": 0.7109375, "rewards/execution_accuracy_EX/std": 0.45421501994132996, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9822921752929688, "sampling/importance_sampling_ratio/min": 0.01114028412848711, "sampling/sampling_logp_difference/max": 4.497187614440918, "sampling/sampling_logp_difference/mean": 0.1331436038017273, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 315.27734375, "completions/mean_terminated_length": 315.27734375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.1578066125512123, "epoch": 0.3858407079646018, "frac_reward_zero_std": 0.0, "grad_norm": 0.42582165404524236, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 106655121.0, "reward": 0.6177734136581421, "reward_std": 0.46676453948020935, "rewards/execution_accuracy_EX/mean": 0.59765625, "rewards/execution_accuracy_EX/std": 0.4913311004638672, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9849480390548706, "sampling/importance_sampling_ratio/min": 0.011137150228023529, "sampling/sampling_logp_difference/max": 4.497468948364258, "sampling/sampling_logp_difference/mean": 0.1338350921869278, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 291.64453125, "completions/mean_terminated_length": 291.64453125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.17081346735358238, "epoch": 0.38761061946902653, "frac_reward_zero_std": 0.0, "grad_norm": 0.25850475824639435, "learning_rate": 1e-06, "loss": -0.0032, "num_tokens": 107048630.0, "reward": 0.569531261920929, "reward_std": 0.4738343358039856, "rewards/execution_accuracy_EX/mean": 0.546875, "rewards/execution_accuracy_EX/std": 0.4987730085849762, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9837124943733215, "sampling/importance_sampling_ratio/min": 0.005275054834783077, "sampling/sampling_logp_difference/max": 5.2447662353515625, "sampling/sampling_logp_difference/mean": 0.14238256216049194, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 323.19921875, "completions/mean_terminated_length": 323.19921875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.15954815410077572, "epoch": 0.3893805309734513, "frac_reward_zero_std": 0.0, "grad_norm": 0.2405263523095734, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 107513225.0, "reward": 0.6585937738418579, "reward_std": 0.4567192792892456, "rewards/execution_accuracy_EX/mean": 0.640625, "rewards/execution_accuracy_EX/std": 0.4807571768760681, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9837608337402344, "sampling/importance_sampling_ratio/min": 0.001958009321242571, "sampling/sampling_logp_difference/max": 6.2358269691467285, "sampling/sampling_logp_difference/mean": 0.14068523049354553, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 312.58984375, "completions/mean_terminated_length": 312.58984375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.161805747076869, "epoch": 0.3911504424778761, "frac_reward_zero_std": 0.0, "grad_norm": 0.2756924935552442, "learning_rate": 1e-06, "loss": 0.0129, "num_tokens": 107975808.0, "reward": 0.5472656488418579, "reward_std": 0.47540730237960815, "rewards/execution_accuracy_EX/mean": 0.5234375, "rewards/execution_accuracy_EX/std": 0.5004287362098694, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9832560420036316, "sampling/importance_sampling_ratio/min": 0.008837749250233173, "sampling/sampling_logp_difference/max": 4.728723049163818, "sampling/sampling_logp_difference/mean": 0.1391669511795044, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 295.43359375, "completions/mean_terminated_length": 295.43359375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.15584200993180275, "epoch": 0.3929203539823009, "frac_reward_zero_std": 0.0, "grad_norm": 0.30114187415305094, "learning_rate": 1e-06, "loss": -0.003, "num_tokens": 108476511.0, "reward": 0.5732421875, "reward_std": 0.47346949577331543, "rewards/execution_accuracy_EX/mean": 0.55078125, "rewards/execution_accuracy_EX/std": 0.49838894605636597, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9799416661262512, "sampling/importance_sampling_ratio/min": 0.005254166200757027, "sampling/sampling_logp_difference/max": 5.248733997344971, "sampling/sampling_logp_difference/mean": 0.14466512203216553, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 319.4296875, "completions/mean_terminated_length": 319.4296875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.15634971857070923, "epoch": 0.39469026548672564, "frac_reward_zero_std": 0.0, "grad_norm": 0.201304991818416, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 109067853.0, "reward": 0.4990234375, "reward_std": 0.4752182364463806, "rewards/execution_accuracy_EX/mean": 0.47265625, "rewards/execution_accuracy_EX/std": 0.5002297759056091, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9853391647338867, "sampling/importance_sampling_ratio/min": 0.008726605214178562, "sampling/sampling_logp_difference/max": 4.7413787841796875, "sampling/sampling_logp_difference/mean": 0.13357853889465332, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 327.59375, "completions/mean_terminated_length": 327.59375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.16013413295149803, "epoch": 0.39646017699115044, "frac_reward_zero_std": 0.0, "grad_norm": 0.3197331562916165, "learning_rate": 1e-06, "loss": -0.0021, "num_tokens": 109578021.0, "reward": 0.5138671398162842, "reward_std": 0.4757997393608093, "rewards/execution_accuracy_EX/mean": 0.48828125, "rewards/execution_accuracy_EX/std": 0.5008418560028076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9839286804199219, "sampling/importance_sampling_ratio/min": 0.014280935749411583, "sampling/sampling_logp_difference/max": 4.2488298416137695, "sampling/sampling_logp_difference/mean": 0.1407632827758789, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 343.03125, "completions/mean_terminated_length": 343.03125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.1597651895135641, "epoch": 0.39823008849557523, "frac_reward_zero_std": 0.0, "grad_norm": 0.2844288120097009, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 109993165.0, "reward": 0.5361328125, "reward_std": 0.4757997393608093, "rewards/execution_accuracy_EX/mean": 0.51171875, "rewards/execution_accuracy_EX/std": 0.5008418560028076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9862627387046814, "sampling/importance_sampling_ratio/min": 0.0032341775950044394, "sampling/sampling_logp_difference/max": 5.733980655670166, "sampling/sampling_logp_difference/mean": 0.13292866945266724, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 311.453125, "completions/mean_terminated_length": 311.453125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.16206582821905613, "epoch": 0.4, "frac_reward_zero_std": 0.0, "grad_norm": 0.28634636089839877, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 110330561.0, "reward": 0.443359375, "reward_std": 0.46884801983833313, "rewards/execution_accuracy_EX/mean": 0.4140625, "rewards/execution_accuracy_EX/std": 0.4935242533683777, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9361305236816406, "sampling/importance_sampling_ratio/mean": 0.981826663017273, "sampling/importance_sampling_ratio/min": 0.011188104748725891, "sampling/sampling_logp_difference/max": 4.492904186248779, "sampling/sampling_logp_difference/mean": 0.14049078524112701, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 357.38671875, "completions/mean_terminated_length": 357.38671875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.17608636245131493, "epoch": 0.40176991150442476, "frac_reward_zero_std": 0.0, "grad_norm": 0.395427305686896, "learning_rate": 1e-06, "loss": -0.0079, "num_tokens": 110757860.0, "reward": 0.6177734732627869, "reward_std": 0.46676450967788696, "rewards/execution_accuracy_EX/mean": 0.59765625, "rewards/execution_accuracy_EX/std": 0.4913311004638672, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9860670566558838, "sampling/importance_sampling_ratio/min": 0.011154396459460258, "sampling/sampling_logp_difference/max": 4.495921611785889, "sampling/sampling_logp_difference/mean": 0.1404687464237213, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 315.37890625, "completions/mean_terminated_length": 315.37890625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.16146675869822502, "epoch": 0.40353982300884955, "frac_reward_zero_std": 0.0, "grad_norm": 0.2997694980144929, "learning_rate": 1e-06, "loss": -0.0102, "num_tokens": 111470133.0, "reward": 0.6623046398162842, "reward_std": 0.4556131064891815, "rewards/execution_accuracy_EX/mean": 0.64453125, "rewards/execution_accuracy_EX/std": 0.4795927405357361, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9435782432556152, "sampling/importance_sampling_ratio/mean": 0.9826586246490479, "sampling/importance_sampling_ratio/min": 0.0015113846166059375, "sampling/sampling_logp_difference/max": 6.494729042053223, "sampling/sampling_logp_difference/mean": 0.14155453443527222, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 327.3828125, "completions/mean_terminated_length": 327.3828125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.17297273315489292, "epoch": 0.40530973451327434, "frac_reward_zero_std": 0.0, "grad_norm": 0.20821698597680982, "learning_rate": 1e-06, "loss": -0.0047, "num_tokens": 111948439.0, "reward": 0.6623046398162842, "reward_std": 0.4556131064891815, "rewards/execution_accuracy_EX/mean": 0.64453125, "rewards/execution_accuracy_EX/std": 0.4795927405357361, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9845682382583618, "sampling/importance_sampling_ratio/min": 0.0067481910809874535, "sampling/sampling_logp_difference/max": 4.998480796813965, "sampling/sampling_logp_difference/mean": 0.14390261471271515, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 347.421875, "completions/mean_terminated_length": 347.421875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.17069001123309135, "epoch": 0.40707964601769914, "frac_reward_zero_std": 0.0, "grad_norm": 0.28389833200173975, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 112647955.0, "reward": 0.6548827886581421, "reward_std": 0.45779263973236084, "rewards/execution_accuracy_EX/mean": 0.63671875, "rewards/execution_accuracy_EX/std": 0.48188701272010803, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.985714316368103, "sampling/importance_sampling_ratio/min": 0.005253662820905447, "sampling/sampling_logp_difference/max": 5.2488298416137695, "sampling/sampling_logp_difference/mean": 0.1376417577266693, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 329.04296875, "completions/mean_terminated_length": 329.04296875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.1747161727398634, "epoch": 0.4088495575221239, "frac_reward_zero_std": 0.0, "grad_norm": 0.3246365694686947, "learning_rate": 1e-06, "loss": 0.003, "num_tokens": 113204430.0, "reward": 0.773632824420929, "reward_std": 0.40552324056625366, "rewards/execution_accuracy_EX/mean": 0.76171875, "rewards/execution_accuracy_EX/std": 0.4268665909767151, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9791052341461182, "sampling/importance_sampling_ratio/mean": 0.9838868975639343, "sampling/importance_sampling_ratio/min": 0.004096913617104292, "sampling/sampling_logp_difference/max": 5.49752140045166, "sampling/sampling_logp_difference/mean": 0.14427624642848969, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 336.01953125, "completions/mean_terminated_length": 336.01953125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.18153868429362774, "epoch": 0.41061946902654867, "frac_reward_zero_std": 0.0, "grad_norm": 0.15723601636185833, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 113646803.0, "reward": 0.6214843392372131, "reward_std": 0.46600863337516785, "rewards/execution_accuracy_EX/mean": 0.6015625, "rewards/execution_accuracy_EX/std": 0.4905354380607605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9836591482162476, "sampling/importance_sampling_ratio/min": 0.018361039459705353, "sampling/sampling_logp_difference/max": 3.9975242614746094, "sampling/sampling_logp_difference/mean": 0.1475595384836197, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 327.80859375, "completions/mean_terminated_length": 327.80859375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.18472612835466862, "epoch": 0.41238938053097346, "frac_reward_zero_std": 0.0, "grad_norm": 0.24229994102963986, "learning_rate": 1e-06, "loss": 0.002, "num_tokens": 114200626.0, "reward": 0.517578125, "reward_std": 0.47587236762046814, "rewards/execution_accuracy_EX/mean": 0.4921875, "rewards/execution_accuracy_EX/std": 0.5009182691574097, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9879395365715027, "sampling/importance_sampling_ratio/min": 0.011183853261172771, "sampling/sampling_logp_difference/max": 4.493284225463867, "sampling/sampling_logp_difference/mean": 0.14690710604190826, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 350.03515625, "completions/mean_terminated_length": 350.03515625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.19433023780584335, "epoch": 0.41415929203539825, "frac_reward_zero_std": 0.0, "grad_norm": 0.39571142053642266, "learning_rate": 1e-06, "loss": 0.0063, "num_tokens": 114684171.0, "reward": 0.5101562738418579, "reward_std": 0.4756980240345001, "rewards/execution_accuracy_EX/mean": 0.484375, "rewards/execution_accuracy_EX/std": 0.5007347464561462, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9859143495559692, "sampling/importance_sampling_ratio/min": 0.011154402047395706, "sampling/sampling_logp_difference/max": 4.4959211349487305, "sampling/sampling_logp_difference/mean": 0.1544232964515686, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 320.578125, "completions/mean_terminated_length": 320.578125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.18371405452489853, "epoch": 0.415929203539823, "frac_reward_zero_std": 0.0, "grad_norm": 0.41757357450018373, "learning_rate": 1e-06, "loss": 0.0085, "num_tokens": 115064591.0, "reward": 0.7365233898162842, "reward_std": 0.426136314868927, "rewards/execution_accuracy_EX/mean": 0.72265625, "rewards/execution_accuracy_EX/std": 0.4485645890235901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9850724935531616, "sampling/importance_sampling_ratio/min": 0.014291773550212383, "sampling/sampling_logp_difference/max": 4.248071193695068, "sampling/sampling_logp_difference/mean": 0.1499912142753601, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 332.62890625, "completions/mean_terminated_length": 332.62890625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.17574547603726387, "epoch": 0.4176991150442478, "frac_reward_zero_std": 0.0, "grad_norm": 0.23931763668179823, "learning_rate": 1e-06, "loss": -0.0041, "num_tokens": 115538416.0, "reward": 0.44707033038139343, "reward_std": 0.46948155760765076, "rewards/execution_accuracy_EX/mean": 0.41796875, "rewards/execution_accuracy_EX/std": 0.49419113993644714, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9719640016555786, "sampling/importance_sampling_ratio/mean": 0.9866256713867188, "sampling/importance_sampling_ratio/min": 0.011183848604559898, "sampling/sampling_logp_difference/max": 4.493284702301025, "sampling/sampling_logp_difference/mean": 0.13938941061496735, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 311.1875, "completions/mean_terminated_length": 311.1875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.17596963420510292, "epoch": 0.4194690265486726, "frac_reward_zero_std": 0.0, "grad_norm": 0.27855657077561136, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 116191104.0, "reward": 0.591796875, "reward_std": 0.471201092004776, "rewards/execution_accuracy_EX/mean": 0.5703125, "rewards/execution_accuracy_EX/std": 0.4960011839866638, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9817546606063843, "sampling/importance_sampling_ratio/min": 0.008679230697453022, "sampling/sampling_logp_difference/max": 4.746822357177734, "sampling/sampling_logp_difference/mean": 0.15246939659118652, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 333.82421875, "completions/mean_terminated_length": 333.82421875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.17818690091371536, "epoch": 0.42123893805309737, "frac_reward_zero_std": 0.0, "grad_norm": 0.22771774066606904, "learning_rate": 1e-06, "loss": 0.0062, "num_tokens": 116660739.0, "reward": 0.5732421875, "reward_std": 0.47346949577331543, "rewards/execution_accuracy_EX/mean": 0.55078125, "rewards/execution_accuracy_EX/std": 0.49838894605636597, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9877098798751831, "sampling/importance_sampling_ratio/min": 0.011156401596963406, "sampling/sampling_logp_difference/max": 4.495741844177246, "sampling/sampling_logp_difference/mean": 0.14342039823532104, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 351.83984375, "completions/mean_terminated_length": 351.83984375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.18319394811987877, "epoch": 0.4230088495575221, "frac_reward_zero_std": 0.0, "grad_norm": 0.22092969991800032, "learning_rate": 1e-06, "loss": -0.0019, "num_tokens": 117055402.0, "reward": 0.4730468690395355, "reward_std": 0.47307512164115906, "rewards/execution_accuracy_EX/mean": 0.4453125, "rewards/execution_accuracy_EX/std": 0.49797385931015015, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9876081943511963, "sampling/importance_sampling_ratio/min": 0.01430963259190321, "sampling/sampling_logp_difference/max": 4.246822357177734, "sampling/sampling_logp_difference/mean": 0.1455935686826706, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 287.22265625, "completions/mean_terminated_length": 287.22265625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.182206267490983, "epoch": 0.4247787610619469, "frac_reward_zero_std": 0.0, "grad_norm": 0.3294145446674225, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 117603347.0, "reward": 0.5695312023162842, "reward_std": 0.4738343358039856, "rewards/execution_accuracy_EX/mean": 0.546875, "rewards/execution_accuracy_EX/std": 0.4987730085849762, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9850952625274658, "sampling/importance_sampling_ratio/min": 0.011199885047972202, "sampling/sampling_logp_difference/max": 4.491851806640625, "sampling/sampling_logp_difference/mean": 0.14532500505447388, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 341.5859375, "completions/mean_terminated_length": 341.5859375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.17782233096659184, "epoch": 0.4265486725663717, "frac_reward_zero_std": 0.0, "grad_norm": 0.38340099039298753, "learning_rate": 1e-06, "loss": 0.0125, "num_tokens": 118058649.0, "reward": 0.6029296517372131, "reward_std": 0.46948158740997314, "rewards/execution_accuracy_EX/mean": 0.58203125, "rewards/execution_accuracy_EX/std": 0.49419113993644714, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9879711866378784, "sampling/importance_sampling_ratio/min": 0.011154710315167904, "sampling/sampling_logp_difference/max": 4.495893478393555, "sampling/sampling_logp_difference/mean": 0.13813140988349915, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 297.38671875, "completions/mean_terminated_length": 297.38671875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.1814308688044548, "epoch": 0.4283185840707965, "frac_reward_zero_std": 0.0, "grad_norm": 0.31709294912088215, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 118582044.0, "reward": 0.6771484017372131, "reward_std": 0.4508545994758606, "rewards/execution_accuracy_EX/mean": 0.66015625, "rewards/execution_accuracy_EX/std": 0.47458380460739136, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9851583242416382, "sampling/importance_sampling_ratio/min": 0.010942353866994381, "sampling/sampling_logp_difference/max": 4.5151143074035645, "sampling/sampling_logp_difference/mean": 0.14765186607837677, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 333.078125, "completions/mean_terminated_length": 333.078125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.19774733111262321, "epoch": 0.4300884955752212, "frac_reward_zero_std": 0.0, "grad_norm": 0.36169428919513547, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 119040768.0, "reward": 0.6177734136581421, "reward_std": 0.46676453948020935, "rewards/execution_accuracy_EX/mean": 0.59765625, "rewards/execution_accuracy_EX/std": 0.4913311004638672, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9877321124076843, "sampling/importance_sampling_ratio/min": 0.003735928563401103, "sampling/sampling_logp_difference/max": 5.58975887298584, "sampling/sampling_logp_difference/mean": 0.1538747102022171, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 316.59765625, "completions/mean_terminated_length": 316.59765625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.18043517507612705, "epoch": 0.431858407079646, "frac_reward_zero_std": 0.0, "grad_norm": 0.17978239581415803, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 119574377.0, "reward": 0.6363281011581421, "reward_std": 0.46267399191856384, "rewards/execution_accuracy_EX/mean": 0.6171875, "rewards/execution_accuracy_EX/std": 0.48702529072761536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9867154359817505, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.14469178020954132, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 322.3125, "completions/mean_terminated_length": 322.3125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.18301732279360294, "epoch": 0.4336283185840708, "frac_reward_zero_std": 0.0, "grad_norm": 0.2973983629660076, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 120106969.0, "reward": 0.6177734136581421, "reward_std": 0.46676453948020935, "rewards/execution_accuracy_EX/mean": 0.59765625, "rewards/execution_accuracy_EX/std": 0.4913311004638672, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9860332012176514, "sampling/importance_sampling_ratio/min": 0.006767266895622015, "sampling/sampling_logp_difference/max": 4.995657920837402, "sampling/sampling_logp_difference/mean": 0.14499324560165405, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 285.00390625, "completions/mean_terminated_length": 285.00390625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.16773789562284946, "epoch": 0.4353982300884956, "frac_reward_zero_std": 0.0, "grad_norm": 0.25450630742592706, "learning_rate": 1e-06, "loss": 0.0011, "num_tokens": 120518362.0, "reward": 0.740234375, "reward_std": 0.4242667853832245, "rewards/execution_accuracy_EX/mean": 0.7265625, "rewards/execution_accuracy_EX/std": 0.446596622467041, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9257066249847412, "sampling/importance_sampling_ratio/mean": 0.9835507869720459, "sampling/importance_sampling_ratio/min": 0.014291773550212383, "sampling/sampling_logp_difference/max": 4.248071193695068, "sampling/sampling_logp_difference/mean": 0.14187446236610413, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 321.71484375, "completions/mean_terminated_length": 321.71484375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.179510110989213, "epoch": 0.43716814159292033, "frac_reward_zero_std": 0.0, "grad_norm": 0.21790185822960076, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 121005025.0, "reward": 0.49160152673721313, "reward_std": 0.4747525155544281, "rewards/execution_accuracy_EX/mean": 0.46484375, "rewards/execution_accuracy_EX/std": 0.49973952770233154, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9858831167221069, "sampling/importance_sampling_ratio/min": 0.01842484436929226, "sampling/sampling_logp_difference/max": 3.9940552711486816, "sampling/sampling_logp_difference/mean": 0.14202405512332916, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 322.28515625, "completions/mean_terminated_length": 322.28515625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.15929356217384338, "epoch": 0.4389380530973451, "frac_reward_zero_std": 0.0, "grad_norm": 0.16521585929931884, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 121496202.0, "reward": 0.595507800579071, "reward_std": 0.47065800428390503, "rewards/execution_accuracy_EX/mean": 0.57421875, "rewards/execution_accuracy_EX/std": 0.49542948603630066, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9857891798019409, "sampling/importance_sampling_ratio/min": 0.011142482981085777, "sampling/sampling_logp_difference/max": 4.496990203857422, "sampling/sampling_logp_difference/mean": 0.12880417704582214, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 353.7578125, "completions/mean_terminated_length": 353.7578125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.1773077417165041, "epoch": 0.4407079646017699, "frac_reward_zero_std": 0.0, "grad_norm": 0.19366559964761546, "learning_rate": 1e-06, "loss": -0.0011, "num_tokens": 122071564.0, "reward": 0.40625, "reward_std": 0.4608176648616791, "rewards/execution_accuracy_EX/mean": 0.375, "rewards/execution_accuracy_EX/std": 0.4850712716579437, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9875608682632446, "sampling/importance_sampling_ratio/min": 0.011136534623801708, "sampling/sampling_logp_difference/max": 4.497524261474609, "sampling/sampling_logp_difference/mean": 0.13819408416748047, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 289.19140625, "completions/mean_terminated_length": 289.19140625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.1673753820359707, "epoch": 0.4424778761061947, "frac_reward_zero_std": 0.0, "grad_norm": 0.38442505347754724, "learning_rate": 1e-06, "loss": 0.0058, "num_tokens": 122564893.0, "reward": 0.6697266101837158, "reward_std": 0.4533011019229889, "rewards/execution_accuracy_EX/mean": 0.65234375, "rewards/execution_accuracy_EX/std": 0.4771590530872345, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9876739978790283, "sampling/importance_sampling_ratio/mean": 0.9846241474151611, "sampling/importance_sampling_ratio/min": 0.014339085668325424, "sampling/sampling_logp_difference/max": 4.2447662353515625, "sampling/sampling_logp_difference/mean": 0.13869065046310425, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 304.8046875, "completions/mean_terminated_length": 304.8046875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.16497358679771423, "epoch": 0.44424778761061945, "frac_reward_zero_std": 0.0, "grad_norm": 0.27246430304229335, "learning_rate": 1e-06, "loss": -0.0024, "num_tokens": 123116107.0, "reward": 0.7216796875, "reward_std": 0.4332149624824524, "rewards/execution_accuracy_EX/mean": 0.70703125, "rewards/execution_accuracy_EX/std": 0.45601576566696167, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9840646982192993, "sampling/importance_sampling_ratio/min": 0.004092916380614042, "sampling/sampling_logp_difference/max": 5.498497486114502, "sampling/sampling_logp_difference/mean": 0.1388365626335144, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 312.69140625, "completions/mean_terminated_length": 312.69140625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.1695473212748766, "epoch": 0.44601769911504424, "frac_reward_zero_std": 0.0, "grad_norm": 0.3348062185289668, "learning_rate": 1e-06, "loss": 0.0106, "num_tokens": 123649148.0, "reward": 0.6251952648162842, "reward_std": 0.4652217924594879, "rewards/execution_accuracy_EX/mean": 0.60546875, "rewards/execution_accuracy_EX/std": 0.48970720171928406, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9839234352111816, "sampling/importance_sampling_ratio/min": 0.01440563052892685, "sampling/sampling_logp_difference/max": 4.24013614654541, "sampling/sampling_logp_difference/mean": 0.13801856338977814, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 334.0390625, "completions/mean_terminated_length": 334.0390625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.16055668145418167, "epoch": 0.44778761061946903, "frac_reward_zero_std": 0.0, "grad_norm": 0.16926424042417285, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 123986230.0, "reward": 0.625195324420929, "reward_std": 0.4652218222618103, "rewards/execution_accuracy_EX/mean": 0.60546875, "rewards/execution_accuracy_EX/std": 0.48970720171928406, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.984278678894043, "sampling/importance_sampling_ratio/min": 0.011232423596084118, "sampling/sampling_logp_difference/max": 4.488950729370117, "sampling/sampling_logp_difference/mean": 0.1337374448776245, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 307.45703125, "completions/mean_terminated_length": 307.45703125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.16157903522253036, "epoch": 0.4495575221238938, "frac_reward_zero_std": 0.0, "grad_norm": 0.26307459078243706, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 124512651.0, "reward": 0.6771484613418579, "reward_std": 0.450854629278183, "rewards/execution_accuracy_EX/mean": 0.66015625, "rewards/execution_accuracy_EX/std": 0.47458380460739136, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9835008382797241, "sampling/importance_sampling_ratio/min": 0.011125700548291206, "sampling/sampling_logp_difference/max": 4.498497486114502, "sampling/sampling_logp_difference/mean": 0.13886550068855286, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 301.69140625, "completions/mean_terminated_length": 301.69140625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.14815949089825153, "epoch": 0.45132743362831856, "frac_reward_zero_std": 0.0, "grad_norm": 0.13015455893654557, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 124877356.0, "reward": 0.5101562738418579, "reward_std": 0.4756980240345001, "rewards/execution_accuracy_EX/mean": 0.484375, "rewards/execution_accuracy_EX/std": 0.5007347464561462, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.983404278755188, "sampling/importance_sampling_ratio/min": 0.006765482947230339, "sampling/sampling_logp_difference/max": 4.995921611785889, "sampling/sampling_logp_difference/mean": 0.1309133768081665, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 322.87109375, "completions/mean_terminated_length": 322.87109375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.16214213706552982, "epoch": 0.45309734513274336, "frac_reward_zero_std": 0.0, "grad_norm": 0.32262561716499366, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 125303627.0, "reward": 0.5064452886581421, "reward_std": 0.4755672216415405, "rewards/execution_accuracy_EX/mean": 0.48046875, "rewards/execution_accuracy_EX/std": 0.5005971193313599, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9820327162742615, "sampling/importance_sampling_ratio/min": 0.01214428897947073, "sampling/sampling_logp_difference/max": 4.410896301269531, "sampling/sampling_logp_difference/mean": 0.14292088150978088, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 300.06640625, "completions/mean_terminated_length": 300.06640625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.1590729933232069, "epoch": 0.45486725663716815, "frac_reward_zero_std": 0.0, "grad_norm": 0.2181450860924923, "learning_rate": 1e-06, "loss": 0.0035, "num_tokens": 125907052.0, "reward": 0.7253906726837158, "reward_std": 0.4315042495727539, "rewards/execution_accuracy_EX/mean": 0.7109375, "rewards/execution_accuracy_EX/std": 0.45421501994132996, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9819579124450684, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.13893938064575195, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 323.55859375, "completions/mean_terminated_length": 323.55859375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.15674197487533092, "epoch": 0.45663716814159294, "frac_reward_zero_std": 0.0, "grad_norm": 0.3799954308113515, "learning_rate": 1e-06, "loss": 0.0087, "num_tokens": 126344987.0, "reward": 0.606640636920929, "reward_std": 0.46884801983833313, "rewards/execution_accuracy_EX/mean": 0.5859375, "rewards/execution_accuracy_EX/std": 0.4935242533683777, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9868935942649841, "sampling/importance_sampling_ratio/min": 0.008657840080559254, "sampling/sampling_logp_difference/max": 4.7492899894714355, "sampling/sampling_logp_difference/mean": 0.13239502906799316, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 315.70703125, "completions/mean_terminated_length": 315.70703125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.15376710519194603, "epoch": 0.4584070796460177, "frac_reward_zero_std": 0.0, "grad_norm": 0.08317564004118783, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 126751984.0, "reward": 0.6400390267372131, "reward_std": 0.4617617428302765, "rewards/execution_accuracy_EX/mean": 0.62109375, "rewards/execution_accuracy_EX/std": 0.4860650300979614, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9747483730316162, "sampling/importance_sampling_ratio/mean": 0.9853553175926208, "sampling/importance_sampling_ratio/min": 0.006758376490324736, "sampling/sampling_logp_difference/max": 4.996972560882568, "sampling/sampling_logp_difference/mean": 0.1312606930732727, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 361.0859375, "completions/mean_terminated_length": 361.0859375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.17142193764448166, "epoch": 0.46017699115044247, "frac_reward_zero_std": 0.0, "grad_norm": 0.39186474780294517, "learning_rate": 1e-06, "loss": 0.002, "num_tokens": 127317318.0, "reward": 0.5621093511581421, "reward_std": 0.47447583079338074, "rewards/execution_accuracy_EX/mean": 0.5390625, "rewards/execution_accuracy_EX/std": 0.4994482398033142, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9859663248062134, "sampling/importance_sampling_ratio/min": 0.008855233900249004, "sampling/sampling_logp_difference/max": 4.726746559143066, "sampling/sampling_logp_difference/mean": 0.139002725481987, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 319.04296875, "completions/mean_terminated_length": 319.04296875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.15111440606415272, "epoch": 0.46194690265486726, "frac_reward_zero_std": 0.0, "grad_norm": 0.1436068113243909, "learning_rate": 1e-06, "loss": -0.0015, "num_tokens": 127721953.0, "reward": 0.7513672113418579, "reward_std": 0.4184097647666931, "rewards/execution_accuracy_EX/mean": 0.73828125, "rewards/execution_accuracy_EX/std": 0.4404313564300537, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9850014448165894, "sampling/importance_sampling_ratio/min": 0.011125700548291206, "sampling/sampling_logp_difference/max": 4.498497486114502, "sampling/sampling_logp_difference/mean": 0.13237908482551575, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 313.9921875, "completions/mean_terminated_length": 313.9921875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.1543640997260809, "epoch": 0.46371681415929206, "frac_reward_zero_std": 0.0, "grad_norm": 0.30275091627194683, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 128086383.0, "reward": 0.755078136920929, "reward_std": 0.41637277603149414, "rewards/execution_accuracy_EX/mean": 0.7421875, "rewards/execution_accuracy_EX/std": 0.4382871091365814, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9859907627105713, "sampling/importance_sampling_ratio/mean": 0.9828374981880188, "sampling/importance_sampling_ratio/min": 0.005257649812847376, "sampling/sampling_logp_difference/max": 5.248071193695068, "sampling/sampling_logp_difference/mean": 0.1326378881931305, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 335.09375, "completions/mean_terminated_length": 335.09375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.16963398829102516, "epoch": 0.4654867256637168, "frac_reward_zero_std": 0.0, "grad_norm": 0.18390525599939167, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 128510423.0, "reward": 0.532421886920929, "reward_std": 0.47587236762046814, "rewards/execution_accuracy_EX/mean": 0.5078125, "rewards/execution_accuracy_EX/std": 0.5009182691574097, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9863378405570984, "sampling/importance_sampling_ratio/min": 0.014339092187583447, "sampling/sampling_logp_difference/max": 4.244765758514404, "sampling/sampling_logp_difference/mean": 0.14008203148841858, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 274.6484375, "completions/mean_terminated_length": 274.6484375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.15089268796145916, "epoch": 0.4672566371681416, "frac_reward_zero_std": 0.0, "grad_norm": 0.2774874574360264, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 129029341.0, "reward": 0.7328125238418579, "reward_std": 0.4279654622077942, "rewards/execution_accuracy_EX/mean": 0.71875, "rewards/execution_accuracy_EX/std": 0.45048993825912476, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9828546047210693, "sampling/importance_sampling_ratio/min": 0.01623484492301941, "sampling/sampling_logp_difference/max": 4.120595455169678, "sampling/sampling_logp_difference/mean": 0.13830263912677765, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 336.5625, "completions/mean_terminated_length": 336.5625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.15643581748008728, "epoch": 0.4690265486725664, "frac_reward_zero_std": 0.0, "grad_norm": 0.26534418305967483, "learning_rate": 1e-06, "loss": -0.0064, "num_tokens": 129427373.0, "reward": 0.7291015386581421, "reward_std": 0.4297545552253723, "rewards/execution_accuracy_EX/mean": 0.71484375, "rewards/execution_accuracy_EX/std": 0.4523732364177704, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9860031008720398, "sampling/importance_sampling_ratio/min": 0.004114307928830385, "sampling/sampling_logp_difference/max": 5.493284702301025, "sampling/sampling_logp_difference/mean": 0.13032007217407227, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 349.44140625, "completions/mean_terminated_length": 349.44140625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.1617734730243683, "epoch": 0.47079646017699117, "frac_reward_zero_std": 0.0, "grad_norm": 0.30356793750029853, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 129892142.0, "reward": 0.666015625, "reward_std": 0.4544737935066223, "rewards/execution_accuracy_EX/mean": 0.6484375, "rewards/execution_accuracy_EX/std": 0.47839346528053284, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9848423004150391, "sampling/importance_sampling_ratio/min": 0.01123324315994978, "sampling/sampling_logp_difference/max": 4.488877773284912, "sampling/sampling_logp_difference/mean": 0.13442406058311462, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 357.18359375, "completions/mean_terminated_length": 357.18359375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "entropy": 0.1611873358488083, "epoch": 0.4725663716814159, "frac_reward_zero_std": 0.0, "grad_norm": 0.28912781415223693, "learning_rate": 1e-06, "loss": -0.0021, "num_tokens": 130367613.0, "reward": 0.6400390863418579, "reward_std": 0.4617617428302765, "rewards/execution_accuracy_EX/mean": 0.62109375, "rewards/execution_accuracy_EX/std": 0.4860650300979614, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.961456060409546, "sampling/importance_sampling_ratio/mean": 0.9876816272735596, "sampling/importance_sampling_ratio/min": 0.011342701502144337, "sampling/sampling_logp_difference/max": 4.479180812835693, "sampling/sampling_logp_difference/mean": 0.13081085681915283, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 335.24609375, "completions/mean_terminated_length": 335.24609375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.15762670896947384, "epoch": 0.4743362831858407, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 130714908.0, "reward": 0.5843749642372131, "reward_std": 0.47219759225845337, "rewards/execution_accuracy_EX/mean": 0.5625, "rewards/execution_accuracy_EX/std": 0.49705013632774353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9845083951950073, "sampling/importance_sampling_ratio/min": 0.003295636037364602, "sampling/sampling_logp_difference/max": 5.715156078338623, "sampling/sampling_logp_difference/mean": 0.1356452852487564, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 318.171875, "completions/mean_terminated_length": 318.171875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.16317665576934814, "epoch": 0.4761061946902655, "frac_reward_zero_std": 0.0, "grad_norm": 0.21386895021090507, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 131188024.0, "reward": 0.6400390863418579, "reward_std": 0.4617617428302765, "rewards/execution_accuracy_EX/mean": 0.62109375, "rewards/execution_accuracy_EX/std": 0.4860650300979614, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9177110195159912, "sampling/importance_sampling_ratio/mean": 0.9831749200820923, "sampling/importance_sampling_ratio/min": 0.0028142144437879324, "sampling/sampling_logp_difference/max": 5.873072147369385, "sampling/sampling_logp_difference/mean": 0.14006644487380981, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 318.9921875, "completions/mean_terminated_length": 318.9921875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.15535980463027954, "epoch": 0.4778761061946903, "frac_reward_zero_std": 0.0, "grad_norm": 0.42667060502469817, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 131626358.0, "reward": 0.7105468511581421, "reward_std": 0.43811774253845215, "rewards/execution_accuracy_EX/mean": 0.6953125, "rewards/execution_accuracy_EX/std": 0.4611765742301941, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9823930263519287, "sampling/importance_sampling_ratio/min": 0.01123306155204773, "sampling/sampling_logp_difference/max": 4.488893985748291, "sampling/sampling_logp_difference/mean": 0.1382257342338562, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 330.00390625, "completions/mean_terminated_length": 330.00390625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.15658987313508987, "epoch": 0.479646017699115, "frac_reward_zero_std": 0.0, "grad_norm": 0.0727885528061531, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 132231719.0, "reward": 0.5843750238418579, "reward_std": 0.47219759225845337, "rewards/execution_accuracy_EX/mean": 0.5625, "rewards/execution_accuracy_EX/std": 0.49705013632774353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9706207513809204, "sampling/importance_sampling_ratio/mean": 0.9849157333374023, "sampling/importance_sampling_ratio/min": 0.018390489742159843, "sampling/sampling_logp_difference/max": 3.9959216117858887, "sampling/sampling_logp_difference/mean": 0.13399212062358856, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 329.05859375, "completions/mean_terminated_length": 329.05859375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.17054855078458786, "epoch": 0.4814159292035398, "frac_reward_zero_std": 0.0, "grad_norm": 0.2497833425558903, "learning_rate": 1e-06, "loss": -0.0048, "num_tokens": 132795046.0, "reward": 0.5101562142372131, "reward_std": 0.4756980240345001, "rewards/execution_accuracy_EX/mean": 0.484375, "rewards/execution_accuracy_EX/std": 0.5007347464561462, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9843830466270447, "sampling/importance_sampling_ratio/min": 0.014309653080999851, "sampling/sampling_logp_difference/max": 4.24682092666626, "sampling/sampling_logp_difference/mean": 0.1444835662841797, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 363.8359375, "completions/mean_terminated_length": 363.8359375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.16747299954295158, "epoch": 0.4831858407079646, "frac_reward_zero_std": 0.0, "grad_norm": 0.26470996479696285, "learning_rate": 1e-06, "loss": -0.0043, "num_tokens": 133427404.0, "reward": 0.5732421875, "reward_std": 0.47346949577331543, "rewards/execution_accuracy_EX/mean": 0.55078125, "rewards/execution_accuracy_EX/std": 0.49838894605636597, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9838159084320068, "sampling/importance_sampling_ratio/min": 0.01124486792832613, "sampling/sampling_logp_difference/max": 4.4878435134887695, "sampling/sampling_logp_difference/mean": 0.1417458951473236, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 325.0390625, "completions/mean_terminated_length": 325.0390625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.1463457401841879, "epoch": 0.4849557522123894, "frac_reward_zero_std": 0.0, "grad_norm": 0.11071843423636837, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 133968694.0, "reward": 0.6957031488418579, "reward_std": 0.44413506984710693, "rewards/execution_accuracy_EX/mean": 0.6796875, "rewards/execution_accuracy_EX/std": 0.4675106406211853, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9844371676445007, "sampling/importance_sampling_ratio/min": 0.014294745400547981, "sampling/sampling_logp_difference/max": 4.247863292694092, "sampling/sampling_logp_difference/mean": 0.1296854466199875, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 311.1796875, "completions/mean_terminated_length": 311.1796875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.14501554891467094, "epoch": 0.48672566371681414, "frac_reward_zero_std": 0.0, "grad_norm": 0.12567377006957567, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 134473476.0, "reward": 0.6363281011581421, "reward_std": 0.46267399191856384, "rewards/execution_accuracy_EX/mean": 0.6171875, "rewards/execution_accuracy_EX/std": 0.48702529072761536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9830794334411621, "sampling/importance_sampling_ratio/min": 0.014313235878944397, "sampling/sampling_logp_difference/max": 4.246570587158203, "sampling/sampling_logp_difference/mean": 0.1320023238658905, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 321.14453125, "completions/mean_terminated_length": 321.14453125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.14876681938767433, "epoch": 0.48849557522123893, "frac_reward_zero_std": 0.0, "grad_norm": 0.20503261543219914, "learning_rate": 1e-06, "loss": -0.0025, "num_tokens": 134942761.0, "reward": 0.5732421875, "reward_std": 0.47346949577331543, "rewards/execution_accuracy_EX/mean": 0.55078125, "rewards/execution_accuracy_EX/std": 0.49838894605636597, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9824311137199402, "sampling/importance_sampling_ratio/min": 0.0143876438960433, "sampling/sampling_logp_difference/max": 4.241385459899902, "sampling/sampling_logp_difference/mean": 0.13451460003852844, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 300.83203125, "completions/mean_terminated_length": 300.83203125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.14324429631233215, "epoch": 0.4902654867256637, "frac_reward_zero_std": 0.0, "grad_norm": 0.22762235222323923, "learning_rate": 1e-06, "loss": -0.0011, "num_tokens": 135427998.0, "reward": 0.740234375, "reward_std": 0.4242667853832245, "rewards/execution_accuracy_EX/mean": 0.7265625, "rewards/execution_accuracy_EX/std": 0.446596622467041, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.978734016418457, "sampling/importance_sampling_ratio/min": 0.011136534623801708, "sampling/sampling_logp_difference/max": 4.497524261474609, "sampling/sampling_logp_difference/mean": 0.14036285877227783, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 339.51953125, "completions/mean_terminated_length": 339.51953125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.13792935758829117, "epoch": 0.4920353982300885, "frac_reward_zero_std": 0.0, "grad_norm": 0.22668747679870122, "learning_rate": 1e-06, "loss": -0.0026, "num_tokens": 136001715.0, "reward": 0.6771484613418579, "reward_std": 0.450854629278183, "rewards/execution_accuracy_EX/mean": 0.66015625, "rewards/execution_accuracy_EX/std": 0.47458380460739136, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9820734262466431, "sampling/importance_sampling_ratio/min": 0.008748387917876244, "sampling/sampling_logp_difference/max": 4.738885879516602, "sampling/sampling_logp_difference/mean": 0.12939125299453735, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 312.015625, "completions/mean_terminated_length": 312.015625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.1376123521476984, "epoch": 0.49380530973451325, "frac_reward_zero_std": 0.0, "grad_norm": 0.2882416970979156, "learning_rate": 1e-06, "loss": -0.0083, "num_tokens": 136525559.0, "reward": 0.6808593273162842, "reward_std": 0.44958025217056274, "rewards/execution_accuracy_EX/mean": 0.6640625, "rewards/execution_accuracy_EX/std": 0.4732423722743988, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9782224297523499, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.13522344827651978, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 338.38671875, "completions/mean_terminated_length": 338.38671875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.1472346018999815, "epoch": 0.49557522123893805, "frac_reward_zero_std": 0.0, "grad_norm": 0.32968065198006247, "learning_rate": 1e-06, "loss": 0.0157, "num_tokens": 137101626.0, "reward": 0.740234375, "reward_std": 0.4242667853832245, "rewards/execution_accuracy_EX/mean": 0.7265625, "rewards/execution_accuracy_EX/std": 0.446596622467041, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.984591007232666, "sampling/importance_sampling_ratio/min": 0.011125885881483555, "sampling/sampling_logp_difference/max": 4.498480796813965, "sampling/sampling_logp_difference/mean": 0.13329678773880005, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 288.8046875, "completions/mean_terminated_length": 288.8046875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.13036941178143024, "epoch": 0.49734513274336284, "frac_reward_zero_std": 0.0, "grad_norm": 0.2993157126682758, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 137503208.0, "reward": 0.725390613079071, "reward_std": 0.4315042495727539, "rewards/execution_accuracy_EX/mean": 0.7109375, "rewards/execution_accuracy_EX/std": 0.45421501994132996, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9811557531356812, "sampling/importance_sampling_ratio/min": 0.008661936968564987, "sampling/sampling_logp_difference/max": 4.748816967010498, "sampling/sampling_logp_difference/mean": 0.12963128089904785, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 308.6953125, "completions/mean_terminated_length": 308.6953125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.13289682008326054, "epoch": 0.49911504424778763, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 137919930.0, "reward": 0.703125, "reward_std": 0.44119933247566223, "rewards/execution_accuracy_EX/mean": 0.6875, "rewards/execution_accuracy_EX/std": 0.4644203782081604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9788047075271606, "sampling/importance_sampling_ratio/min": 0.01113810669630766, "sampling/sampling_logp_difference/max": 4.497383117675781, "sampling/sampling_logp_difference/mean": 0.1310933232307434, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 334.78125, "completions/mean_terminated_length": 334.78125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.1308655822649598, "epoch": 0.5008849557522124, "frac_reward_zero_std": 0.0, "grad_norm": 0.28872882366535085, "learning_rate": 1e-06, "loss": -0.004, "num_tokens": 138393138.0, "reward": 0.5249999761581421, "reward_std": 0.47593045234680176, "rewards/execution_accuracy_EX/mean": 0.5, "rewards/execution_accuracy_EX/std": 0.5009794235229492, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9815727472305298, "sampling/importance_sampling_ratio/min": 0.011154396459460258, "sampling/sampling_logp_difference/max": 4.495921611785889, "sampling/sampling_logp_difference/mean": 0.12588366866111755, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 339.453125, "completions/mean_terminated_length": 339.453125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.1365262884646654, "epoch": 0.5026548672566372, "frac_reward_zero_std": 0.0, "grad_norm": 0.22924419395278106, "learning_rate": 1e-06, "loss": 0.0032, "num_tokens": 138840390.0, "reward": 0.6103515625, "reward_std": 0.46818408370018005, "rewards/execution_accuracy_EX/mean": 0.58984375, "rewards/execution_accuracy_EX/std": 0.49282538890838623, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9821449518203735, "sampling/importance_sampling_ratio/min": 0.014339229092001915, "sampling/sampling_logp_difference/max": 4.24475622177124, "sampling/sampling_logp_difference/mean": 0.1286645233631134, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 280.16796875, "completions/mean_terminated_length": 280.16796875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.12094041053205729, "epoch": 0.504424778761062, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 139351585.0, "reward": 0.762499988079071, "reward_std": 0.41216787695884705, "rewards/execution_accuracy_EX/mean": 0.75, "rewards/execution_accuracy_EX/std": 0.4338609278202057, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9793006181716919, "sampling/importance_sampling_ratio/min": 0.011343728750944138, "sampling/sampling_logp_difference/max": 4.479090213775635, "sampling/sampling_logp_difference/mean": 0.12439154088497162, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 297.28125, "completions/mean_terminated_length": 297.28125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.12189241219311953, "epoch": 0.5061946902654867, "frac_reward_zero_std": 0.0, "grad_norm": 0.06708248641559404, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 139885689.0, "reward": 0.5806640386581421, "reward_std": 0.4726512134075165, "rewards/execution_accuracy_EX/mean": 0.55859375, "rewards/execution_accuracy_EX/std": 0.4975275993347168, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9511847496032715, "sampling/importance_sampling_ratio/mean": 0.9776397943496704, "sampling/importance_sampling_ratio/min": 0.011178990826010704, "sampling/sampling_logp_difference/max": 4.493719100952148, "sampling/sampling_logp_difference/mean": 0.12825796008110046, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 333.3125, "completions/mean_terminated_length": 333.3125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.13882357813417912, "epoch": 0.5079646017699115, "frac_reward_zero_std": 0.0, "grad_norm": 0.11393450014408663, "learning_rate": 1e-06, "loss": 0.0034, "num_tokens": 140461433.0, "reward": 0.5843750238418579, "reward_std": 0.47219759225845337, "rewards/execution_accuracy_EX/mean": 0.5625, "rewards/execution_accuracy_EX/std": 0.49705013632774353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9792859554290771, "sampling/importance_sampling_ratio/min": 0.011136699467897415, "sampling/sampling_logp_difference/max": 4.497509479522705, "sampling/sampling_logp_difference/mean": 0.1329355239868164, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 304.63671875, "completions/mean_terminated_length": 304.63671875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.11912460718303919, "epoch": 0.5097345132743363, "frac_reward_zero_std": 0.0, "grad_norm": 0.24897134072012178, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 140816876.0, "reward": 0.5658203363418579, "reward_std": 0.4741697609424591, "rewards/execution_accuracy_EX/mean": 0.54296875, "rewards/execution_accuracy_EX/std": 0.4991260766983032, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.996685266494751, "sampling/importance_sampling_ratio/mean": 0.9798433780670166, "sampling/importance_sampling_ratio/min": 0.011232483200728893, "sampling/sampling_logp_difference/max": 4.488945484161377, "sampling/sampling_logp_difference/mean": 0.12297806143760681, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 297.28515625, "completions/mean_terminated_length": 297.28515625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.13027150463312864, "epoch": 0.511504424778761, "frac_reward_zero_std": 0.0, "grad_norm": 0.34148070514391965, "learning_rate": 1e-06, "loss": -0.0068, "num_tokens": 141478917.0, "reward": 0.7736327648162842, "reward_std": 0.40552324056625366, "rewards/execution_accuracy_EX/mean": 0.76171875, "rewards/execution_accuracy_EX/std": 0.4268665909767151, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9046118259429932, "sampling/importance_sampling_ratio/mean": 0.9805927276611328, "sampling/importance_sampling_ratio/min": 0.00674409931525588, "sampling/sampling_logp_difference/max": 4.999087333679199, "sampling/sampling_logp_difference/mean": 0.1281258761882782, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 347.13671875, "completions/mean_terminated_length": 347.13671875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.1376639176160097, "epoch": 0.5132743362831859, "frac_reward_zero_std": 0.0, "grad_norm": 0.18460042307419683, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 142237640.0, "reward": 0.5138671398162842, "reward_std": 0.47579970955848694, "rewards/execution_accuracy_EX/mean": 0.48828125, "rewards/execution_accuracy_EX/std": 0.5008418560028076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9834855794906616, "sampling/importance_sampling_ratio/min": 0.0007139044464565814, "sampling/sampling_logp_difference/max": 7.2447614669799805, "sampling/sampling_logp_difference/mean": 0.12558966875076294, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 316.39453125, "completions/mean_terminated_length": 316.39453125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.14952068030834198, "epoch": 0.5150442477876106, "frac_reward_zero_std": 0.0, "grad_norm": 0.4095413319425253, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 143017469.0, "reward": 0.40625, "reward_std": 0.4608176648616791, "rewards/execution_accuracy_EX/mean": 0.375, "rewards/execution_accuracy_EX/std": 0.4850712716579437, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9810730218887329, "sampling/importance_sampling_ratio/min": 0.0004330030642449856, "sampling/sampling_logp_difference/max": 7.744765758514404, "sampling/sampling_logp_difference/mean": 0.1375647485256195, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 318.98828125, "completions/mean_terminated_length": 318.98828125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.1301272250711918, "epoch": 0.5168141592920354, "frac_reward_zero_std": 0.0, "grad_norm": 0.29732860154924123, "learning_rate": 1e-06, "loss": -0.0053, "num_tokens": 143526138.0, "reward": 0.762499988079071, "reward_std": 0.41216787695884705, "rewards/execution_accuracy_EX/mean": 0.75, "rewards/execution_accuracy_EX/std": 0.4338609278202057, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9799952507019043, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.12830430269241333, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 297.74609375, "completions/mean_terminated_length": 297.74609375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.13298062421381474, "epoch": 0.5185840707964602, "frac_reward_zero_std": 0.0, "grad_norm": 0.2787990158226181, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 144205865.0, "reward": 0.7365233898162842, "reward_std": 0.4261363446712494, "rewards/execution_accuracy_EX/mean": 0.72265625, "rewards/execution_accuracy_EX/std": 0.4485645890235901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9447996616363525, "sampling/importance_sampling_ratio/mean": 0.980080783367157, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.13017098605632782, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 324.515625, "completions/mean_terminated_length": 324.515625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.145736213773489, "epoch": 0.5203539823008849, "frac_reward_zero_std": 0.0, "grad_norm": 0.22014542190198488, "learning_rate": 1e-06, "loss": 0.0017, "num_tokens": 144691277.0, "reward": 0.48417967557907104, "reward_std": 0.4741697609424591, "rewards/execution_accuracy_EX/mean": 0.45703125, "rewards/execution_accuracy_EX/std": 0.4991260766983032, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9811964631080627, "sampling/importance_sampling_ratio/min": 0.008673182688653469, "sampling/sampling_logp_difference/max": 4.747519493103027, "sampling/sampling_logp_difference/mean": 0.1368313431739807, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 278.1796875, "completions/mean_terminated_length": 278.1796875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.11698152963072062, "epoch": 0.5221238938053098, "frac_reward_zero_std": 0.0, "grad_norm": 0.31372558641082093, "learning_rate": 1e-06, "loss": -0.0026, "num_tokens": 145100715.0, "reward": 0.5695312023162842, "reward_std": 0.4738343358039856, "rewards/execution_accuracy_EX/mean": 0.546875, "rewards/execution_accuracy_EX/std": 0.4987730085849762, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.978211522102356, "sampling/importance_sampling_ratio/min": 0.006767896935343742, "sampling/sampling_logp_difference/max": 4.995564937591553, "sampling/sampling_logp_difference/mean": 0.12563587725162506, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 303.96875, "completions/mean_terminated_length": 303.96875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.12474698945879936, "epoch": 0.5238938053097345, "frac_reward_zero_std": 0.0, "grad_norm": 0.2729089845891089, "learning_rate": 1e-06, "loss": -0.0032, "num_tokens": 145483427.0, "reward": 0.6029296517372131, "reward_std": 0.46948155760765076, "rewards/execution_accuracy_EX/mean": 0.58203125, "rewards/execution_accuracy_EX/std": 0.49419113993644714, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9778717756271362, "sampling/importance_sampling_ratio/min": 0.01152029074728489, "sampling/sampling_logp_difference/max": 4.4636454582214355, "sampling/sampling_logp_difference/mean": 0.1309039443731308, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 285.16796875, "completions/mean_terminated_length": 285.16796875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.1238878509029746, "epoch": 0.5256637168141592, "frac_reward_zero_std": 0.0, "grad_norm": 0.2991484627446501, "learning_rate": 1e-06, "loss": 0.0155, "num_tokens": 146028926.0, "reward": 0.632617175579071, "reward_std": 0.46355465054512024, "rewards/execution_accuracy_EX/mean": 0.61328125, "rewards/execution_accuracy_EX/std": 0.4879522919654846, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9792416095733643, "sampling/importance_sampling_ratio/min": 0.00682168360799551, "sampling/sampling_logp_difference/max": 4.987648963928223, "sampling/sampling_logp_difference/mean": 0.1251850724220276, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 273.83984375, "completions/mean_terminated_length": 273.83984375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.12272048555314541, "epoch": 0.5274336283185841, "frac_reward_zero_std": 0.0, "grad_norm": 0.24746918048771818, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 146340501.0, "reward": 0.42851561307907104, "reward_std": 0.46600863337516785, "rewards/execution_accuracy_EX/mean": 0.3984375, "rewards/execution_accuracy_EX/std": 0.4905354380607605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7686686515808105, "sampling/importance_sampling_ratio/mean": 0.9759113788604736, "sampling/importance_sampling_ratio/min": 0.008663984946906567, "sampling/sampling_logp_difference/max": 4.748580455780029, "sampling/sampling_logp_difference/mean": 0.13672521710395813, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 291.1484375, "completions/mean_terminated_length": 291.1484375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.12394737359136343, "epoch": 0.5292035398230088, "frac_reward_zero_std": 0.0, "grad_norm": 0.23592129131052397, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 146827131.0, "reward": 0.539843738079071, "reward_std": 0.4756980240345001, "rewards/execution_accuracy_EX/mean": 0.515625, "rewards/execution_accuracy_EX/std": 0.5007347464561462, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9780433177947998, "sampling/importance_sampling_ratio/min": 0.011163590475916862, "sampling/sampling_logp_difference/max": 4.495097637176514, "sampling/sampling_logp_difference/mean": 0.13204070925712585, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 283.06640625, "completions/mean_terminated_length": 283.06640625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.10977283492684364, "epoch": 0.5309734513274337, "frac_reward_zero_std": 0.0, "grad_norm": 0.39569112200650275, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 147205932.0, "reward": 0.6214843988418579, "reward_std": 0.46600863337516785, "rewards/execution_accuracy_EX/mean": 0.6015625, "rewards/execution_accuracy_EX/std": 0.4905354380607605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9754796028137207, "sampling/importance_sampling_ratio/min": 0.004210295621305704, "sampling/sampling_logp_difference/max": 5.470222473144531, "sampling/sampling_logp_difference/mean": 0.12663070857524872, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 293.30078125, "completions/mean_terminated_length": 293.30078125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.13259944692254066, "epoch": 0.5327433628318584, "frac_reward_zero_std": 0.0, "grad_norm": 0.23623998480101524, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 147868361.0, "reward": 0.5806640386581421, "reward_std": 0.4726512134075165, "rewards/execution_accuracy_EX/mean": 0.55859375, "rewards/execution_accuracy_EX/std": 0.4975275993347168, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9826499223709106, "sampling/importance_sampling_ratio/min": 0.009238926693797112, "sampling/sampling_logp_difference/max": 4.684329509735107, "sampling/sampling_logp_difference/mean": 0.12703940272331238, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 283.91796875, "completions/mean_terminated_length": 283.91796875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.1126785958185792, "epoch": 0.5345132743362832, "frac_reward_zero_std": 0.0, "grad_norm": 0.25116616262053426, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 148352836.0, "reward": 0.614062488079071, "reward_std": 0.46748965978622437, "rewards/execution_accuracy_EX/mean": 0.59375, "rewards/execution_accuracy_EX/std": 0.49209436774253845, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9756138920783997, "sampling/importance_sampling_ratio/min": 0.011125700548291206, "sampling/sampling_logp_difference/max": 4.498497486114502, "sampling/sampling_logp_difference/mean": 0.12942509353160858, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 291.55859375, "completions/mean_terminated_length": 291.55859375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.12431715428829193, "epoch": 0.536283185840708, "frac_reward_zero_std": 0.0, "grad_norm": 0.3050077209973281, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 148898403.0, "reward": 0.558398425579071, "reward_std": 0.4747525453567505, "rewards/execution_accuracy_EX/mean": 0.53515625, "rewards/execution_accuracy_EX/std": 0.49973952770233154, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9719961881637573, "sampling/importance_sampling_ratio/mean": 0.9764449596405029, "sampling/importance_sampling_ratio/min": 0.0024954539258033037, "sampling/sampling_logp_difference/max": 5.993284702301025, "sampling/sampling_logp_difference/mean": 0.13283288478851318, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 287.00390625, "completions/mean_terminated_length": 287.00390625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.1052580252289772, "epoch": 0.5380530973451327, "frac_reward_zero_std": 0.0, "grad_norm": 0.2822301487141552, "learning_rate": 1e-06, "loss": 0.0038, "num_tokens": 149474948.0, "reward": 0.7365233898162842, "reward_std": 0.426136314868927, "rewards/execution_accuracy_EX/mean": 0.72265625, "rewards/execution_accuracy_EX/std": 0.4485645890235901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9817387461662292, "sampling/importance_sampling_ratio/min": 0.0031865073833614588, "sampling/sampling_logp_difference/max": 5.7488298416137695, "sampling/sampling_logp_difference/mean": 0.11145463585853577, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 273.0703125, "completions/mean_terminated_length": 273.0703125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.0955903958529234, "epoch": 0.5398230088495575, "frac_reward_zero_std": 0.0, "grad_norm": 0.335108241926158, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 149883126.0, "reward": 0.6734374761581421, "reward_std": 0.45209479331970215, "rewards/execution_accuracy_EX/mean": 0.65625, "rewards/execution_accuracy_EX/std": 0.47588926553726196, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8455147743225098, "sampling/importance_sampling_ratio/mean": 0.9789060354232788, "sampling/importance_sampling_ratio/min": 0.004095620010048151, "sampling/sampling_logp_difference/max": 5.497837066650391, "sampling/sampling_logp_difference/mean": 0.11343404650688171, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 263.23828125, "completions/mean_terminated_length": 263.23828125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.10447422694414854, "epoch": 0.5415929203539823, "frac_reward_zero_std": 0.0, "grad_norm": 0.3523130921969951, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 150463939.0, "reward": 0.7587890625, "reward_std": 0.4142923355102539, "rewards/execution_accuracy_EX/mean": 0.74609375, "rewards/execution_accuracy_EX/std": 0.4360972046852112, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9138906002044678, "sampling/importance_sampling_ratio/mean": 0.9792771935462952, "sampling/importance_sampling_ratio/min": 0.004135684575885534, "sampling/sampling_logp_difference/max": 5.488102436065674, "sampling/sampling_logp_difference/mean": 0.11896039545536041, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 282.62890625, "completions/mean_terminated_length": 282.62890625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.11410106346011162, "epoch": 0.5433628318584071, "frac_reward_zero_std": 0.0, "grad_norm": 0.3785351339284642, "learning_rate": 1e-06, "loss": -0.0069, "num_tokens": 151005332.0, "reward": 0.77734375, "reward_std": 0.4032154679298401, "rewards/execution_accuracy_EX/mean": 0.765625, "rewards/execution_accuracy_EX/std": 0.42443734407424927, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9782164096832275, "sampling/importance_sampling_ratio/min": 0.008668584749102592, "sampling/sampling_logp_difference/max": 4.748049736022949, "sampling/sampling_logp_difference/mean": 0.12442274391651154, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 302.93359375, "completions/mean_terminated_length": 302.93359375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.10900202672928572, "epoch": 0.5451327433628319, "frac_reward_zero_std": 0.0, "grad_norm": 0.2965077105083676, "learning_rate": 1e-06, "loss": -0.0026, "num_tokens": 151567075.0, "reward": 0.7476562261581421, "reward_std": 0.4204040467739105, "rewards/execution_accuracy_EX/mean": 0.734375, "rewards/execution_accuracy_EX/std": 0.4425306022167206, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9635331630706787, "sampling/importance_sampling_ratio/mean": 0.9799326658248901, "sampling/importance_sampling_ratio/min": 0.013507682830095291, "sampling/sampling_logp_difference/max": 4.304496765136719, "sampling/sampling_logp_difference/mean": 0.1177440732717514, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 297.703125, "completions/mean_terminated_length": 297.703125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.10871629603207111, "epoch": 0.5469026548672566, "frac_reward_zero_std": 0.0, "grad_norm": 0.1966184489579502, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 151994919.0, "reward": 0.5101562142372131, "reward_std": 0.47569799423217773, "rewards/execution_accuracy_EX/mean": 0.484375, "rewards/execution_accuracy_EX/std": 0.5007347464561462, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9745786190032959, "sampling/importance_sampling_ratio/min": 0.003188925562426448, "sampling/sampling_logp_difference/max": 5.748071193695068, "sampling/sampling_logp_difference/mean": 0.1306045800447464, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 278.81640625, "completions/mean_terminated_length": 278.81640625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.09989846311509609, "epoch": 0.5486725663716814, "frac_reward_zero_std": 0.0, "grad_norm": 0.28743303121757885, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 152490808.0, "reward": 0.632617175579071, "reward_std": 0.46355465054512024, "rewards/execution_accuracy_EX/mean": 0.61328125, "rewards/execution_accuracy_EX/std": 0.4879522919654846, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9809783697128296, "sampling/importance_sampling_ratio/mean": 0.9790998697280884, "sampling/importance_sampling_ratio/min": 0.008679677732288837, "sampling/sampling_logp_difference/max": 4.746770858764648, "sampling/sampling_logp_difference/mean": 0.11842896044254303, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 264.90234375, "completions/mean_terminated_length": 264.90234375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.09828067570924759, "epoch": 0.5504424778761062, "frac_reward_zero_std": 0.0, "grad_norm": 0.17912085939098438, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 152934223.0, "reward": 0.6808593273162842, "reward_std": 0.44958025217056274, "rewards/execution_accuracy_EX/mean": 0.6640625, "rewards/execution_accuracy_EX/std": 0.4732423722743988, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8792515993118286, "sampling/importance_sampling_ratio/mean": 0.9768507480621338, "sampling/importance_sampling_ratio/min": 0.00525788776576519, "sampling/sampling_logp_difference/max": 5.248025894165039, "sampling/sampling_logp_difference/mean": 0.12117663770914078, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 272.1953125, "completions/mean_terminated_length": 272.1953125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.09398173447698355, "epoch": 0.552212389380531, "frac_reward_zero_std": 0.0, "grad_norm": 0.254011635424865, "learning_rate": 1e-06, "loss": -0.0011, "num_tokens": 153340865.0, "reward": 0.7142578363418579, "reward_std": 0.4365212619304657, "rewards/execution_accuracy_EX/mean": 0.69921875, "rewards/execution_accuracy_EX/std": 0.45949608087539673, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8599828481674194, "sampling/importance_sampling_ratio/mean": 0.9765916466712952, "sampling/importance_sampling_ratio/min": 0.006748223211616278, "sampling/sampling_logp_difference/max": 4.998476028442383, "sampling/sampling_logp_difference/mean": 0.11689199507236481, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 263.62890625, "completions/mean_terminated_length": 263.62890625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.09625769220292568, "epoch": 0.5539823008849557, "frac_reward_zero_std": 0.0, "grad_norm": 0.20811602609326846, "learning_rate": 1e-06, "loss": -0.0044, "num_tokens": 153646226.0, "reward": 0.6771484613418579, "reward_std": 0.450854629278183, "rewards/execution_accuracy_EX/mean": 0.66015625, "rewards/execution_accuracy_EX/std": 0.47458380460739136, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8367109298706055, "sampling/importance_sampling_ratio/mean": 0.9744427800178528, "sampling/importance_sampling_ratio/min": 0.0052512455731630325, "sampling/sampling_logp_difference/max": 5.2492899894714355, "sampling/sampling_logp_difference/mean": 0.12279993295669556, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 258.38671875, "completions/mean_terminated_length": 258.38671875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.09994737897068262, "epoch": 0.5557522123893806, "frac_reward_zero_std": 0.0, "grad_norm": 0.31314542087485775, "learning_rate": 1e-06, "loss": -0.0074, "num_tokens": 154093365.0, "reward": 0.5806640386581421, "reward_std": 0.4726512134075165, "rewards/execution_accuracy_EX/mean": 0.55859375, "rewards/execution_accuracy_EX/std": 0.4975275993347168, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9763374328613281, "sampling/importance_sampling_ratio/min": 0.005260583944618702, "sampling/sampling_logp_difference/max": 5.247513294219971, "sampling/sampling_logp_difference/mean": 0.1211942732334137, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 240.60546875, "completions/mean_terminated_length": 240.60546875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.09025467094033957, "epoch": 0.5575221238938053, "frac_reward_zero_std": 0.0, "grad_norm": 0.31219309775486603, "learning_rate": 1e-06, "loss": 0.0021, "num_tokens": 154605888.0, "reward": 0.6734374761581421, "reward_std": 0.45209482312202454, "rewards/execution_accuracy_EX/mean": 0.65625, "rewards/execution_accuracy_EX/std": 0.47588926553726196, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8792815208435059, "sampling/importance_sampling_ratio/mean": 0.9757258892059326, "sampling/importance_sampling_ratio/min": 0.0035351368132978678, "sampling/sampling_logp_difference/max": 5.645003318786621, "sampling/sampling_logp_difference/mean": 0.12007880210876465, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 250.88671875, "completions/mean_terminated_length": 250.88671875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.10382918268442154, "epoch": 0.5592920353982301, "frac_reward_zero_std": 0.0, "grad_norm": 0.4815627276678539, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 155197251.0, "reward": 0.6734374761581421, "reward_std": 0.45209482312202454, "rewards/execution_accuracy_EX/mean": 0.65625, "rewards/execution_accuracy_EX/std": 0.47588926553726196, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9763296246528625, "sampling/importance_sampling_ratio/min": 0.011155710555613041, "sampling/sampling_logp_difference/max": 4.4958038330078125, "sampling/sampling_logp_difference/mean": 0.12063725292682648, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 245.35546875, "completions/mean_terminated_length": 245.35546875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.09619526844471693, "epoch": 0.5610619469026549, "frac_reward_zero_std": 0.0, "grad_norm": 0.4402110009806778, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 155720206.0, "reward": 0.688281238079071, "reward_std": 0.44692784547805786, "rewards/execution_accuracy_EX/mean": 0.671875, "rewards/execution_accuracy_EX/std": 0.47045037150382996, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9741077423095703, "sampling/importance_sampling_ratio/min": 0.004092916380614042, "sampling/sampling_logp_difference/max": 5.498497486114502, "sampling/sampling_logp_difference/mean": 0.125055730342865, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 250.2265625, "completions/mean_terminated_length": 250.2265625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.10002064984291792, "epoch": 0.5628318584070796, "frac_reward_zero_std": 0.0, "grad_norm": 0.2587463217461931, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 156195000.0, "reward": 0.5843749642372131, "reward_std": 0.47219762206077576, "rewards/execution_accuracy_EX/mean": 0.5625, "rewards/execution_accuracy_EX/std": 0.49705013632774353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.97621750831604, "sampling/importance_sampling_ratio/min": 0.011119124479591846, "sampling/sampling_logp_difference/max": 4.499088764190674, "sampling/sampling_logp_difference/mean": 0.11931677907705307, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 269.61328125, "completions/mean_terminated_length": 269.61328125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.10486160963773727, "epoch": 0.5646017699115045, "frac_reward_zero_std": 0.0, "grad_norm": 0.2857999286716648, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 156651493.0, "reward": 0.4916015565395355, "reward_std": 0.4747525453567505, "rewards/execution_accuracy_EX/mean": 0.46484375, "rewards/execution_accuracy_EX/std": 0.49973952770233154, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9782221913337708, "sampling/importance_sampling_ratio/min": 0.004096901509910822, "sampling/sampling_logp_difference/max": 5.497524261474609, "sampling/sampling_logp_difference/mean": 0.11965882033109665, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 219.46484375, "completions/mean_terminated_length": 219.46484375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.07706360146403313, "epoch": 0.5663716814159292, "frac_reward_zero_std": 0.0, "grad_norm": 0.7846440050613834, "learning_rate": 1e-06, "loss": -0.0144, "num_tokens": 157122444.0, "reward": 0.9332031011581421, "reward_std": 0.243365079164505, "rewards/execution_accuracy_EX/mean": 0.9296875, "rewards/execution_accuracy_EX/std": 0.2561737895011902, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8161041736602783, "sampling/importance_sampling_ratio/mean": 0.9782350063323975, "sampling/importance_sampling_ratio/min": 0.006755828391760588, "sampling/sampling_logp_difference/max": 4.997349739074707, "sampling/sampling_logp_difference/mean": 0.10431890189647675, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 270.3203125, "completions/mean_terminated_length": 270.3203125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.09842854365706444, "epoch": 0.5681415929203539, "frac_reward_zero_std": 0.0, "grad_norm": 0.25287181076132914, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 157775678.0, "reward": 0.49531251192092896, "reward_std": 0.47499996423721313, "rewards/execution_accuracy_EX/mean": 0.46875, "rewards/execution_accuracy_EX/std": 0.5, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9794289469718933, "sampling/importance_sampling_ratio/min": 0.011159423738718033, "sampling/sampling_logp_difference/max": 4.495471000671387, "sampling/sampling_logp_difference/mean": 0.10976652801036835, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 239.75390625, "completions/mean_terminated_length": 239.75390625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.0821539806202054, "epoch": 0.5699115044247788, "frac_reward_zero_std": 0.0, "grad_norm": 0.5025369058197589, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 158274831.0, "reward": 0.591796875, "reward_std": 0.471201092004776, "rewards/execution_accuracy_EX/mean": 0.5703125, "rewards/execution_accuracy_EX/std": 0.4960011839866638, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7514076232910156, "sampling/importance_sampling_ratio/mean": 0.9740892648696899, "sampling/importance_sampling_ratio/min": 0.008661828935146332, "sampling/sampling_logp_difference/max": 4.748829364776611, "sampling/sampling_logp_difference/mean": 0.1122419685125351, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 250.7109375, "completions/mean_terminated_length": 250.7109375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.10371797811239958, "epoch": 0.5716814159292035, "frac_reward_zero_std": 0.0, "grad_norm": 0.45759429321599143, "learning_rate": 1e-06, "loss": 0.0021, "num_tokens": 158743541.0, "reward": 0.706835925579071, "reward_std": 0.43967700004577637, "rewards/execution_accuracy_EX/mean": 0.69140625, "rewards/execution_accuracy_EX/std": 0.46281787753105164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9778295755386353, "sampling/importance_sampling_ratio/min": 0.007033617235720158, "sampling/sampling_logp_difference/max": 4.957054138183594, "sampling/sampling_logp_difference/mean": 0.12171629071235657, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 259.421875, "completions/mean_terminated_length": 259.421875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.10069741401821375, "epoch": 0.5734513274336284, "frac_reward_zero_std": 0.0, "grad_norm": 0.42692721403591716, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 159162193.0, "reward": 0.7291015386581421, "reward_std": 0.4297545552253723, "rewards/execution_accuracy_EX/mean": 0.71484375, "rewards/execution_accuracy_EX/std": 0.4523732364177704, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8864747285842896, "sampling/importance_sampling_ratio/mean": 0.9758027791976929, "sampling/importance_sampling_ratio/min": 0.004090497270226479, "sampling/sampling_logp_difference/max": 5.499088764190674, "sampling/sampling_logp_difference/mean": 0.12244510650634766, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 244.5234375, "completions/mean_terminated_length": 244.5234375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.09591101668775082, "epoch": 0.5752212389380531, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 159625319.0, "reward": 0.703125, "reward_std": 0.44119933247566223, "rewards/execution_accuracy_EX/mean": 0.6875, "rewards/execution_accuracy_EX/std": 0.4644203782081604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9343760013580322, "sampling/importance_sampling_ratio/mean": 0.9767451286315918, "sampling/importance_sampling_ratio/min": 0.006755660753697157, "sampling/sampling_logp_difference/max": 4.997374534606934, "sampling/sampling_logp_difference/mean": 0.12048230320215225, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 260.671875, "completions/mean_terminated_length": 260.671875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.09766383562237024, "epoch": 0.5769911504424778, "frac_reward_zero_std": 0.0, "grad_norm": 0.3238322691384502, "learning_rate": 1e-06, "loss": 0.0119, "num_tokens": 160079603.0, "reward": 0.539843738079071, "reward_std": 0.4756980240345001, "rewards/execution_accuracy_EX/mean": 0.515625, "rewards/execution_accuracy_EX/std": 0.5007347464561462, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8316380977630615, "sampling/importance_sampling_ratio/mean": 0.9780088663101196, "sampling/importance_sampling_ratio/min": 0.004358318634331226, "sampling/sampling_logp_difference/max": 5.4356689453125, "sampling/sampling_logp_difference/mean": 0.11697864532470703, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 260.49609375, "completions/mean_terminated_length": 260.49609375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.09727056697010994, "epoch": 0.5787610619469027, "frac_reward_zero_std": 0.0, "grad_norm": 0.12152999501259097, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 160609698.0, "reward": 0.4136718511581421, "reward_std": 0.46267399191856384, "rewards/execution_accuracy_EX/mean": 0.3828125, "rewards/execution_accuracy_EX/std": 0.48702529072761536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9785568714141846, "sampling/importance_sampling_ratio/min": 0.008679252117872238, "sampling/sampling_logp_difference/max": 4.746819972991943, "sampling/sampling_logp_difference/mean": 0.11541437357664108, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 267.3046875, "completions/mean_terminated_length": 267.3046875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.09658203925937414, "epoch": 0.5805309734513274, "frac_reward_zero_std": 0.0, "grad_norm": 0.16257796850582978, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 161201104.0, "reward": 0.4544922113418579, "reward_std": 0.47065800428390503, "rewards/execution_accuracy_EX/mean": 0.42578125, "rewards/execution_accuracy_EX/std": 0.49542948603630066, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9462175369262695, "sampling/importance_sampling_ratio/mean": 0.9753714799880981, "sampling/importance_sampling_ratio/min": 0.008661828935146332, "sampling/sampling_logp_difference/max": 4.748829364776611, "sampling/sampling_logp_difference/mean": 0.12179063260555267, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 231.80078125, "completions/mean_terminated_length": 231.80078125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.09262214880436659, "epoch": 0.5823008849557522, "frac_reward_zero_std": 0.0, "grad_norm": 0.29201357395420724, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 161737357.0, "reward": 0.814453125, "reward_std": 0.37735676765441895, "rewards/execution_accuracy_EX/mean": 0.8046875, "rewards/execution_accuracy_EX/std": 0.39721766114234924, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9784231185913086, "sampling/importance_sampling_ratio/min": 0.006784507539123297, "sampling/sampling_logp_difference/max": 4.9931135177612305, "sampling/sampling_logp_difference/mean": 0.11321832984685898, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 245.95703125, "completions/mean_terminated_length": 245.95703125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.09585994947701693, "epoch": 0.584070796460177, "frac_reward_zero_std": 0.0, "grad_norm": 0.2578667322795693, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 162234770.0, "reward": 0.6214843392372131, "reward_std": 0.46600863337516785, "rewards/execution_accuracy_EX/mean": 0.6015625, "rewards/execution_accuracy_EX/std": 0.4905354380607605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8975902795791626, "sampling/importance_sampling_ratio/mean": 0.9785220623016357, "sampling/importance_sampling_ratio/min": 0.008152726106345654, "sampling/sampling_logp_difference/max": 4.809402942657471, "sampling/sampling_logp_difference/mean": 0.11667103320360184, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 242.01171875, "completions/mean_terminated_length": 242.01171875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.09702995885163546, "epoch": 0.5858407079646017, "frac_reward_zero_std": 0.0, "grad_norm": 0.5710229349636504, "learning_rate": 1e-06, "loss": 0.0107, "num_tokens": 162718165.0, "reward": 0.6845703125, "reward_std": 0.44827139377593994, "rewards/execution_accuracy_EX/mean": 0.66796875, "rewards/execution_accuracy_EX/std": 0.4718646705150604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9797040224075317, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.1162787675857544, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 250.1640625, "completions/mean_terminated_length": 250.1640625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.1122655738145113, "epoch": 0.5876106194690266, "frac_reward_zero_std": 0.0, "grad_norm": 0.4722082937173414, "learning_rate": 1e-06, "loss": -0.0079, "num_tokens": 163255727.0, "reward": 0.6326172351837158, "reward_std": 0.46355465054512024, "rewards/execution_accuracy_EX/mean": 0.61328125, "rewards/execution_accuracy_EX/std": 0.4879522919654846, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.922654390335083, "sampling/importance_sampling_ratio/mean": 0.9778604507446289, "sampling/importance_sampling_ratio/min": 0.014302237890660763, "sampling/sampling_logp_difference/max": 4.247339248657227, "sampling/sampling_logp_difference/mean": 0.12552200257778168, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 268.4375, "completions/mean_terminated_length": 268.4375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.10480065550655127, "epoch": 0.5893805309734513, "frac_reward_zero_std": 0.0, "grad_norm": 0.2888702807425869, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 163635503.0, "reward": 0.42851561307907104, "reward_std": 0.46600863337516785, "rewards/execution_accuracy_EX/mean": 0.3984375, "rewards/execution_accuracy_EX/std": 0.4905354380607605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8170230388641357, "sampling/importance_sampling_ratio/mean": 0.9762803912162781, "sampling/importance_sampling_ratio/min": 0.006748165003955364, "sampling/sampling_logp_difference/max": 4.9984846115112305, "sampling/sampling_logp_difference/mean": 0.12657693028450012, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1035.0, "completions/max_terminated_length": 1035.0, "completions/mean_length": 312.25, "completions/mean_terminated_length": 312.25, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.12133604194968939, "epoch": 0.5911504424778761, "frac_reward_zero_std": 0.0, "grad_norm": 0.37753305590241776, "learning_rate": 1e-06, "loss": -0.0337, "num_tokens": 164132943.0, "reward": 0.5843750238418579, "reward_std": 0.47219759225845337, "rewards/execution_accuracy_EX/mean": 0.5625, "rewards/execution_accuracy_EX/std": 0.49705013632774353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9800047874450684, "sampling/importance_sampling_ratio/min": 0.005264220293611288, "sampling/sampling_logp_difference/max": 5.246822357177734, "sampling/sampling_logp_difference/mean": 0.12678708136081696, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 280.16796875, "completions/mean_terminated_length": 280.16796875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.12003406789153814, "epoch": 0.5929203539823009, "frac_reward_zero_std": 0.0, "grad_norm": 0.19497734889866827, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 164540906.0, "reward": 0.6585937142372131, "reward_std": 0.456719309091568, "rewards/execution_accuracy_EX/mean": 0.640625, "rewards/execution_accuracy_EX/std": 0.4807571768760681, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.866982102394104, "sampling/importance_sampling_ratio/mean": 0.9788284301757812, "sampling/importance_sampling_ratio/min": 0.002553603844717145, "sampling/sampling_logp_difference/max": 5.970249652862549, "sampling/sampling_logp_difference/mean": 0.12541426718235016, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 265.65234375, "completions/mean_terminated_length": 265.65234375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.11629681382328272, "epoch": 0.5946902654867257, "frac_reward_zero_std": 0.0, "grad_norm": 0.19946792412467465, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 165054241.0, "reward": 0.6957030892372131, "reward_std": 0.44413506984710693, "rewards/execution_accuracy_EX/mean": 0.6796875, "rewards/execution_accuracy_EX/std": 0.4675106406211853, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9283543825149536, "sampling/importance_sampling_ratio/mean": 0.9771718978881836, "sampling/importance_sampling_ratio/min": 0.008706767112016678, "sampling/sampling_logp_difference/max": 4.743654727935791, "sampling/sampling_logp_difference/mean": 0.12751495838165283, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 270.5703125, "completions/mean_terminated_length": 270.5703125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.11773484852164984, "epoch": 0.5964601769911504, "frac_reward_zero_std": 0.0, "grad_norm": 0.2508809026152861, "learning_rate": 1e-06, "loss": -0.0024, "num_tokens": 165478163.0, "reward": 0.6623046398162842, "reward_std": 0.4556131064891815, "rewards/execution_accuracy_EX/mean": 0.64453125, "rewards/execution_accuracy_EX/std": 0.4795927405357361, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9786386489868164, "sampling/importance_sampling_ratio/min": 0.014295632019639015, "sampling/sampling_logp_difference/max": 4.247801303863525, "sampling/sampling_logp_difference/mean": 0.12480713427066803, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 266.6875, "completions/mean_terminated_length": 266.6875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.10925499815493822, "epoch": 0.5982300884955752, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 165908579.0, "reward": 0.703125, "reward_std": 0.44119933247566223, "rewards/execution_accuracy_EX/mean": 0.6875, "rewards/execution_accuracy_EX/std": 0.4644203782081604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9785383343696594, "sampling/importance_sampling_ratio/min": 0.011943034827709198, "sampling/sampling_logp_difference/max": 4.42760705947876, "sampling/sampling_logp_difference/mean": 0.12010498344898224, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 289.578125, "completions/mean_terminated_length": 289.578125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.12177309859544039, "epoch": 0.6, "frac_reward_zero_std": 0.0, "grad_norm": 0.1742291895418494, "learning_rate": 1e-06, "loss": -0.0015, "num_tokens": 166535463.0, "reward": 0.6882811784744263, "reward_std": 0.44692784547805786, "rewards/execution_accuracy_EX/mean": 0.671875, "rewards/execution_accuracy_EX/std": 0.47045037150382996, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8843876123428345, "sampling/importance_sampling_ratio/mean": 0.9795670509338379, "sampling/importance_sampling_ratio/min": 0.008726546540856361, "sampling/sampling_logp_difference/max": 4.741385459899902, "sampling/sampling_logp_difference/mean": 0.12084391713142395, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 270.84375, "completions/mean_terminated_length": 270.84375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.12302173301577568, "epoch": 0.6017699115044248, "frac_reward_zero_std": 0.0, "grad_norm": 0.2681129890989471, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 167112255.0, "reward": 0.46562498807907104, "reward_std": 0.47219759225845337, "rewards/execution_accuracy_EX/mean": 0.4375, "rewards/execution_accuracy_EX/std": 0.49705013632774353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.822189450263977, "sampling/importance_sampling_ratio/mean": 0.9797269105911255, "sampling/importance_sampling_ratio/min": 0.005811800714582205, "sampling/sampling_logp_difference/max": 5.147864818572998, "sampling/sampling_logp_difference/mean": 0.12662267684936523, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 270.91015625, "completions/mean_terminated_length": 270.91015625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.12622435204684734, "epoch": 0.6035398230088496, "frac_reward_zero_std": 0.0, "grad_norm": 0.15821104933695412, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 167711560.0, "reward": 0.7587890625, "reward_std": 0.4142923057079315, "rewards/execution_accuracy_EX/mean": 0.74609375, "rewards/execution_accuracy_EX/std": 0.4360972046852112, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9329793453216553, "sampling/importance_sampling_ratio/mean": 0.9792646169662476, "sampling/importance_sampling_ratio/min": 0.006748078390955925, "sampling/sampling_logp_difference/max": 4.998497486114502, "sampling/sampling_logp_difference/mean": 0.1252666562795639, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 285.59375, "completions/mean_terminated_length": 285.59375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.13098933827131987, "epoch": 0.6053097345132743, "frac_reward_zero_std": 0.0, "grad_norm": 0.3805116896194998, "learning_rate": 1e-06, "loss": 0.0102, "num_tokens": 168161728.0, "reward": 0.717968761920929, "reward_std": 0.4348871409893036, "rewards/execution_accuracy_EX/mean": 0.703125, "rewards/execution_accuracy_EX/std": 0.45777595043182373, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8983339071273804, "sampling/importance_sampling_ratio/mean": 0.9839341640472412, "sampling/importance_sampling_ratio/min": 0.014309640042483807, "sampling/sampling_logp_difference/max": 4.246821880340576, "sampling/sampling_logp_difference/mean": 0.12266640365123749, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 311.59765625, "completions/mean_terminated_length": 311.59765625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.13412226364016533, "epoch": 0.6070796460176991, "frac_reward_zero_std": 0.0, "grad_norm": 0.44996187058916765, "learning_rate": 1e-06, "loss": 0.0074, "num_tokens": 168695081.0, "reward": 0.6177734136581421, "reward_std": 0.46676450967788696, "rewards/execution_accuracy_EX/mean": 0.59765625, "rewards/execution_accuracy_EX/std": 0.4913311004638672, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9826523065567017, "sampling/importance_sampling_ratio/min": 0.008697489276528358, "sampling/sampling_logp_difference/max": 4.744720935821533, "sampling/sampling_logp_difference/mean": 0.12485301494598389, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 309.85546875, "completions/mean_terminated_length": 309.85546875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.14732183329761028, "epoch": 0.6088495575221239, "frac_reward_zero_std": 0.0, "grad_norm": 0.38312070738900167, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 169420772.0, "reward": 0.521289050579071, "reward_std": 0.47591593861579895, "rewards/execution_accuracy_EX/mean": 0.49609375, "rewards/execution_accuracy_EX/std": 0.5009641647338867, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9817765951156616, "sampling/importance_sampling_ratio/min": 0.014309640042483807, "sampling/sampling_logp_difference/max": 4.246821880340576, "sampling/sampling_logp_difference/mean": 0.1348007321357727, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 309.87890625, "completions/mean_terminated_length": 309.87890625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.14372923597693443, "epoch": 0.6106194690265486, "frac_reward_zero_std": 0.0, "grad_norm": 0.3621342816710659, "learning_rate": 1e-06, "loss": 0.0241, "num_tokens": 169841093.0, "reward": 0.6957031488418579, "reward_std": 0.4441350996494293, "rewards/execution_accuracy_EX/mean": 0.6796875, "rewards/execution_accuracy_EX/std": 0.4675106406211853, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9832894206047058, "sampling/importance_sampling_ratio/min": 0.008700923062860966, "sampling/sampling_logp_difference/max": 4.744326114654541, "sampling/sampling_logp_difference/mean": 0.13119953870773315, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 299.38671875, "completions/mean_terminated_length": 299.38671875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.14318578131496906, "epoch": 0.6123893805309735, "frac_reward_zero_std": 0.0, "grad_norm": 0.24299852408912867, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 170237352.0, "reward": 0.792187511920929, "reward_std": 0.393498033285141, "rewards/execution_accuracy_EX/mean": 0.78125, "rewards/execution_accuracy_EX/std": 0.41420844197273254, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.928519606590271, "sampling/importance_sampling_ratio/mean": 0.9804319143295288, "sampling/importance_sampling_ratio/min": 0.006759620737284422, "sampling/sampling_logp_difference/max": 4.996788501739502, "sampling/sampling_logp_difference/mean": 0.1348951756954193, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.13611670583486557, "epoch": 0.6141592920353982, "frac_reward_zero_std": 0.0, "grad_norm": 0.17150394795151383, "learning_rate": 1e-06, "loss": -0.0028, "num_tokens": 170787464.0, "reward": 0.7142578363418579, "reward_std": 0.4365212619304657, "rewards/execution_accuracy_EX/mean": 0.69921875, "rewards/execution_accuracy_EX/std": 0.45949608087539673, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9802919626235962, "sampling/importance_sampling_ratio/min": 0.011360770091414452, "sampling/sampling_logp_difference/max": 4.477589130401611, "sampling/sampling_logp_difference/mean": 0.1316777467727661, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 326.8984375, "completions/mean_terminated_length": 326.8984375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.16614997386932373, "epoch": 0.6159292035398231, "frac_reward_zero_std": 0.0, "grad_norm": 0.2131704702543648, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 171351518.0, "reward": 0.3060546815395355, "reward_std": 0.42235618829727173, "rewards/execution_accuracy_EX/mean": 0.26953125, "rewards/execution_accuracy_EX/std": 0.44458550214767456, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9815507531166077, "sampling/importance_sampling_ratio/min": 0.011136953718960285, "sampling/sampling_logp_difference/max": 4.497486591339111, "sampling/sampling_logp_difference/mean": 0.14674508571624756, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 303.4609375, "completions/mean_terminated_length": 303.4609375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.14835299737751484, "epoch": 0.6176991150442478, "frac_reward_zero_std": 0.0, "grad_norm": 0.29762721413951043, "learning_rate": 1e-06, "loss": -0.0051, "num_tokens": 171970196.0, "reward": 0.4693359136581421, "reward_std": 0.4726511836051941, "rewards/execution_accuracy_EX/mean": 0.44140625, "rewards/execution_accuracy_EX/std": 0.4975275993347168, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9478397369384766, "sampling/importance_sampling_ratio/mean": 0.9828253984451294, "sampling/importance_sampling_ratio/min": 0.0022946451790630817, "sampling/sampling_logp_difference/max": 6.077177047729492, "sampling/sampling_logp_difference/mean": 0.13308770954608917, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 299.88671875, "completions/mean_terminated_length": 299.88671875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.15259709022939205, "epoch": 0.6194690265486725, "frac_reward_zero_std": 0.0, "grad_norm": 0.3279121660327367, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 172545751.0, "reward": 0.591796875, "reward_std": 0.471201092004776, "rewards/execution_accuracy_EX/mean": 0.5703125, "rewards/execution_accuracy_EX/std": 0.4960011839866638, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9815083742141724, "sampling/importance_sampling_ratio/min": 0.011154396459460258, "sampling/sampling_logp_difference/max": 4.495921611785889, "sampling/sampling_logp_difference/mean": 0.13767719268798828, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 311.00390625, "completions/mean_terminated_length": 311.00390625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.14528056234121323, "epoch": 0.6212389380530974, "frac_reward_zero_std": 0.0, "grad_norm": 0.23019182346710979, "learning_rate": 1e-06, "loss": -0.0028, "num_tokens": 173100168.0, "reward": 0.6029297113418579, "reward_std": 0.46948155760765076, "rewards/execution_accuracy_EX/mean": 0.58203125, "rewards/execution_accuracy_EX/std": 0.49419113993644714, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9438985586166382, "sampling/importance_sampling_ratio/mean": 0.9813041687011719, "sampling/importance_sampling_ratio/min": 0.011136539280414581, "sampling/sampling_logp_difference/max": 4.497523784637451, "sampling/sampling_logp_difference/mean": 0.13343140482902527, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 792.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 308.8203125, "completions/mean_terminated_length": 308.8203125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.17054642364382744, "epoch": 0.6230088495575221, "frac_reward_zero_std": 0.0, "grad_norm": 0.2460299764669613, "learning_rate": 1e-06, "loss": -0.0024, "num_tokens": 173559146.0, "reward": 0.6771484613418579, "reward_std": 0.4508545994758606, "rewards/execution_accuracy_EX/mean": 0.66015625, "rewards/execution_accuracy_EX/std": 0.47458380460739136, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9823266267776489, "sampling/importance_sampling_ratio/min": 0.01840067282319069, "sampling/sampling_logp_difference/max": 3.995368003845215, "sampling/sampling_logp_difference/mean": 0.1443067193031311, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 333.90625, "completions/mean_terminated_length": 333.90625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.17115960828959942, "epoch": 0.6247787610619469, "frac_reward_zero_std": 0.0, "grad_norm": 0.11972538118286806, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 173985234.0, "reward": 0.5064452886581421, "reward_std": 0.47556719183921814, "rewards/execution_accuracy_EX/mean": 0.48046875, "rewards/execution_accuracy_EX/std": 0.5005971193313599, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9859043955802917, "sampling/importance_sampling_ratio/min": 0.0011984164593741298, "sampling/sampling_logp_difference/max": 6.726754188537598, "sampling/sampling_logp_difference/mean": 0.14230495691299438, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 279.91015625, "completions/mean_terminated_length": 279.91015625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.15714056976139545, "epoch": 0.6265486725663717, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 174461883.0, "reward": 0.762499988079071, "reward_std": 0.41216787695884705, "rewards/execution_accuracy_EX/mean": 0.75, "rewards/execution_accuracy_EX/std": 0.4338609278202057, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8311058282852173, "sampling/importance_sampling_ratio/mean": 0.9806782007217407, "sampling/importance_sampling_ratio/min": 0.006941431201994419, "sampling/sampling_logp_difference/max": 4.970247268676758, "sampling/sampling_logp_difference/mean": 0.13960732519626617, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 280.109375, "completions/mean_terminated_length": 280.109375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.15817486122250557, "epoch": 0.6283185840707964, "frac_reward_zero_std": 0.0, "grad_norm": 0.28922512002492845, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 174927415.0, "reward": 0.7328124642372131, "reward_std": 0.4279654622077942, "rewards/execution_accuracy_EX/mean": 0.71875, "rewards/execution_accuracy_EX/std": 0.45048993825912476, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9616512060165405, "sampling/importance_sampling_ratio/mean": 0.9811063408851624, "sampling/importance_sampling_ratio/min": 0.018573788926005363, "sampling/sampling_logp_difference/max": 3.986003875732422, "sampling/sampling_logp_difference/mean": 0.1422651708126068, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 316.3828125, "completions/mean_terminated_length": 316.3828125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.16602585650980473, "epoch": 0.6300884955752213, "frac_reward_zero_std": 0.0, "grad_norm": 0.21675306464316224, "learning_rate": 1e-06, "loss": 0.0108, "num_tokens": 175397865.0, "reward": 0.7142578363418579, "reward_std": 0.4365212917327881, "rewards/execution_accuracy_EX/mean": 0.69921875, "rewards/execution_accuracy_EX/std": 0.45949608087539673, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9855636358261108, "sampling/importance_sampling_ratio/min": 0.008775142952799797, "sampling/sampling_logp_difference/max": 4.735832214355469, "sampling/sampling_logp_difference/mean": 0.13811041414737701, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 301.63671875, "completions/mean_terminated_length": 301.63671875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.17440044321119785, "epoch": 0.631858407079646, "frac_reward_zero_std": 0.0, "grad_norm": 0.23015549064829782, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 175926764.0, "reward": 0.669726550579071, "reward_std": 0.4533011019229889, "rewards/execution_accuracy_EX/mean": 0.65234375, "rewards/execution_accuracy_EX/std": 0.4771590530872345, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9849185943603516, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.1428661346435547, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 290.98046875, "completions/mean_terminated_length": 290.98046875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.17171570286154747, "epoch": 0.6336283185840708, "frac_reward_zero_std": 0.0, "grad_norm": 0.2443959815433609, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 176443751.0, "reward": 0.6363281011581421, "reward_std": 0.46267402172088623, "rewards/execution_accuracy_EX/mean": 0.6171875, "rewards/execution_accuracy_EX/std": 0.48702529072761536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9632278680801392, "sampling/importance_sampling_ratio/mean": 0.9822627305984497, "sampling/importance_sampling_ratio/min": 0.013144331984221935, "sampling/sampling_logp_difference/max": 4.3317646980285645, "sampling/sampling_logp_difference/mean": 0.14267569780349731, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 292.84375, "completions/mean_terminated_length": 292.84375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.1765604056417942, "epoch": 0.6353982300884956, "frac_reward_zero_std": 0.0, "grad_norm": 0.3131359599874531, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 177196543.0, "reward": 0.7216796875, "reward_std": 0.4332149922847748, "rewards/execution_accuracy_EX/mean": 0.70703125, "rewards/execution_accuracy_EX/std": 0.45601576566696167, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.979896068572998, "sampling/importance_sampling_ratio/min": 0.011136534623801708, "sampling/sampling_logp_difference/max": 4.497524261474609, "sampling/sampling_logp_difference/mean": 0.15033632516860962, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 272.4609375, "completions/mean_terminated_length": 272.4609375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.15468990057706833, "epoch": 0.6371681415929203, "frac_reward_zero_std": 0.0, "grad_norm": 0.39250898384012695, "learning_rate": 1e-06, "loss": -0.0061, "num_tokens": 177658261.0, "reward": 0.792187511920929, "reward_std": 0.393498033285141, "rewards/execution_accuracy_EX/mean": 0.78125, "rewards/execution_accuracy_EX/std": 0.41420844197273254, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9108880758285522, "sampling/importance_sampling_ratio/mean": 0.9805917739868164, "sampling/importance_sampling_ratio/min": 0.01430963259190321, "sampling/sampling_logp_difference/max": 4.246822357177734, "sampling/sampling_logp_difference/mean": 0.13606077432632446, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 298.0234375, "completions/mean_terminated_length": 298.0234375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.18882922641932964, "epoch": 0.6389380530973451, "frac_reward_zero_std": 0.0, "grad_norm": 0.30951042598403783, "learning_rate": 1e-06, "loss": -0.0093, "num_tokens": 178143723.0, "reward": 0.7142578363418579, "reward_std": 0.4365212619304657, "rewards/execution_accuracy_EX/mean": 0.69921875, "rewards/execution_accuracy_EX/std": 0.45949608087539673, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9841555953025818, "sampling/importance_sampling_ratio/min": 0.01430963259190321, "sampling/sampling_logp_difference/max": 4.246822357177734, "sampling/sampling_logp_difference/mean": 0.15244781970977783, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 285.67578125, "completions/mean_terminated_length": 285.67578125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.1687780823558569, "epoch": 0.6407079646017699, "frac_reward_zero_std": 0.0, "grad_norm": 0.1541661540541937, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 178615368.0, "reward": 0.706835925579071, "reward_std": 0.439676970243454, "rewards/execution_accuracy_EX/mean": 0.69140625, "rewards/execution_accuracy_EX/std": 0.46281787753105164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9848163723945618, "sampling/importance_sampling_ratio/min": 0.01105455867946148, "sampling/sampling_logp_difference/max": 4.504912376403809, "sampling/sampling_logp_difference/mean": 0.14022907614707947, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 326.1484375, "completions/mean_terminated_length": 326.1484375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.19181636348366737, "epoch": 0.6424778761061947, "frac_reward_zero_std": 0.0, "grad_norm": 0.4408793451721107, "learning_rate": 1e-06, "loss": -0.0045, "num_tokens": 179121054.0, "reward": 0.576953113079071, "reward_std": 0.47307512164115906, "rewards/execution_accuracy_EX/mean": 0.5546875, "rewards/execution_accuracy_EX/std": 0.49797385931015015, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.984829306602478, "sampling/importance_sampling_ratio/min": 0.00874820351600647, "sampling/sampling_logp_difference/max": 4.7389068603515625, "sampling/sampling_logp_difference/mean": 0.15180866420269012, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 289.96484375, "completions/mean_terminated_length": 289.96484375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.16963029652833939, "epoch": 0.6442477876106195, "frac_reward_zero_std": 0.0, "grad_norm": 0.12739688224075013, "learning_rate": 1e-06, "loss": -0.0035, "num_tokens": 179656229.0, "reward": 0.703125, "reward_std": 0.44119933247566223, "rewards/execution_accuracy_EX/mean": 0.6875, "rewards/execution_accuracy_EX/std": 0.4644203782081604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9797656536102295, "sampling/importance_sampling_ratio/min": 0.018439162522554398, "sampling/sampling_logp_difference/max": 3.9932785034179688, "sampling/sampling_logp_difference/mean": 0.14988864958286285, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 330.34375, "completions/mean_terminated_length": 330.34375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.182080315425992, "epoch": 0.6460176991150443, "frac_reward_zero_std": 0.0, "grad_norm": 0.19637362753621984, "learning_rate": 1e-06, "loss": -0.0054, "num_tokens": 180251981.0, "reward": 0.7216796875, "reward_std": 0.4332149624824524, "rewards/execution_accuracy_EX/mean": 0.70703125, "rewards/execution_accuracy_EX/std": 0.45601576566696167, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9820328950881958, "sampling/importance_sampling_ratio/mean": 0.9836443662643433, "sampling/importance_sampling_ratio/min": 0.014291773550212383, "sampling/sampling_logp_difference/max": 4.248071193695068, "sampling/sampling_logp_difference/mean": 0.14891904592514038, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 342.00390625, "completions/mean_terminated_length": 342.00390625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.1771495994180441, "epoch": 0.647787610619469, "frac_reward_zero_std": 0.0, "grad_norm": 0.30658590682268666, "learning_rate": 1e-06, "loss": -0.0086, "num_tokens": 180701886.0, "reward": 0.6363281011581421, "reward_std": 0.46267399191856384, "rewards/execution_accuracy_EX/mean": 0.6171875, "rewards/execution_accuracy_EX/std": 0.48702529072761536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9865796566009521, "sampling/importance_sampling_ratio/min": 0.00869713630527258, "sampling/sampling_logp_difference/max": 4.7447614669799805, "sampling/sampling_logp_difference/mean": 0.14339247345924377, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 333.34375, "completions/mean_terminated_length": 333.34375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.1673466358333826, "epoch": 0.6495575221238938, "frac_reward_zero_std": 0.0, "grad_norm": 0.3282424362319832, "learning_rate": 1e-06, "loss": -0.0096, "num_tokens": 181082454.0, "reward": 0.743945300579071, "reward_std": 0.42235618829727173, "rewards/execution_accuracy_EX/mean": 0.73046875, "rewards/execution_accuracy_EX/std": 0.44458550214767456, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9846060276031494, "sampling/importance_sampling_ratio/min": 0.011136539280414581, "sampling/sampling_logp_difference/max": 4.497523784637451, "sampling/sampling_logp_difference/mean": 0.13978558778762817, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1178.0, "completions/max_terminated_length": 1178.0, "completions/mean_length": 357.84375, "completions/mean_terminated_length": 357.84375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.1844386588782072, "epoch": 0.6513274336283186, "frac_reward_zero_std": 0.0, "grad_norm": 0.37451782258654803, "learning_rate": 1e-06, "loss": 0.0052, "num_tokens": 181645694.0, "reward": 0.606640636920929, "reward_std": 0.46884801983833313, "rewards/execution_accuracy_EX/mean": 0.5859375, "rewards/execution_accuracy_EX/std": 0.4935242533683777, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9867744445800781, "sampling/importance_sampling_ratio/min": 0.008679230697453022, "sampling/sampling_logp_difference/max": 4.746822357177734, "sampling/sampling_logp_difference/mean": 0.14332488179206848, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 336.61328125, "completions/mean_terminated_length": 336.61328125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.175746975466609, "epoch": 0.6530973451327433, "frac_reward_zero_std": 0.0, "grad_norm": 0.2723377936995241, "learning_rate": 1e-06, "loss": -0.003, "num_tokens": 182268331.0, "reward": 0.5658202767372131, "reward_std": 0.4741697609424591, "rewards/execution_accuracy_EX/mean": 0.54296875, "rewards/execution_accuracy_EX/std": 0.4991260766983032, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.995046615600586, "sampling/importance_sampling_ratio/mean": 0.9859664440155029, "sampling/importance_sampling_ratio/min": 0.018332336097955704, "sampling/sampling_logp_difference/max": 3.999088764190674, "sampling/sampling_logp_difference/mean": 0.14141210913658142, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 306.1484375, "completions/mean_terminated_length": 306.1484375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.17632250115275383, "epoch": 0.6548672566371682, "frac_reward_zero_std": 0.0, "grad_norm": 0.27545191815465125, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 182616609.0, "reward": 0.5621093511581421, "reward_std": 0.47447580099105835, "rewards/execution_accuracy_EX/mean": 0.5390625, "rewards/execution_accuracy_EX/std": 0.4994482398033142, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9145256280899048, "sampling/importance_sampling_ratio/mean": 0.9843444228172302, "sampling/importance_sampling_ratio/min": 0.011136539280414581, "sampling/sampling_logp_difference/max": 4.497523784637451, "sampling/sampling_logp_difference/mean": 0.1441006362438202, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 320.12890625, "completions/mean_terminated_length": 320.12890625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.19141799211502075, "epoch": 0.6566371681415929, "frac_reward_zero_std": 0.0, "grad_norm": 0.42132400591255087, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 183033346.0, "reward": 0.666015625, "reward_std": 0.4544737935066223, "rewards/execution_accuracy_EX/mean": 0.6484375, "rewards/execution_accuracy_EX/std": 0.47839346528053284, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9803478717803955, "sampling/importance_sampling_ratio/min": 0.018343178555369377, "sampling/sampling_logp_difference/max": 3.998497486114502, "sampling/sampling_logp_difference/mean": 0.16229598224163055, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 324.56640625, "completions/mean_terminated_length": 324.56640625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.1893010027706623, "epoch": 0.6584070796460177, "frac_reward_zero_std": 0.0, "grad_norm": 0.21672127442107716, "learning_rate": 1e-06, "loss": 0.0091, "num_tokens": 183743939.0, "reward": 0.666015625, "reward_std": 0.4544737637042999, "rewards/execution_accuracy_EX/mean": 0.6484375, "rewards/execution_accuracy_EX/std": 0.47839346528053284, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9842427968978882, "sampling/importance_sampling_ratio/min": 0.014364885166287422, "sampling/sampling_logp_difference/max": 4.242968559265137, "sampling/sampling_logp_difference/mean": 0.1574934720993042, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 398.5, "completions/mean_terminated_length": 398.5, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.20206800289452076, "epoch": 0.6601769911504425, "frac_reward_zero_std": 0.0, "grad_norm": 0.28906542018159676, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 184220771.0, "reward": 0.5138671398162842, "reward_std": 0.47579970955848694, "rewards/execution_accuracy_EX/mean": 0.48828125, "rewards/execution_accuracy_EX/std": 0.5008418560028076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9886964559555054, "sampling/importance_sampling_ratio/min": 0.023641198873519897, "sampling/sampling_logp_difference/max": 3.7447643280029297, "sampling/sampling_logp_difference/mean": 0.15359437465667725, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 348.703125, "completions/mean_terminated_length": 348.703125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.1811294611543417, "epoch": 0.6619469026548672, "frac_reward_zero_std": 0.0, "grad_norm": 0.3619688170617479, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 184793095.0, "reward": 0.818164050579071, "reward_std": 0.3744697570800781, "rewards/execution_accuracy_EX/mean": 0.80859375, "rewards/execution_accuracy_EX/std": 0.39417871832847595, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9510424137115479, "sampling/importance_sampling_ratio/mean": 0.985849142074585, "sampling/importance_sampling_ratio/min": 0.02371858060359955, "sampling/sampling_logp_difference/max": 3.7414965629577637, "sampling/sampling_logp_difference/mean": 0.14919239282608032, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 322.453125, "completions/mean_terminated_length": 322.453125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.18431797996163368, "epoch": 0.6637168141592921, "frac_reward_zero_std": 0.0, "grad_norm": 0.20322991160833945, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 185365515.0, "reward": 0.6548827886581421, "reward_std": 0.45779263973236084, "rewards/execution_accuracy_EX/mean": 0.63671875, "rewards/execution_accuracy_EX/std": 0.48188701272010803, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9828222990036011, "sampling/importance_sampling_ratio/min": 0.018439654260873795, "sampling/sampling_logp_difference/max": 3.9932518005371094, "sampling/sampling_logp_difference/mean": 0.15434765815734863, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 367.359375, "completions/mean_terminated_length": 367.359375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.18431459739804268, "epoch": 0.6654867256637168, "frac_reward_zero_std": 0.0, "grad_norm": 0.3488093480850353, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 185861191.0, "reward": 0.558398425579071, "reward_std": 0.4747525453567505, "rewards/execution_accuracy_EX/mean": 0.53515625, "rewards/execution_accuracy_EX/std": 0.49973952770233154, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9850502014160156, "sampling/importance_sampling_ratio/min": 0.01433913316577673, "sampling/sampling_logp_difference/max": 4.244762897491455, "sampling/sampling_logp_difference/mean": 0.15114128589630127, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 341.76171875, "completions/mean_terminated_length": 341.76171875, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.16793095134198666, "epoch": 0.6672566371681415, "frac_reward_zero_std": 0.0, "grad_norm": 0.20028002512110296, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 186490618.0, "reward": 0.6177734136581421, "reward_std": 0.46676453948020935, "rewards/execution_accuracy_EX/mean": 0.59765625, "rewards/execution_accuracy_EX/std": 0.4913311004638672, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9677222967147827, "sampling/importance_sampling_ratio/mean": 0.9848576784133911, "sampling/importance_sampling_ratio/min": 0.018390806391835213, "sampling/sampling_logp_difference/max": 3.9959044456481934, "sampling/sampling_logp_difference/mean": 0.14287824928760529, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 391.6171875, "completions/mean_terminated_length": 391.6171875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.18167337216436863, "epoch": 0.6690265486725664, "frac_reward_zero_std": 0.0, "grad_norm": 0.44731050248057425, "learning_rate": 1e-06, "loss": 0.0213, "num_tokens": 186940952.0, "reward": 0.6214843988418579, "reward_std": 0.46600863337516785, "rewards/execution_accuracy_EX/mean": 0.6015625, "rewards/execution_accuracy_EX/std": 0.4905354380607605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9838932752609253, "sampling/importance_sampling_ratio/min": 0.014340986497700214, "sampling/sampling_logp_difference/max": 4.244633674621582, "sampling/sampling_logp_difference/mean": 0.15083645284175873, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1039.0, "completions/mean_length": 349.71484375, "completions/mean_terminated_length": 335.0235595703125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.18162584863603115, "epoch": 0.6707964601769911, "frac_reward_zero_std": 0.0, "grad_norm": 0.12641193978532114, "learning_rate": 1e-06, "loss": -0.0186, "num_tokens": 187507311.0, "reward": 0.7623046636581421, "reward_std": 0.41251853108406067, "rewards/execution_accuracy_EX/mean": 0.75, "rewards/execution_accuracy_EX/std": 0.4338609278202057, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9822250604629517, "sampling/importance_sampling_ratio/min": 0.011371039785444736, "sampling/sampling_logp_difference/max": 4.476685523986816, "sampling/sampling_logp_difference/mean": 0.15520714223384857, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 318.67578125, "completions/mean_terminated_length": 318.67578125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.16668269224464893, "epoch": 0.672566371681416, "frac_reward_zero_std": 0.0, "grad_norm": 0.1811797559427384, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 188113132.0, "reward": 0.688281238079071, "reward_std": 0.4469278156757355, "rewards/execution_accuracy_EX/mean": 0.671875, "rewards/execution_accuracy_EX/std": 0.47045037150382996, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9808695912361145, "sampling/importance_sampling_ratio/min": 0.014337287284433842, "sampling/sampling_logp_difference/max": 4.24489164352417, "sampling/sampling_logp_difference/mean": 0.14662903547286987, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 347.63671875, "completions/mean_terminated_length": 347.63671875, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.17708031088113785, "epoch": 0.6743362831858407, "frac_reward_zero_std": 0.0, "grad_norm": 0.29415766923403913, "learning_rate": 1e-06, "loss": 0.0116, "num_tokens": 188563679.0, "reward": 0.666015625, "reward_std": 0.4544737637042999, "rewards/execution_accuracy_EX/mean": 0.6484375, "rewards/execution_accuracy_EX/std": 0.47839346528053284, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9815072417259216, "sampling/importance_sampling_ratio/min": 0.011125700548291206, "sampling/sampling_logp_difference/max": 4.498497486114502, "sampling/sampling_logp_difference/mean": 0.15357543528079987, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1074.0, "completions/max_terminated_length": 1074.0, "completions/mean_length": 369.50390625, "completions/mean_terminated_length": 369.50390625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.1768922433257103, "epoch": 0.6761061946902654, "frac_reward_zero_std": 0.0, "grad_norm": 0.26338379961442426, "learning_rate": 1e-06, "loss": -0.0077, "num_tokens": 189036928.0, "reward": 0.6845703125, "reward_std": 0.44827139377593994, "rewards/execution_accuracy_EX/mean": 0.66796875, "rewards/execution_accuracy_EX/std": 0.4718646705150604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9431287050247192, "sampling/importance_sampling_ratio/mean": 0.984576940536499, "sampling/importance_sampling_ratio/min": 0.008697203360497952, "sampling/sampling_logp_difference/max": 4.744753837585449, "sampling/sampling_logp_difference/mean": 0.14685526490211487, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 355.2734375, "completions/mean_terminated_length": 355.2734375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.17549428157508373, "epoch": 0.6778761061946903, "frac_reward_zero_std": 0.0, "grad_norm": 0.3562485539515273, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 189634886.0, "reward": 0.666015625, "reward_std": 0.4544737637042999, "rewards/execution_accuracy_EX/mean": 0.6484375, "rewards/execution_accuracy_EX/std": 0.47839346528053284, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9829525947570801, "sampling/importance_sampling_ratio/min": 0.018439073115587234, "sampling/sampling_logp_difference/max": 3.993283271789551, "sampling/sampling_logp_difference/mean": 0.1483614444732666, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1076.0, "completions/max_terminated_length": 1076.0, "completions/mean_length": 333.75390625, "completions/mean_terminated_length": 333.75390625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.1677437536418438, "epoch": 0.679646017699115, "frac_reward_zero_std": 0.0, "grad_norm": 0.23258807614770383, "learning_rate": 1e-06, "loss": -0.0157, "num_tokens": 190079831.0, "reward": 0.6771484613418579, "reward_std": 0.450854629278183, "rewards/execution_accuracy_EX/mean": 0.66015625, "rewards/execution_accuracy_EX/std": 0.47458380460739136, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9809868335723877, "sampling/importance_sampling_ratio/min": 0.014412390999495983, "sampling/sampling_logp_difference/max": 4.239666938781738, "sampling/sampling_logp_difference/mean": 0.14908547699451447, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1260.0, "completions/max_terminated_length": 1260.0, "completions/mean_length": 366.09375, "completions/mean_terminated_length": 366.09375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.17234939150512218, "epoch": 0.6814159292035398, "frac_reward_zero_std": 0.0, "grad_norm": 0.07922133000913145, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 190520879.0, "reward": 0.7587890625, "reward_std": 0.4142923355102539, "rewards/execution_accuracy_EX/mean": 0.74609375, "rewards/execution_accuracy_EX/std": 0.4360972046852112, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9845023155212402, "sampling/importance_sampling_ratio/min": 0.011136534623801708, "sampling/sampling_logp_difference/max": 4.497524261474609, "sampling/sampling_logp_difference/mean": 0.1468074917793274, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 344.98046875, "completions/mean_terminated_length": 344.98046875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.16980272345244884, "epoch": 0.6831858407079646, "frac_reward_zero_std": 0.0, "grad_norm": 0.3334577712274148, "learning_rate": 1e-06, "loss": 0.0134, "num_tokens": 191015850.0, "reward": 0.7328124642372131, "reward_std": 0.4279654324054718, "rewards/execution_accuracy_EX/mean": 0.71875, "rewards/execution_accuracy_EX/std": 0.45048993825912476, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.924397587776184, "sampling/importance_sampling_ratio/mean": 0.9805434346199036, "sampling/importance_sampling_ratio/min": 0.01062503457069397, "sampling/sampling_logp_difference/max": 4.54454231262207, "sampling/sampling_logp_difference/mean": 0.152784526348114, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1235.0, "completions/max_terminated_length": 1235.0, "completions/mean_length": 378.21875, "completions/mean_terminated_length": 378.21875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.16122937947511673, "epoch": 0.6849557522123894, "frac_reward_zero_std": 0.0, "grad_norm": 0.14798100699710418, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 191360466.0, "reward": 0.6994140148162842, "reward_std": 0.44268524646759033, "rewards/execution_accuracy_EX/mean": 0.68359375, "rewards/execution_accuracy_EX/std": 0.4659844934940338, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9820222854614258, "sampling/importance_sampling_ratio/min": 0.014339085668325424, "sampling/sampling_logp_difference/max": 4.2447662353515625, "sampling/sampling_logp_difference/mean": 0.14505022764205933, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 320.3046875, "completions/mean_terminated_length": 320.3046875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.15651732683181763, "epoch": 0.6867256637168142, "frac_reward_zero_std": 0.0, "grad_norm": 0.1303891658844245, "learning_rate": 1e-06, "loss": 0.0043, "num_tokens": 192174448.0, "reward": 0.6919921636581421, "reward_std": 0.4455491602420807, "rewards/execution_accuracy_EX/mean": 0.67578125, "rewards/execution_accuracy_EX/std": 0.46899911761283875, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9775676131248474, "sampling/importance_sampling_ratio/min": 0.008661825209856033, "sampling/sampling_logp_difference/max": 4.7488298416137695, "sampling/sampling_logp_difference/mean": 0.1476084291934967, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 301.3984375, "completions/mean_terminated_length": 301.3984375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.1464787721633911, "epoch": 0.6884955752212389, "frac_reward_zero_std": 0.0, "grad_norm": 0.3208387829515269, "learning_rate": 1e-06, "loss": -0.0137, "num_tokens": 192575654.0, "reward": 0.8404296636581421, "reward_std": 0.3558422923088074, "rewards/execution_accuracy_EX/mean": 0.83203125, "rewards/execution_accuracy_EX/std": 0.3745708465576172, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.946167230606079, "sampling/importance_sampling_ratio/mean": 0.9796954393386841, "sampling/importance_sampling_ratio/min": 0.011154407635331154, "sampling/sampling_logp_difference/max": 4.495920658111572, "sampling/sampling_logp_difference/mean": 0.14021122455596924, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1214.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 372.05859375, "completions/mean_terminated_length": 372.05859375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.1822884976863861, "epoch": 0.6902654867256637, "frac_reward_zero_std": 0.0, "grad_norm": 0.25459226196081364, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 193008021.0, "reward": 0.6845703125, "reward_std": 0.44827139377593994, "rewards/execution_accuracy_EX/mean": 0.66796875, "rewards/execution_accuracy_EX/std": 0.4718646705150604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9802509546279907, "sampling/importance_sampling_ratio/min": 0.011154396459460258, "sampling/sampling_logp_difference/max": 4.495921611785889, "sampling/sampling_logp_difference/mean": 0.1628512442111969, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1050.0, "completions/max_terminated_length": 1050.0, "completions/mean_length": 375.18359375, "completions/mean_terminated_length": 375.18359375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.16373853757977486, "epoch": 0.6920353982300885, "frac_reward_zero_std": 0.0, "grad_norm": 0.24275023309596502, "learning_rate": 1e-06, "loss": 0.0143, "num_tokens": 193467636.0, "reward": 0.5658202767372131, "reward_std": 0.4741697609424591, "rewards/execution_accuracy_EX/mean": 0.54296875, "rewards/execution_accuracy_EX/std": 0.4991260766983032, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8604068756103516, "sampling/importance_sampling_ratio/mean": 0.9817614555358887, "sampling/importance_sampling_ratio/min": 0.005253662820905447, "sampling/sampling_logp_difference/max": 5.2488298416137695, "sampling/sampling_logp_difference/mean": 0.14821916818618774, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1359.0, "completions/max_terminated_length": 1359.0, "completions/mean_length": 458.9921875, "completions/mean_terminated_length": 458.9921875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.18126251175999641, "epoch": 0.6938053097345133, "frac_reward_zero_std": 0.0, "grad_norm": 0.26367888656482547, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 193961314.0, "reward": 0.5732421875, "reward_std": 0.47346949577331543, "rewards/execution_accuracy_EX/mean": 0.55078125, "rewards/execution_accuracy_EX/std": 0.49838894605636597, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9846888780593872, "sampling/importance_sampling_ratio/min": 0.0178952906280756, "sampling/sampling_logp_difference/max": 4.023217678070068, "sampling/sampling_logp_difference/mean": 0.1535714566707611, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 355.2890625, "completions/mean_terminated_length": 355.2890625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.1500838827341795, "epoch": 0.695575221238938, "frac_reward_zero_std": 0.0, "grad_norm": 0.15445753666811401, "learning_rate": 1e-06, "loss": -0.0036, "num_tokens": 194386748.0, "reward": 0.6437499523162842, "reward_std": 0.4608176648616791, "rewards/execution_accuracy_EX/mean": 0.625, "rewards/execution_accuracy_EX/std": 0.4850712716579437, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9157530069351196, "sampling/importance_sampling_ratio/mean": 0.979602575302124, "sampling/importance_sampling_ratio/min": 0.014344037510454655, "sampling/sampling_logp_difference/max": 4.244421005249023, "sampling/sampling_logp_difference/mean": 0.14788876473903656, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1278.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 402.8359375, "completions/mean_terminated_length": 402.8359375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.16516896896064281, "epoch": 0.6973451327433628, "frac_reward_zero_std": 0.0, "grad_norm": 0.26684222396392127, "learning_rate": 1e-06, "loss": 0.0077, "num_tokens": 194943170.0, "reward": 0.6029297113418579, "reward_std": 0.46948158740997314, "rewards/execution_accuracy_EX/mean": 0.58203125, "rewards/execution_accuracy_EX/std": 0.49419113993644714, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9838463068008423, "sampling/importance_sampling_ratio/min": 0.011136534623801708, "sampling/sampling_logp_difference/max": 4.497524261474609, "sampling/sampling_logp_difference/mean": 0.1454889476299286, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 504.04296875, "completions/mean_terminated_length": 475.75982666015625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.19731133989989758, "epoch": 0.6991150442477876, "frac_reward_zero_std": 0.0, "grad_norm": 0.21571475021879108, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 195425757.0, "reward": 0.6025390625, "reward_std": 0.4699639081954956, "rewards/execution_accuracy_EX/mean": 0.58203125, "rewards/execution_accuracy_EX/std": 0.49419113993644714, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9842385649681091, "sampling/importance_sampling_ratio/min": 0.008363359607756138, "sampling/sampling_logp_difference/max": 4.783895015716553, "sampling/sampling_logp_difference/mean": 0.1636572629213333, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 340.890625, "completions/mean_terminated_length": 340.890625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.1476024929434061, "epoch": 0.7008849557522124, "frac_reward_zero_std": 0.0, "grad_norm": 0.20505814438933806, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 195909905.0, "reward": 0.781054675579071, "reward_std": 0.40085992217063904, "rewards/execution_accuracy_EX/mean": 0.76953125, "rewards/execution_accuracy_EX/std": 0.4219578504562378, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.952225923538208, "sampling/importance_sampling_ratio/mean": 0.9757999181747437, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.15290255844593048, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1082.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 407.89453125, "completions/mean_terminated_length": 407.89453125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.16637560166418552, "epoch": 0.7026548672566372, "frac_reward_zero_std": 0.0, "grad_norm": 0.1967975586757834, "learning_rate": 1e-06, "loss": 0.0101, "num_tokens": 196611574.0, "reward": 0.43964844942092896, "reward_std": 0.46818408370018005, "rewards/execution_accuracy_EX/mean": 0.41015625, "rewards/execution_accuracy_EX/std": 0.49282538890838623, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9798034429550171, "sampling/importance_sampling_ratio/min": 0.011136534623801708, "sampling/sampling_logp_difference/max": 4.497524261474609, "sampling/sampling_logp_difference/mean": 0.1534593105316162, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1075.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 343.5859375, "completions/mean_terminated_length": 343.5859375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.14015005342662334, "epoch": 0.7044247787610619, "frac_reward_zero_std": 0.0, "grad_norm": 0.2878680426578357, "learning_rate": 1e-06, "loss": 0.0054, "num_tokens": 197036092.0, "reward": 0.8070312738418579, "reward_std": 0.38295724987983704, "rewards/execution_accuracy_EX/mean": 0.796875, "rewards/execution_accuracy_EX/std": 0.40311288833618164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9774739742279053, "sampling/importance_sampling_ratio/min": 0.006744090002030134, "sampling/sampling_logp_difference/max": 4.999088764190674, "sampling/sampling_logp_difference/mean": 0.14382323622703552, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1172.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 361.1796875, "completions/mean_terminated_length": 361.1796875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.14296685345470905, "epoch": 0.7061946902654868, "frac_reward_zero_std": 0.0, "grad_norm": 0.12198919653195027, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 197449178.0, "reward": 0.591796875, "reward_std": 0.471201092004776, "rewards/execution_accuracy_EX/mean": 0.5703125, "rewards/execution_accuracy_EX/std": 0.4960011839866638, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9053452014923096, "sampling/importance_sampling_ratio/mean": 0.9786155223846436, "sampling/importance_sampling_ratio/min": 0.011136534623801708, "sampling/sampling_logp_difference/max": 4.497524261474609, "sampling/sampling_logp_difference/mean": 0.14207744598388672, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 369.94921875, "completions/mean_terminated_length": 369.94921875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.14750699326395988, "epoch": 0.7079646017699115, "frac_reward_zero_std": 0.0, "grad_norm": 0.29651302408551683, "learning_rate": 1e-06, "loss": 0.0164, "num_tokens": 197806301.0, "reward": 0.632617175579071, "reward_std": 0.4635546803474426, "rewards/execution_accuracy_EX/mean": 0.61328125, "rewards/execution_accuracy_EX/std": 0.4879522919654846, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8656444549560547, "sampling/importance_sampling_ratio/mean": 0.98008793592453, "sampling/importance_sampling_ratio/min": 0.011136534623801708, "sampling/sampling_logp_difference/max": 4.497524261474609, "sampling/sampling_logp_difference/mean": 0.14344340562820435, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 781.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 346.69140625, "completions/mean_terminated_length": 346.69140625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.1517967376857996, "epoch": 0.7097345132743362, "frac_reward_zero_std": 0.0, "grad_norm": 0.1732909052946054, "learning_rate": 1e-06, "loss": -0.0079, "num_tokens": 198185710.0, "reward": 0.558398425579071, "reward_std": 0.4747525453567505, "rewards/execution_accuracy_EX/mean": 0.53515625, "rewards/execution_accuracy_EX/std": 0.49973952770233154, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9788327217102051, "sampling/importance_sampling_ratio/min": 0.011139354668557644, "sampling/sampling_logp_difference/max": 4.4972710609436035, "sampling/sampling_logp_difference/mean": 0.14681211113929749, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 354.61328125, "completions/mean_terminated_length": 354.61328125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.15006682462990284, "epoch": 0.7115044247787611, "frac_reward_zero_std": 0.0, "grad_norm": 0.24945580913104384, "learning_rate": 1e-06, "loss": 0.0059, "num_tokens": 198746491.0, "reward": 0.6214843392372131, "reward_std": 0.46600866317749023, "rewards/execution_accuracy_EX/mean": 0.6015625, "rewards/execution_accuracy_EX/std": 0.4905354380607605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9758408069610596, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.1516970694065094, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1337.0, "completions/max_terminated_length": 1337.0, "completions/mean_length": 340.38671875, "completions/mean_terminated_length": 340.38671875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.15468825958669186, "epoch": 0.7132743362831858, "frac_reward_zero_std": 0.0, "grad_norm": 0.35526546154810484, "learning_rate": 1e-06, "loss": -0.0104, "num_tokens": 199405054.0, "reward": 0.5843750238418579, "reward_std": 0.47219759225845337, "rewards/execution_accuracy_EX/mean": 0.5625, "rewards/execution_accuracy_EX/std": 0.49705013632774353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8626350164413452, "sampling/importance_sampling_ratio/mean": 0.9773305654525757, "sampling/importance_sampling_ratio/min": 0.0004330030642449856, "sampling/sampling_logp_difference/max": 7.744765758514404, "sampling/sampling_logp_difference/mean": 0.15096834301948547, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1938.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 343.01171875, "completions/mean_terminated_length": 343.01171875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.12916157208383083, "epoch": 0.7150442477876107, "frac_reward_zero_std": 0.0, "grad_norm": 0.37986404361298337, "learning_rate": 1e-06, "loss": 0.0532, "num_tokens": 200009857.0, "reward": 0.8033202886581421, "reward_std": 0.3856732249259949, "rewards/execution_accuracy_EX/mean": 0.79296875, "rewards/execution_accuracy_EX/std": 0.40597182512283325, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9800818562507629, "sampling/importance_sampling_ratio/min": 0.011119124479591846, "sampling/sampling_logp_difference/max": 4.499088764190674, "sampling/sampling_logp_difference/mean": 0.13622796535491943, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1070.0, "completions/max_terminated_length": 1070.0, "completions/mean_length": 310.34375, "completions/mean_terminated_length": 310.34375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.13033237494528294, "epoch": 0.7168141592920354, "frac_reward_zero_std": 0.0, "grad_norm": 0.42270490346299705, "learning_rate": 1e-06, "loss": -0.0318, "num_tokens": 200554713.0, "reward": 0.5658203363418579, "reward_std": 0.4741697609424591, "rewards/execution_accuracy_EX/mean": 0.54296875, "rewards/execution_accuracy_EX/std": 0.4991260766983032, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.892430067062378, "sampling/importance_sampling_ratio/mean": 0.9765554666519165, "sampling/importance_sampling_ratio/min": 0.005313629750162363, "sampling/sampling_logp_difference/max": 5.237480163574219, "sampling/sampling_logp_difference/mean": 0.14130495488643646, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1401.0, "completions/max_terminated_length": 1401.0, "completions/mean_length": 426.2265625, "completions/mean_terminated_length": 426.2265625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.1708923950791359, "epoch": 0.7185840707964601, "frac_reward_zero_std": 0.0, "grad_norm": 0.19902699891012435, "learning_rate": 1e-06, "loss": 0.0069, "num_tokens": 201120515.0, "reward": 0.5509765148162842, "reward_std": 0.475218266248703, "rewards/execution_accuracy_EX/mean": 0.52734375, "rewards/execution_accuracy_EX/std": 0.5002297759056091, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9810044765472412, "sampling/importance_sampling_ratio/min": 0.005353882443159819, "sampling/sampling_logp_difference/max": 5.229933261871338, "sampling/sampling_logp_difference/mean": 0.16047163307666779, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 353.83984375, "completions/mean_terminated_length": 339.16473388671875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.1377250738441944, "epoch": 0.720353982300885, "frac_reward_zero_std": 0.0, "grad_norm": 0.26194346338740876, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 201699914.0, "reward": 0.4580077826976776, "reward_std": 0.471381276845932, "rewards/execution_accuracy_EX/mean": 0.4296875, "rewards/execution_accuracy_EX/std": 0.4960011839866638, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9756262898445129, "sampling/importance_sampling_ratio/min": 0.008682269603013992, "sampling/sampling_logp_difference/max": 4.746472358703613, "sampling/sampling_logp_difference/mean": 0.14412958920001984, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2590.0, "completions/max_terminated_length": 2590.0, "completions/mean_length": 426.4765625, "completions/mean_terminated_length": 426.4765625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.1585888434201479, "epoch": 0.7221238938053097, "frac_reward_zero_std": 0.0, "grad_norm": 0.24600787563356571, "learning_rate": 1e-06, "loss": -0.0396, "num_tokens": 202409444.0, "reward": 0.49160152673721313, "reward_std": 0.4747525453567505, "rewards/execution_accuracy_EX/mean": 0.46484375, "rewards/execution_accuracy_EX/std": 0.49973952770233154, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9811248779296875, "sampling/importance_sampling_ratio/min": 0.0052652242593467236, "sampling/sampling_logp_difference/max": 5.246631622314453, "sampling/sampling_logp_difference/mean": 0.1513960063457489, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 976.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 320.90234375, "completions/mean_terminated_length": 320.90234375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.12159352377057076, "epoch": 0.7238938053097345, "frac_reward_zero_std": 0.0, "grad_norm": 0.2573446286270932, "learning_rate": 1e-06, "loss": -0.0077, "num_tokens": 202815371.0, "reward": 0.5249999761581421, "reward_std": 0.47593045234680176, "rewards/execution_accuracy_EX/mean": 0.5, "rewards/execution_accuracy_EX/std": 0.5009794235229492, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9761395454406738, "sampling/importance_sampling_ratio/min": 0.011183848604559898, "sampling/sampling_logp_difference/max": 4.493284702301025, "sampling/sampling_logp_difference/mean": 0.13443967700004578, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1837.0, "completions/mean_length": 376.0078125, "completions/mean_terminated_length": 361.41961669921875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.13782457448542118, "epoch": 0.7256637168141593, "frac_reward_zero_std": 0.0, "grad_norm": 0.26690984263834117, "learning_rate": 1e-06, "loss": -0.0019, "num_tokens": 203581469.0, "reward": 0.5248047113418579, "reward_std": 0.47613638639450073, "rewards/execution_accuracy_EX/mean": 0.5, "rewards/execution_accuracy_EX/std": 0.5009794235229492, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9774079918861389, "sampling/importance_sampling_ratio/min": 0.0020053053740411997, "sampling/sampling_logp_difference/max": 6.211958885192871, "sampling/sampling_logp_difference/mean": 0.14452630281448364, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1237.0, "completions/mean_length": 405.08203125, "completions/mean_terminated_length": 390.6078796386719, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.13971972465515137, "epoch": 0.727433628318584, "frac_reward_zero_std": 0.0, "grad_norm": 0.1622591965922765, "learning_rate": 1e-06, "loss": -0.0044, "num_tokens": 204176818.0, "reward": 0.5693359375, "reward_std": 0.4740595817565918, "rewards/execution_accuracy_EX/mean": 0.546875, "rewards/execution_accuracy_EX/std": 0.4987730085849762, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 1.9836039543151855, "sampling/importance_sampling_ratio/mean": 0.9794758558273315, "sampling/importance_sampling_ratio/min": 0.0025437939912080765, "sampling/sampling_logp_difference/max": 5.9740986824035645, "sampling/sampling_logp_difference/mean": 0.13981348276138306, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 338.421875, "completions/mean_terminated_length": 338.421875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.12366738822311163, "epoch": 0.7292035398230089, "frac_reward_zero_std": 0.0, "grad_norm": 0.5099800386456143, "learning_rate": 1e-06, "loss": 0.0058, "num_tokens": 204702526.0, "reward": 0.725390613079071, "reward_std": 0.4315042495727539, "rewards/execution_accuracy_EX/mean": 0.7109375, "rewards/execution_accuracy_EX/std": 0.45421501994132996, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9769878387451172, "sampling/importance_sampling_ratio/min": 0.004092916380614042, "sampling/sampling_logp_difference/max": 5.498497486114502, "sampling/sampling_logp_difference/mean": 0.13305461406707764, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 296.91015625, "completions/mean_terminated_length": 296.91015625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.11305101774632931, "epoch": 0.7309734513274336, "frac_reward_zero_std": 0.0, "grad_norm": 0.32362625799769246, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 205312423.0, "reward": 0.7513671517372131, "reward_std": 0.4184097647666931, "rewards/execution_accuracy_EX/mean": 0.73828125, "rewards/execution_accuracy_EX/std": 0.4404313564300537, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9774864912033081, "sampling/importance_sampling_ratio/min": 0.006741675082594156, "sampling/sampling_logp_difference/max": 4.999446868896484, "sampling/sampling_logp_difference/mean": 0.12930026650428772, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1164.0, "completions/max_terminated_length": 1164.0, "completions/mean_length": 301.6953125, "completions/mean_terminated_length": 301.6953125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.12935538683086634, "epoch": 0.7327433628318584, "frac_reward_zero_std": 0.0, "grad_norm": 0.30840177983682704, "learning_rate": 1e-06, "loss": -0.0184, "num_tokens": 205727561.0, "reward": 0.6400390863418579, "reward_std": 0.4617617428302765, "rewards/execution_accuracy_EX/mean": 0.62109375, "rewards/execution_accuracy_EX/std": 0.4860650300979614, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9756845831871033, "sampling/importance_sampling_ratio/min": 0.0052536651492118835, "sampling/sampling_logp_difference/max": 5.248829364776611, "sampling/sampling_logp_difference/mean": 0.14531688392162323, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1171.0, "completions/max_terminated_length": 1171.0, "completions/mean_length": 308.29296875, "completions/mean_terminated_length": 308.29296875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.11433220468461514, "epoch": 0.7345132743362832, "frac_reward_zero_std": 0.0, "grad_norm": 0.32537855064281407, "learning_rate": 1e-06, "loss": -0.005, "num_tokens": 206225012.0, "reward": 0.699414074420929, "reward_std": 0.44268524646759033, "rewards/execution_accuracy_EX/mean": 0.68359375, "rewards/execution_accuracy_EX/std": 0.4659844934940338, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9792124032974243, "sampling/importance_sampling_ratio/min": 0.001520410762168467, "sampling/sampling_logp_difference/max": 6.48877477645874, "sampling/sampling_logp_difference/mean": 0.128114253282547, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 879.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 330.4765625, "completions/mean_terminated_length": 330.4765625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.11765911988914013, "epoch": 0.736283185840708, "frac_reward_zero_std": 0.0, "grad_norm": 0.13226726347536988, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 206701230.0, "reward": 0.632617175579071, "reward_std": 0.46355465054512024, "rewards/execution_accuracy_EX/mean": 0.61328125, "rewards/execution_accuracy_EX/std": 0.4879522919654846, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9782488346099854, "sampling/importance_sampling_ratio/min": 0.005450984928756952, "sampling/sampling_logp_difference/max": 5.211958885192871, "sampling/sampling_logp_difference/mean": 0.1290372759103775, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1738.0, "completions/max_terminated_length": 1738.0, "completions/mean_length": 331.71484375, "completions/mean_terminated_length": 331.71484375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.1193550219759345, "epoch": 0.7380530973451327, "frac_reward_zero_std": 0.0, "grad_norm": 0.1995712844401606, "learning_rate": 1e-06, "loss": 0.0034, "num_tokens": 207215781.0, "reward": 0.669726550579071, "reward_std": 0.4533011019229889, "rewards/execution_accuracy_EX/mean": 0.65234375, "rewards/execution_accuracy_EX/std": 0.4771590530872345, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9766120910644531, "sampling/importance_sampling_ratio/min": 0.005258153658360243, "sampling/sampling_logp_difference/max": 5.2479753494262695, "sampling/sampling_logp_difference/mean": 0.13726986944675446, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 1433.0, "completions/mean_length": 390.8203125, "completions/mean_terminated_length": 361.6456604003906, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.132203945890069, "epoch": 0.7398230088495575, "frac_reward_zero_std": 0.0, "grad_norm": 0.2415576277838756, "learning_rate": 1e-06, "loss": -0.0011, "num_tokens": 207701815.0, "reward": 0.717578113079071, "reward_std": 0.43551141023635864, "rewards/execution_accuracy_EX/mean": 0.703125, "rewards/execution_accuracy_EX/std": 0.45777595043182373, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9799232482910156, "sampling/importance_sampling_ratio/min": 0.006748078390955925, "sampling/sampling_logp_difference/max": 4.998497486114502, "sampling/sampling_logp_difference/mean": 0.13808873295783997, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1667.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 314.91015625, "completions/mean_terminated_length": 314.91015625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.11113210208714008, "epoch": 0.7415929203539823, "frac_reward_zero_std": 0.0, "grad_norm": 0.09458253582339968, "learning_rate": 1e-06, "loss": -0.0015, "num_tokens": 208343072.0, "reward": 0.7587890625, "reward_std": 0.4142923355102539, "rewards/execution_accuracy_EX/mean": 0.74609375, "rewards/execution_accuracy_EX/std": 0.4360972046852112, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8623942136764526, "sampling/importance_sampling_ratio/mean": 0.9757764935493469, "sampling/importance_sampling_ratio/min": 0.011125700548291206, "sampling/sampling_logp_difference/max": 4.498497486114502, "sampling/sampling_logp_difference/mean": 0.13390567898750305, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1395.0, "completions/max_terminated_length": 1395.0, "completions/mean_length": 391.25, "completions/mean_terminated_length": 391.25, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.12792749982327223, "epoch": 0.7433628318584071, "frac_reward_zero_std": 0.0, "grad_norm": 0.2103665825522292, "learning_rate": 1e-06, "loss": 0.005, "num_tokens": 209057408.0, "reward": 0.4730468690395355, "reward_std": 0.47307512164115906, "rewards/execution_accuracy_EX/mean": 0.4453125, "rewards/execution_accuracy_EX/std": 0.49797385931015015, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9786797165870667, "sampling/importance_sampling_ratio/min": 0.011125700548291206, "sampling/sampling_logp_difference/max": 4.498497486114502, "sampling/sampling_logp_difference/mean": 0.1359177827835083, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1192.0, "completions/max_terminated_length": 1192.0, "completions/mean_length": 309.5390625, "completions/mean_terminated_length": 309.5390625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.10244856681674719, "epoch": 0.7451327433628319, "frac_reward_zero_std": 0.0, "grad_norm": 0.13597446755628906, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 209494746.0, "reward": 0.632617175579071, "reward_std": 0.46355465054512024, "rewards/execution_accuracy_EX/mean": 0.61328125, "rewards/execution_accuracy_EX/std": 0.4879522919654846, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9756293296813965, "sampling/importance_sampling_ratio/min": 0.011125764809548855, "sampling/sampling_logp_difference/max": 4.4984917640686035, "sampling/sampling_logp_difference/mean": 0.13026078045368195, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 339.15234375, "completions/mean_terminated_length": 339.15234375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.11116266716271639, "epoch": 0.7469026548672566, "frac_reward_zero_std": 0.0, "grad_norm": 0.24474655526352976, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 210003345.0, "reward": 0.740234375, "reward_std": 0.4242667853832245, "rewards/execution_accuracy_EX/mean": 0.7265625, "rewards/execution_accuracy_EX/std": 0.446596622467041, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9771470427513123, "sampling/importance_sampling_ratio/min": 0.005275054834783077, "sampling/sampling_logp_difference/max": 5.2447662353515625, "sampling/sampling_logp_difference/mean": 0.1310897022485733, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1166.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 323.17578125, "completions/mean_terminated_length": 323.17578125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.10073120426386595, "epoch": 0.7486725663716814, "frac_reward_zero_std": 0.0, "grad_norm": 0.2185985904271473, "learning_rate": 1e-06, "loss": 0.0093, "num_tokens": 210497358.0, "reward": 0.6066405773162842, "reward_std": 0.46884801983833313, "rewards/execution_accuracy_EX/mean": 0.5859375, "rewards/execution_accuracy_EX/std": 0.4935242533683777, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9788148403167725, "sampling/importance_sampling_ratio/min": 0.008668468333780766, "sampling/sampling_logp_difference/max": 4.748063087463379, "sampling/sampling_logp_difference/mean": 0.12084464728832245, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 842.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 336.5390625, "completions/mean_terminated_length": 336.5390625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.09866121783852577, "epoch": 0.7504424778761062, "frac_reward_zero_std": 0.0, "grad_norm": 0.3646291725281725, "learning_rate": 1e-06, "loss": -0.0184, "num_tokens": 210924232.0, "reward": 0.7105468511581421, "reward_std": 0.43811774253845215, "rewards/execution_accuracy_EX/mean": 0.6953125, "rewards/execution_accuracy_EX/std": 0.4611765742301941, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9776594638824463, "sampling/importance_sampling_ratio/min": 0.006850760895758867, "sampling/sampling_logp_difference/max": 4.983395576477051, "sampling/sampling_logp_difference/mean": 0.11906301975250244, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 309.0, "completions/mean_terminated_length": 309.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.10941006522625685, "epoch": 0.7522123893805309, "frac_reward_zero_std": 0.0, "grad_norm": 0.1041127715544993, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 211448024.0, "reward": 0.6957031488418579, "reward_std": 0.4441350996494293, "rewards/execution_accuracy_EX/mean": 0.6796875, "rewards/execution_accuracy_EX/std": 0.4675106406211853, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9391770362854004, "sampling/importance_sampling_ratio/mean": 0.9759188890457153, "sampling/importance_sampling_ratio/min": 0.005253662820905447, "sampling/sampling_logp_difference/max": 5.2488298416137695, "sampling/sampling_logp_difference/mean": 0.1324661374092102, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1479.0, "completions/mean_length": 370.87890625, "completions/mean_terminated_length": 356.2705993652344, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.12021107878535986, "epoch": 0.7539823008849558, "frac_reward_zero_std": 0.0, "grad_norm": 0.2874033325364916, "learning_rate": 1e-06, "loss": -0.0066, "num_tokens": 211892185.0, "reward": 0.6917968392372131, "reward_std": 0.44584256410598755, "rewards/execution_accuracy_EX/mean": 0.67578125, "rewards/execution_accuracy_EX/std": 0.46899911761283875, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 1.8821347951889038, "sampling/importance_sampling_ratio/mean": 0.9791556596755981, "sampling/importance_sampling_ratio/min": 0.008726546540856361, "sampling/sampling_logp_difference/max": 4.741385459899902, "sampling/sampling_logp_difference/mean": 0.13618966937065125, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1255.0, "completions/max_terminated_length": 1255.0, "completions/mean_length": 318.7265625, "completions/mean_terminated_length": 318.7265625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.09897692129015923, "epoch": 0.7557522123893805, "frac_reward_zero_std": 0.0, "grad_norm": 0.17419818865642023, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 212368499.0, "reward": 0.6957031488418579, "reward_std": 0.44413506984710693, "rewards/execution_accuracy_EX/mean": 0.6796875, "rewards/execution_accuracy_EX/std": 0.4675106406211853, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9803605675697327, "sampling/importance_sampling_ratio/min": 0.005269622430205345, "sampling/sampling_logp_difference/max": 5.2457966804504395, "sampling/sampling_logp_difference/mean": 0.11493834853172302, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 332.6796875, "completions/mean_terminated_length": 332.6796875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.09865881130099297, "epoch": 0.7575221238938054, "frac_reward_zero_std": 0.0, "grad_norm": 0.26984354637504, "learning_rate": 1e-06, "loss": 0.0011, "num_tokens": 212933489.0, "reward": 0.595507800579071, "reward_std": 0.47065800428390503, "rewards/execution_accuracy_EX/mean": 0.57421875, "rewards/execution_accuracy_EX/std": 0.49542948603630066, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9766672253608704, "sampling/importance_sampling_ratio/min": 0.004090497270226479, "sampling/sampling_logp_difference/max": 5.499088764190674, "sampling/sampling_logp_difference/mean": 0.12172359228134155, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1338.0, "completions/max_terminated_length": 1338.0, "completions/mean_length": 342.16796875, "completions/mean_terminated_length": 342.16796875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.11332171130925417, "epoch": 0.7592920353982301, "frac_reward_zero_std": 0.0, "grad_norm": 0.5388262429331899, "learning_rate": 1e-06, "loss": -0.0284, "num_tokens": 213391980.0, "reward": 0.6363281011581421, "reward_std": 0.46267399191856384, "rewards/execution_accuracy_EX/mean": 0.6171875, "rewards/execution_accuracy_EX/std": 0.48702529072761536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9824634790420532, "sampling/importance_sampling_ratio/min": 0.008850703947246075, "sampling/sampling_logp_difference/max": 4.727258205413818, "sampling/sampling_logp_difference/mean": 0.12259416282176971, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1026.0, "completions/mean_length": 339.33984375, "completions/mean_terminated_length": 324.60784912109375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.10638696234673262, "epoch": 0.7610619469026548, "frac_reward_zero_std": 0.0, "grad_norm": 0.40723506148938576, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 214108387.0, "reward": 0.4839843809604645, "reward_std": 0.47435954213142395, "rewards/execution_accuracy_EX/mean": 0.45703125, "rewards/execution_accuracy_EX/std": 0.4991260766983032, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9761378169059753, "sampling/importance_sampling_ratio/min": 0.005292918533086777, "sampling/sampling_logp_difference/max": 5.241385459899902, "sampling/sampling_logp_difference/mean": 0.1323973536491394, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 323.34375, "completions/mean_terminated_length": 293.6377868652344, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.09915001504123211, "epoch": 0.7628318584070797, "frac_reward_zero_std": 0.0, "grad_norm": 0.2657950774754114, "learning_rate": 1e-06, "loss": -0.0054, "num_tokens": 214724827.0, "reward": 0.62109375, "reward_std": 0.466510146856308, "rewards/execution_accuracy_EX/mean": 0.6015625, "rewards/execution_accuracy_EX/std": 0.4905354380607605, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9778218269348145, "sampling/importance_sampling_ratio/min": 0.0023839690256863832, "sampling/sampling_logp_difference/max": 6.0389885902404785, "sampling/sampling_logp_difference/mean": 0.12448465079069138, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 253.078125, "completions/mean_terminated_length": 253.078125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.0758161349222064, "epoch": 0.7646017699115044, "frac_reward_zero_std": 0.0, "grad_norm": 0.36656361065128307, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 215296431.0, "reward": 0.743945300579071, "reward_std": 0.42235618829727173, "rewards/execution_accuracy_EX/mean": 0.73046875, "rewards/execution_accuracy_EX/std": 0.44458550214767456, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.996958613395691, "sampling/importance_sampling_ratio/mean": 0.9752987623214722, "sampling/importance_sampling_ratio/min": 0.008661850355565548, "sampling/sampling_logp_difference/max": 4.74882698059082, "sampling/sampling_logp_difference/mean": 0.11455568671226501, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 316.26953125, "completions/mean_terminated_length": 316.26953125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.08657894004136324, "epoch": 0.7663716814159292, "frac_reward_zero_std": 0.0, "grad_norm": 0.3329185795384755, "learning_rate": 1e-06, "loss": -0.0032, "num_tokens": 215715492.0, "reward": 0.7291015386581421, "reward_std": 0.4297545552253723, "rewards/execution_accuracy_EX/mean": 0.71484375, "rewards/execution_accuracy_EX/std": 0.4523732364177704, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9779499769210815, "sampling/importance_sampling_ratio/min": 0.002480123657733202, "sampling/sampling_logp_difference/max": 5.999446868896484, "sampling/sampling_logp_difference/mean": 0.11364603787660599, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 298.28515625, "completions/mean_terminated_length": 298.28515625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.08486288227140903, "epoch": 0.768141592920354, "frac_reward_zero_std": 0.0, "grad_norm": 0.37818714546091925, "learning_rate": 1e-06, "loss": 0.0024, "num_tokens": 216232189.0, "reward": 0.6845703125, "reward_std": 0.44827139377593994, "rewards/execution_accuracy_EX/mean": 0.66796875, "rewards/execution_accuracy_EX/std": 0.4718646705150604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9771759510040283, "sampling/importance_sampling_ratio/min": 0.005371001549065113, "sampling/sampling_logp_difference/max": 5.226740837097168, "sampling/sampling_logp_difference/mean": 0.11518388986587524, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1415.0, "completions/max_terminated_length": 1415.0, "completions/mean_length": 314.4921875, "completions/mean_terminated_length": 314.4921875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.08155077509582043, "epoch": 0.7699115044247787, "frac_reward_zero_std": 0.0, "grad_norm": 0.2974299365827861, "learning_rate": 1e-06, "loss": -0.012, "num_tokens": 216687803.0, "reward": 0.49531248211860657, "reward_std": 0.4749999940395355, "rewards/execution_accuracy_EX/mean": 0.46875, "rewards/execution_accuracy_EX/std": 0.5, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9413729906082153, "sampling/importance_sampling_ratio/mean": 0.9788044095039368, "sampling/importance_sampling_ratio/min": 0.0024801555555313826, "sampling/sampling_logp_difference/max": 5.999433994293213, "sampling/sampling_logp_difference/mean": 0.10834769159555435, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 269.78125, "completions/mean_terminated_length": 269.78125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.0770259564742446, "epoch": 0.7716814159292036, "frac_reward_zero_std": 0.0, "grad_norm": 0.3477427421826566, "learning_rate": 1e-06, "loss": 0.0024, "num_tokens": 217213539.0, "reward": 0.7179687023162842, "reward_std": 0.4348871409893036, "rewards/execution_accuracy_EX/mean": 0.703125, "rewards/execution_accuracy_EX/std": 0.45777595043182373, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9755126237869263, "sampling/importance_sampling_ratio/min": 0.005266683176159859, "sampling/sampling_logp_difference/max": 5.246354579925537, "sampling/sampling_logp_difference/mean": 0.11614066362380981, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 309.98828125, "completions/mean_terminated_length": 309.98828125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.08747177477926016, "epoch": 0.7734513274336283, "frac_reward_zero_std": 0.0, "grad_norm": 0.23042955091273926, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 217712144.0, "reward": 0.5992187261581421, "reward_std": 0.47008487582206726, "rewards/execution_accuracy_EX/mean": 0.578125, "rewards/execution_accuracy_EX/std": 0.49482619762420654, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9772858619689941, "sampling/importance_sampling_ratio/min": 0.006758376490324736, "sampling/sampling_logp_difference/max": 4.996972560882568, "sampling/sampling_logp_difference/mean": 0.11812114715576172, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1162.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 336.046875, "completions/mean_terminated_length": 336.046875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.095473762601614, "epoch": 0.7752212389380531, "frac_reward_zero_std": 0.0, "grad_norm": 0.10586711541233572, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 218379916.0, "reward": 0.517578125, "reward_std": 0.47587236762046814, "rewards/execution_accuracy_EX/mean": 0.4921875, "rewards/execution_accuracy_EX/std": 0.5009182691574097, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9747204780578613, "sampling/importance_sampling_ratio/min": 0.004099207930266857, "sampling/sampling_logp_difference/max": 5.49696159362793, "sampling/sampling_logp_difference/mean": 0.1270812749862671, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1652.0, "completions/max_terminated_length": 1652.0, "completions/mean_length": 277.58984375, "completions/mean_terminated_length": 277.58984375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.07738334592431784, "epoch": 0.7769911504424779, "frac_reward_zero_std": 0.0, "grad_norm": 0.3806241018322237, "learning_rate": 1e-06, "loss": -0.0345, "num_tokens": 219015459.0, "reward": 0.47675782442092896, "reward_std": 0.47346949577331543, "rewards/execution_accuracy_EX/mean": 0.44921875, "rewards/execution_accuracy_EX/std": 0.49838894605636597, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9766601920127869, "sampling/importance_sampling_ratio/min": 0.008679235354065895, "sampling/sampling_logp_difference/max": 4.746821880340576, "sampling/sampling_logp_difference/mean": 0.11376699805259705, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1640.0, "completions/max_terminated_length": 1640.0, "completions/mean_length": 348.04296875, "completions/mean_terminated_length": 348.04296875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.09423344023525715, "epoch": 0.7787610619469026, "frac_reward_zero_std": 0.0, "grad_norm": 0.4084732418284597, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 219464046.0, "reward": 0.6437499523162842, "reward_std": 0.46081769466400146, "rewards/execution_accuracy_EX/mean": 0.625, "rewards/execution_accuracy_EX/std": 0.4850712716579437, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8886933326721191, "sampling/importance_sampling_ratio/mean": 0.9797013998031616, "sampling/importance_sampling_ratio/min": 0.0052512455731630325, "sampling/sampling_logp_difference/max": 5.2492899894714355, "sampling/sampling_logp_difference/mean": 0.11707621067762375, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1454.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 330.74609375, "completions/mean_terminated_length": 330.74609375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.0814787931740284, "epoch": 0.7805309734513274, "frac_reward_zero_std": 0.0, "grad_norm": 0.4846264441212243, "learning_rate": 1e-06, "loss": -0.0116, "num_tokens": 219979437.0, "reward": 0.614062488079071, "reward_std": 0.46748965978622437, "rewards/execution_accuracy_EX/mean": 0.59375, "rewards/execution_accuracy_EX/std": 0.49209436774253845, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9803451299667358, "sampling/importance_sampling_ratio/min": 0.0007139010122045875, "sampling/sampling_logp_difference/max": 7.2447662353515625, "sampling/sampling_logp_difference/mean": 0.10679909586906433, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1268.0, "completions/mean_length": 318.1328125, "completions/mean_terminated_length": 303.3176574707031, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.09138348698616028, "epoch": 0.7823008849557522, "frac_reward_zero_std": 0.0, "grad_norm": 0.0035199025850439912, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 220368959.0, "reward": 0.5248047113418579, "reward_std": 0.47613638639450073, "rewards/execution_accuracy_EX/mean": 0.5, "rewards/execution_accuracy_EX/std": 0.5009794235229492, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9822543859481812, "sampling/importance_sampling_ratio/min": 0.006748310290277004, "sampling/sampling_logp_difference/max": 4.998463153839111, "sampling/sampling_logp_difference/mean": 0.11548037827014923, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1705.0, "completions/mean_length": 375.08984375, "completions/mean_terminated_length": 360.498046875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.09969014022499323, "epoch": 0.784070796460177, "frac_reward_zero_std": 0.0, "grad_norm": 0.7800076888829647, "learning_rate": 1e-06, "loss": -0.0169, "num_tokens": 220829350.0, "reward": 0.7660156488418579, "reward_std": 0.4103529453277588, "rewards/execution_accuracy_EX/mean": 0.75390625, "rewards/execution_accuracy_EX/std": 0.43157756328582764, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9796102046966553, "sampling/importance_sampling_ratio/min": 0.004096942488104105, "sampling/sampling_logp_difference/max": 5.497514247894287, "sampling/sampling_logp_difference/mean": 0.1221030205488205, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1707.0, "completions/max_terminated_length": 1707.0, "completions/mean_length": 396.296875, "completions/mean_terminated_length": 396.296875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.10772426798939705, "epoch": 0.7858407079646018, "frac_reward_zero_std": 0.0, "grad_norm": 0.2087415195653686, "learning_rate": 1e-06, "loss": -0.0113, "num_tokens": 221508242.0, "reward": 0.47675779461860657, "reward_std": 0.47346949577331543, "rewards/execution_accuracy_EX/mean": 0.44921875, "rewards/execution_accuracy_EX/std": 0.49838894605636597, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9800965785980225, "sampling/importance_sampling_ratio/min": 0.005525479093194008, "sampling/sampling_logp_difference/max": 5.198385238647461, "sampling/sampling_logp_difference/mean": 0.12877586483955383, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1450.0, "completions/max_terminated_length": 1450.0, "completions/mean_length": 310.40234375, "completions/mean_terminated_length": 310.40234375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.09367150627076626, "epoch": 0.7876106194690266, "frac_reward_zero_std": 0.0, "grad_norm": 0.34971138259769186, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 222059529.0, "reward": 0.595507800579071, "reward_std": 0.47065800428390503, "rewards/execution_accuracy_EX/mean": 0.57421875, "rewards/execution_accuracy_EX/std": 0.49542948603630066, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9749582409858704, "sampling/importance_sampling_ratio/min": 0.006767093203961849, "sampling/sampling_logp_difference/max": 4.995683670043945, "sampling/sampling_logp_difference/mean": 0.12886998057365417, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1591.0, "completions/max_terminated_length": 1591.0, "completions/mean_length": 327.578125, "completions/mean_terminated_length": 327.578125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.07842460833489895, "epoch": 0.7893805309734513, "frac_reward_zero_std": 0.0, "grad_norm": 0.2822770722979932, "learning_rate": 1e-06, "loss": 0.0235, "num_tokens": 222598349.0, "reward": 0.614062488079071, "reward_std": 0.46748965978622437, "rewards/execution_accuracy_EX/mean": 0.59375, "rewards/execution_accuracy_EX/std": 0.49209436774253845, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8763935565948486, "sampling/importance_sampling_ratio/mean": 0.981664776802063, "sampling/importance_sampling_ratio/min": 0.00708415312692523, "sampling/sampling_logp_difference/max": 4.949894905090332, "sampling/sampling_logp_difference/mean": 0.10455193370580673, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1348.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 317.0703125, "completions/mean_terminated_length": 317.0703125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.09262909553945065, "epoch": 0.7911504424778761, "frac_reward_zero_std": 0.0, "grad_norm": 0.3778221159657832, "learning_rate": 1e-06, "loss": 0.008, "num_tokens": 222976927.0, "reward": 0.6548827886581421, "reward_std": 0.45779263973236084, "rewards/execution_accuracy_EX/mean": 0.63671875, "rewards/execution_accuracy_EX/std": 0.48188701272010803, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9781402349472046, "sampling/importance_sampling_ratio/mean": 0.9780029654502869, "sampling/importance_sampling_ratio/min": 0.00409080320969224, "sampling/sampling_logp_difference/max": 5.499013900756836, "sampling/sampling_logp_difference/mean": 0.12452073395252228, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1167.0, "completions/max_terminated_length": 1167.0, "completions/mean_length": 330.015625, "completions/mean_terminated_length": 330.015625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.0872789965942502, "epoch": 0.7929203539823009, "frac_reward_zero_std": 0.0, "grad_norm": 0.3657236297441997, "learning_rate": 1e-06, "loss": 0.0138, "num_tokens": 223373923.0, "reward": 0.5658202767372131, "reward_std": 0.4741697609424591, "rewards/execution_accuracy_EX/mean": 0.54296875, "rewards/execution_accuracy_EX/std": 0.4991260766983032, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9787552356719971, "sampling/importance_sampling_ratio/min": 0.006754662375897169, "sampling/sampling_logp_difference/max": 4.997522354125977, "sampling/sampling_logp_difference/mean": 0.11839602887630463, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1659.0, "completions/max_terminated_length": 1659.0, "completions/mean_length": 321.67578125, "completions/mean_terminated_length": 321.67578125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.08092036191374063, "epoch": 0.7946902654867256, "frac_reward_zero_std": 0.0, "grad_norm": 0.2651752619147164, "learning_rate": 1e-06, "loss": -0.0088, "num_tokens": 223736000.0, "reward": 0.5806640386581421, "reward_std": 0.4726512134075165, "rewards/execution_accuracy_EX/mean": 0.55859375, "rewards/execution_accuracy_EX/std": 0.4975275993347168, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9768083095550537, "sampling/importance_sampling_ratio/min": 0.005254730116575956, "sampling/sampling_logp_difference/max": 5.248626708984375, "sampling/sampling_logp_difference/mean": 0.11782976984977722, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1105.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 306.34765625, "completions/mean_terminated_length": 306.34765625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.07939030043780804, "epoch": 0.7964601769911505, "frac_reward_zero_std": 0.0, "grad_norm": 0.31977415414022614, "learning_rate": 1e-06, "loss": -0.01, "num_tokens": 224237881.0, "reward": 0.6957031488418579, "reward_std": 0.44413506984710693, "rewards/execution_accuracy_EX/mean": 0.6796875, "rewards/execution_accuracy_EX/std": 0.4675106406211853, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9801115989685059, "sampling/importance_sampling_ratio/min": 0.005262570921331644, "sampling/sampling_logp_difference/max": 5.247135639190674, "sampling/sampling_logp_difference/mean": 0.10750046372413635, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1044.0, "completions/max_terminated_length": 1044.0, "completions/mean_length": 316.109375, "completions/mean_terminated_length": 316.109375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.09447643160820007, "epoch": 0.7982300884955752, "frac_reward_zero_std": 0.0, "grad_norm": 0.2655966100963604, "learning_rate": 1e-06, "loss": -0.0032, "num_tokens": 224989829.0, "reward": 0.8701171875, "reward_std": 0.32701200246810913, "rewards/execution_accuracy_EX/mean": 0.86328125, "rewards/execution_accuracy_EX/std": 0.34422317147254944, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9353786706924438, "sampling/importance_sampling_ratio/mean": 0.9811705350875854, "sampling/importance_sampling_ratio/min": 0.004089032299816608, "sampling/sampling_logp_difference/max": 5.499446868896484, "sampling/sampling_logp_difference/mean": 0.11678052693605423, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2181.0, "completions/mean_length": 414.13671875, "completions/mean_terminated_length": 399.69805908203125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.10684379749000072, "epoch": 0.8, "frac_reward_zero_std": 0.0, "grad_norm": 0.3080515698834411, "learning_rate": 1e-06, "loss": 0.0256, "num_tokens": 225598360.0, "reward": 0.732617199420929, "reward_std": 0.4282895624637604, "rewards/execution_accuracy_EX/mean": 0.71875, "rewards/execution_accuracy_EX/std": 0.45048993825912476, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 1.997812032699585, "sampling/importance_sampling_ratio/mean": 0.9780542850494385, "sampling/importance_sampling_ratio/min": 0.005716521292924881, "sampling/sampling_logp_difference/max": 5.164394855499268, "sampling/sampling_logp_difference/mean": 0.13397234678268433, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 274.3828125, "completions/mean_terminated_length": 274.3828125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.07503345794975758, "epoch": 0.8017699115044248, "frac_reward_zero_std": 0.0, "grad_norm": 0.32222800405943564, "learning_rate": 1e-06, "loss": 0.0038, "num_tokens": 226191178.0, "reward": 0.517578125, "reward_std": 0.47587236762046814, "rewards/execution_accuracy_EX/mean": 0.4921875, "rewards/execution_accuracy_EX/std": 0.5009182691574097, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9768715500831604, "sampling/importance_sampling_ratio/min": 0.001507165958173573, "sampling/sampling_logp_difference/max": 6.497524261474609, "sampling/sampling_logp_difference/mean": 0.11086973547935486, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1686.0, "completions/max_terminated_length": 1686.0, "completions/mean_length": 346.09375, "completions/mean_terminated_length": 346.09375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.08223924785852432, "epoch": 0.8035398230088495, "frac_reward_zero_std": 0.0, "grad_norm": 0.26762464550604864, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 226621234.0, "reward": 0.5658203363418579, "reward_std": 0.4741697609424591, "rewards/execution_accuracy_EX/mean": 0.54296875, "rewards/execution_accuracy_EX/std": 0.4991260766983032, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9794480204582214, "sampling/importance_sampling_ratio/min": 0.0051038190722465515, "sampling/sampling_logp_difference/max": 5.277766227722168, "sampling/sampling_logp_difference/mean": 0.10982825607061386, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1222.0, "completions/max_terminated_length": 1222.0, "completions/mean_length": 336.48046875, "completions/mean_terminated_length": 336.48046875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.08400444034487009, "epoch": 0.8053097345132744, "frac_reward_zero_std": 0.0, "grad_norm": 0.4363384259899209, "learning_rate": 1e-06, "loss": 0.0042, "num_tokens": 227079309.0, "reward": 0.6103515625, "reward_std": 0.46818408370018005, "rewards/execution_accuracy_EX/mean": 0.58984375, "rewards/execution_accuracy_EX/std": 0.49282538890838623, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9790494441986084, "sampling/importance_sampling_ratio/min": 0.004761462565511465, "sampling/sampling_logp_difference/max": 5.347200393676758, "sampling/sampling_logp_difference/mean": 0.11057664453983307, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1864.0, "completions/max_terminated_length": 1864.0, "completions/mean_length": 291.53515625, "completions/mean_terminated_length": 291.53515625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.07178681809455156, "epoch": 0.8070796460176991, "frac_reward_zero_std": 0.0, "grad_norm": 0.3319572345197162, "learning_rate": 1e-06, "loss": 0.0106, "num_tokens": 227467366.0, "reward": 0.5806640386581421, "reward_std": 0.4726512134075165, "rewards/execution_accuracy_EX/mean": 0.55859375, "rewards/execution_accuracy_EX/std": 0.4975275993347168, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9800399541854858, "sampling/importance_sampling_ratio/min": 0.0056649609468877316, "sampling/sampling_logp_difference/max": 5.173455238342285, "sampling/sampling_logp_difference/mean": 0.09974545240402222, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1124.0, "completions/mean_length": 318.5390625, "completions/mean_terminated_length": 303.7254943847656, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.09256428107619286, "epoch": 0.8088495575221238, "frac_reward_zero_std": 0.0, "grad_norm": 0.30369054606532936, "learning_rate": 1e-06, "loss": 0.0384, "num_tokens": 228061648.0, "reward": 0.6175780892372131, "reward_std": 0.4670134484767914, "rewards/execution_accuracy_EX/mean": 0.59765625, "rewards/execution_accuracy_EX/std": 0.4913311004638672, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9786026477813721, "sampling/importance_sampling_ratio/min": 0.001931285485625267, "sampling/sampling_logp_difference/max": 6.249569416046143, "sampling/sampling_logp_difference/mean": 0.11863424628973007, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1378.0, "completions/mean_length": 316.33203125, "completions/mean_terminated_length": 301.50982666015625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.094835733063519, "epoch": 0.8106194690265487, "frac_reward_zero_std": 0.0, "grad_norm": 0.5356222093813154, "learning_rate": 1e-06, "loss": 0.0474, "num_tokens": 228590741.0, "reward": 0.7771484851837158, "reward_std": 0.4035811126232147, "rewards/execution_accuracy_EX/mean": 0.765625, "rewards/execution_accuracy_EX/std": 0.42443734407424927, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9795012474060059, "sampling/importance_sampling_ratio/min": 0.006784177850931883, "sampling/sampling_logp_difference/max": 4.993162155151367, "sampling/sampling_logp_difference/mean": 0.12180796265602112, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1197.0, "completions/max_terminated_length": 1197.0, "completions/mean_length": 296.68359375, "completions/mean_terminated_length": 296.68359375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.0966231282800436, "epoch": 0.8123893805309734, "frac_reward_zero_std": 0.0, "grad_norm": 0.3685554782293145, "learning_rate": 1e-06, "loss": -0.0019, "num_tokens": 228984324.0, "reward": 0.632617175579071, "reward_std": 0.46355465054512024, "rewards/execution_accuracy_EX/mean": 0.61328125, "rewards/execution_accuracy_EX/std": 0.4879522919654846, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9779250621795654, "sampling/importance_sampling_ratio/min": 0.004093545023351908, "sampling/sampling_logp_difference/max": 5.4983439445495605, "sampling/sampling_logp_difference/mean": 0.12139870226383209, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 258.25390625, "completions/mean_terminated_length": 258.25390625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.07291886862367392, "epoch": 0.8141592920353983, "frac_reward_zero_std": 0.0, "grad_norm": 0.5167090050362314, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 229537205.0, "reward": 0.847851574420929, "reward_std": 0.3490958511829376, "rewards/execution_accuracy_EX/mean": 0.83984375, "rewards/execution_accuracy_EX/std": 0.36746934056282043, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9779763221740723, "sampling/importance_sampling_ratio/min": 0.005634597036987543, "sampling/sampling_logp_difference/max": 5.178829669952393, "sampling/sampling_logp_difference/mean": 0.10613366961479187, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1291.0, "completions/max_terminated_length": 1291.0, "completions/mean_length": 344.578125, "completions/mean_terminated_length": 344.578125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.10439775697886944, "epoch": 0.815929203539823, "frac_reward_zero_std": 0.0, "grad_norm": 0.3876629389144973, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 230155113.0, "reward": 0.6029297113418579, "reward_std": 0.46948155760765076, "rewards/execution_accuracy_EX/mean": 0.58203125, "rewards/execution_accuracy_EX/std": 0.49419113993644714, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8959710597991943, "sampling/importance_sampling_ratio/mean": 0.9778479337692261, "sampling/importance_sampling_ratio/min": 0.004101135302335024, "sampling/sampling_logp_difference/max": 5.496491432189941, "sampling/sampling_logp_difference/mean": 0.12573343515396118, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 242.7265625, "completions/mean_terminated_length": 242.7265625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.0799469705671072, "epoch": 0.8176991150442477, "frac_reward_zero_std": 0.0, "grad_norm": 0.2972234810588866, "learning_rate": 1e-06, "loss": 0.0057, "num_tokens": 230501267.0, "reward": 0.7216796875, "reward_std": 0.4332149624824524, "rewards/execution_accuracy_EX/mean": 0.70703125, "rewards/execution_accuracy_EX/std": 0.45601576566696167, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7134068012237549, "sampling/importance_sampling_ratio/mean": 0.9784244298934937, "sampling/importance_sampling_ratio/min": 0.008661828935146332, "sampling/sampling_logp_difference/max": 4.748829364776611, "sampling/sampling_logp_difference/mean": 0.11004751920700073, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2012.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 308.67578125, "completions/mean_terminated_length": 308.67578125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.09601080138236284, "epoch": 0.8194690265486726, "frac_reward_zero_std": 0.0, "grad_norm": 0.4236245912645116, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 230888512.0, "reward": 0.588085949420929, "reward_std": 0.4717142581939697, "rewards/execution_accuracy_EX/mean": 0.56640625, "rewards/execution_accuracy_EX/std": 0.4965413510799408, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9766750335693359, "sampling/importance_sampling_ratio/min": 0.006748078390955925, "sampling/sampling_logp_difference/max": 4.998497486114502, "sampling/sampling_logp_difference/mean": 0.12495654821395874, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1173.0, "completions/max_terminated_length": 1173.0, "completions/mean_length": 301.8671875, "completions/mean_terminated_length": 301.8671875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.09849263541400433, "epoch": 0.8212389380530973, "frac_reward_zero_std": 0.0, "grad_norm": 0.270938897089436, "learning_rate": 1e-06, "loss": 0.0041, "num_tokens": 231461182.0, "reward": 0.43964841961860657, "reward_std": 0.46818408370018005, "rewards/execution_accuracy_EX/mean": 0.41015625, "rewards/execution_accuracy_EX/std": 0.49282538890838623, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.976449728012085, "sampling/importance_sampling_ratio/min": 0.00628508860245347, "sampling/sampling_logp_difference/max": 5.069575309753418, "sampling/sampling_logp_difference/mean": 0.12751615047454834, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1096.0, "completions/mean_length": 284.84375, "completions/mean_terminated_length": 269.8980407714844, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.08508272003382444, "epoch": 0.8230088495575221, "frac_reward_zero_std": 0.0, "grad_norm": 0.3992028430125624, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 231903430.0, "reward": 0.5804687738418579, "reward_std": 0.47288161516189575, "rewards/execution_accuracy_EX/mean": 0.55859375, "rewards/execution_accuracy_EX/std": 0.4975275993347168, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9783906936645508, "sampling/importance_sampling_ratio/min": 0.0052665723487734795, "sampling/sampling_logp_difference/max": 5.246375560760498, "sampling/sampling_logp_difference/mean": 0.11341390013694763, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3015.0, "completions/mean_length": 354.26171875, "completions/mean_terminated_length": 339.5882568359375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.1056946087628603, "epoch": 0.8247787610619469, "frac_reward_zero_std": 0.0, "grad_norm": 0.24208797497960982, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 232385337.0, "reward": 0.5804687738418579, "reward_std": 0.47248753905296326, "rewards/execution_accuracy_EX/mean": 0.55859375, "rewards/execution_accuracy_EX/std": 0.4975275993347168, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9797363877296448, "sampling/importance_sampling_ratio/min": 0.005264295265078545, "sampling/sampling_logp_difference/max": 5.246808052062988, "sampling/sampling_logp_difference/mean": 0.12469679117202759, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2396.0, "completions/mean_length": 387.91015625, "completions/mean_terminated_length": 329.0516052246094, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.09944021981209517, "epoch": 0.8265486725663717, "frac_reward_zero_std": 0.0, "grad_norm": 0.3333028002812477, "learning_rate": 1e-06, "loss": -0.0346, "num_tokens": 232984002.0, "reward": 0.5873047113418579, "reward_std": 0.472648948431015, "rewards/execution_accuracy_EX/mean": 0.56640625, "rewards/execution_accuracy_EX/std": 0.4965413510799408, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.981889009475708, "sampling/importance_sampling_ratio/min": 0.006765489932149649, "sampling/sampling_logp_difference/max": 4.995920658111572, "sampling/sampling_logp_difference/mean": 0.11872401833534241, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1549.0, "completions/mean_length": 310.57421875, "completions/mean_terminated_length": 295.72943115234375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.09511182922869921, "epoch": 0.8283185840707965, "frac_reward_zero_std": 0.0, "grad_norm": 0.177096113735158, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 233585941.0, "reward": 0.628710925579071, "reward_std": 0.4646587371826172, "rewards/execution_accuracy_EX/mean": 0.609375, "rewards/execution_accuracy_EX/std": 0.48884621262550354, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 1.8923084735870361, "sampling/importance_sampling_ratio/mean": 0.9799119830131531, "sampling/importance_sampling_ratio/min": 0.004091258160769939, "sampling/sampling_logp_difference/max": 5.498902797698975, "sampling/sampling_logp_difference/mean": 0.12098175287246704, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1151.0, "completions/max_terminated_length": 1151.0, "completions/mean_length": 295.96875, "completions/mean_terminated_length": 295.96875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.11154700815677643, "epoch": 0.8300884955752212, "frac_reward_zero_std": 0.0, "grad_norm": 0.34022177779675594, "learning_rate": 1e-06, "loss": -0.029, "num_tokens": 234076189.0, "reward": 0.591796875, "reward_std": 0.471201092004776, "rewards/execution_accuracy_EX/mean": 0.5703125, "rewards/execution_accuracy_EX/std": 0.4960011839866638, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9795331358909607, "sampling/importance_sampling_ratio/min": 0.008679230697453022, "sampling/sampling_logp_difference/max": 4.746822357177734, "sampling/sampling_logp_difference/mean": 0.12765681743621826, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 353.2421875, "completions/mean_terminated_length": 308.8616638183594, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.10618323087692261, "epoch": 0.831858407079646, "frac_reward_zero_std": 0.0, "grad_norm": 0.22207375763996906, "learning_rate": 1e-06, "loss": -0.0132, "num_tokens": 234673403.0, "reward": 0.602343738079071, "reward_std": 0.47020477056503296, "rewards/execution_accuracy_EX/mean": 0.58203125, "rewards/execution_accuracy_EX/std": 0.49419113993644714, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "sampling/importance_sampling_ratio/max": 1.9668482542037964, "sampling/importance_sampling_ratio/mean": 0.9778167009353638, "sampling/importance_sampling_ratio/min": 0.006744090002030134, "sampling/sampling_logp_difference/max": 4.999088764190674, "sampling/sampling_logp_difference/mean": 0.1282060444355011, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1409.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 274.1640625, "completions/mean_terminated_length": 274.1640625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.07836710195988417, "epoch": 0.8336283185840708, "frac_reward_zero_std": 0.0, "grad_norm": 0.39106157893749205, "learning_rate": 1e-06, "loss": -0.0073, "num_tokens": 235065909.0, "reward": 0.703125, "reward_std": 0.44119933247566223, "rewards/execution_accuracy_EX/mean": 0.6875, "rewards/execution_accuracy_EX/std": 0.4644203782081604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8677870035171509, "sampling/importance_sampling_ratio/mean": 0.9805489182472229, "sampling/importance_sampling_ratio/min": 0.0015083501348271966, "sampling/sampling_logp_difference/max": 6.496738910675049, "sampling/sampling_logp_difference/mean": 0.10855324566364288, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1193.0, "completions/max_terminated_length": 1193.0, "completions/mean_length": 304.578125, "completions/mean_terminated_length": 304.578125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.09662522282451391, "epoch": 0.8353982300884956, "frac_reward_zero_std": 0.0, "grad_norm": 0.49762010935734724, "learning_rate": 1e-06, "loss": -0.0123, "num_tokens": 235549065.0, "reward": 0.632617175579071, "reward_std": 0.46355465054512024, "rewards/execution_accuracy_EX/mean": 0.61328125, "rewards/execution_accuracy_EX/std": 0.4879522919654846, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9155534505844116, "sampling/importance_sampling_ratio/mean": 0.9793773889541626, "sampling/importance_sampling_ratio/min": 0.005264220293611288, "sampling/sampling_logp_difference/max": 5.246822357177734, "sampling/sampling_logp_difference/mean": 0.11838836967945099, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1203.0, "completions/max_terminated_length": 1203.0, "completions/mean_length": 361.87890625, "completions/mean_terminated_length": 361.87890625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.11292469780892134, "epoch": 0.8371681415929203, "frac_reward_zero_std": 0.0, "grad_norm": 0.2334895495758685, "learning_rate": 1e-06, "loss": 0.0139, "num_tokens": 235973434.0, "reward": 0.4878906011581421, "reward_std": 0.47447580099105835, "rewards/execution_accuracy_EX/mean": 0.4609375, "rewards/execution_accuracy_EX/std": 0.4994482398033142, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9886711835861206, "sampling/importance_sampling_ratio/mean": 0.9788157939910889, "sampling/importance_sampling_ratio/min": 0.006413101684302092, "sampling/sampling_logp_difference/max": 5.049412250518799, "sampling/sampling_logp_difference/mean": 0.12969273328781128, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1406.0, "completions/max_terminated_length": 1406.0, "completions/mean_length": 412.51171875, "completions/mean_terminated_length": 412.51171875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.13038655370473862, "epoch": 0.8389380530973451, "frac_reward_zero_std": 0.0, "grad_norm": 0.26600631244934575, "learning_rate": 1e-06, "loss": -0.0104, "num_tokens": 236574461.0, "reward": 0.5769531726837158, "reward_std": 0.47307515144348145, "rewards/execution_accuracy_EX/mean": 0.5546875, "rewards/execution_accuracy_EX/std": 0.49797385931015015, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9779107570648193, "sampling/importance_sampling_ratio/min": 0.005253662820905447, "sampling/sampling_logp_difference/max": 5.2488298416137695, "sampling/sampling_logp_difference/mean": 0.1439025104045868, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 269.2109375, "completions/mean_terminated_length": 269.2109375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.08845985028892756, "epoch": 0.8407079646017699, "frac_reward_zero_std": 0.0, "grad_norm": 0.4759736012801882, "learning_rate": 1e-06, "loss": 0.0073, "num_tokens": 237175155.0, "reward": 0.892382800579071, "reward_std": 0.3016792833805084, "rewards/execution_accuracy_EX/mean": 0.88671875, "rewards/execution_accuracy_EX/std": 0.31755712628364563, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9776943922042847, "sampling/importance_sampling_ratio/min": 0.007106812205165625, "sampling/sampling_logp_difference/max": 4.946701526641846, "sampling/sampling_logp_difference/mean": 0.11410893499851227, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1234.0, "completions/mean_length": 336.890625, "completions/mean_terminated_length": 322.1490478515625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.09549013525247574, "epoch": 0.8424778761061947, "frac_reward_zero_std": 0.0, "grad_norm": 0.451095405100909, "learning_rate": 1e-06, "loss": -0.0459, "num_tokens": 237681991.0, "reward": 0.7660155892372131, "reward_std": 0.4103529453277588, "rewards/execution_accuracy_EX/mean": 0.75390625, "rewards/execution_accuracy_EX/std": 0.43157756328582764, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9781780242919922, "sampling/importance_sampling_ratio/min": 0.0052643753588199615, "sampling/sampling_logp_difference/max": 5.246792793273926, "sampling/sampling_logp_difference/mean": 0.12181083858013153, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2138.0, "completions/max_terminated_length": 2138.0, "completions/mean_length": 419.80078125, "completions/mean_terminated_length": 419.80078125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.13889087829738855, "epoch": 0.8442477876106195, "frac_reward_zero_std": 0.0, "grad_norm": 0.2564480535930519, "learning_rate": 1e-06, "loss": -0.0039, "num_tokens": 238064516.0, "reward": 0.591796875, "reward_std": 0.471201092004776, "rewards/execution_accuracy_EX/mean": 0.5703125, "rewards/execution_accuracy_EX/std": 0.4960011839866638, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8539725542068481, "sampling/importance_sampling_ratio/mean": 0.980320930480957, "sampling/importance_sampling_ratio/min": 0.004103494342416525, "sampling/sampling_logp_difference/max": 5.495916366577148, "sampling/sampling_logp_difference/mean": 0.14981991052627563, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1503.0, "completions/max_terminated_length": 1503.0, "completions/mean_length": 357.73828125, "completions/mean_terminated_length": 357.73828125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.10023377742618322, "epoch": 0.8460176991150442, "frac_reward_zero_std": 0.0, "grad_norm": 0.13332488800234604, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 238531889.0, "reward": 0.5732421875, "reward_std": 0.47346949577331543, "rewards/execution_accuracy_EX/mean": 0.55078125, "rewards/execution_accuracy_EX/std": 0.49838894605636597, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9783233404159546, "sampling/importance_sampling_ratio/min": 0.00525366747751832, "sampling/sampling_logp_difference/max": 5.248828887939453, "sampling/sampling_logp_difference/mean": 0.12261553108692169, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1742.0, "completions/mean_length": 379.9453125, "completions/mean_terminated_length": 365.37255859375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.11600113194435835, "epoch": 0.8477876106194691, "frac_reward_zero_std": 0.0, "grad_norm": 0.29708866923388877, "learning_rate": 1e-06, "loss": -0.0216, "num_tokens": 239113379.0, "reward": 0.6509765386581421, "reward_std": 0.4591008126735687, "rewards/execution_accuracy_EX/mean": 0.6328125, "rewards/execution_accuracy_EX/std": 0.48298248648643494, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.980574369430542, "sampling/importance_sampling_ratio/min": 0.0031994825694710016, "sampling/sampling_logp_difference/max": 5.7447662353515625, "sampling/sampling_logp_difference/mean": 0.12897510826587677, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2453.0, "completions/max_terminated_length": 2453.0, "completions/mean_length": 373.86328125, "completions/mean_terminated_length": 373.86328125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.11521474458277225, "epoch": 0.8495575221238938, "frac_reward_zero_std": 0.0, "grad_norm": 0.1868225823936595, "learning_rate": 1e-06, "loss": -0.0118, "num_tokens": 239471104.0, "reward": 0.5361328125, "reward_std": 0.47579970955848694, "rewards/execution_accuracy_EX/mean": 0.51171875, "rewards/execution_accuracy_EX/std": 0.5008418560028076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9807099103927612, "sampling/importance_sampling_ratio/min": 0.011119124479591846, "sampling/sampling_logp_difference/max": 4.499088764190674, "sampling/sampling_logp_difference/mean": 0.1286713182926178, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1772.0, "completions/mean_length": 335.7265625, "completions/mean_terminated_length": 320.98040771484375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.0934757087379694, "epoch": 0.8513274336283185, "frac_reward_zero_std": 0.0, "grad_norm": 0.19902786838365327, "learning_rate": 1e-06, "loss": -0.0117, "num_tokens": 239957354.0, "reward": 0.6398437023162842, "reward_std": 0.4620228111743927, "rewards/execution_accuracy_EX/mean": 0.62109375, "rewards/execution_accuracy_EX/std": 0.4860650300979614, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9837499856948853, "sampling/importance_sampling_ratio/min": 0.0052512455731630325, "sampling/sampling_logp_difference/max": 5.2492899894714355, "sampling/sampling_logp_difference/mean": 0.10776247084140778, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2343.0, "completions/max_terminated_length": 2343.0, "completions/mean_length": 384.7265625, "completions/mean_terminated_length": 384.7265625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.1303556589409709, "epoch": 0.8530973451327434, "frac_reward_zero_std": 0.0, "grad_norm": 0.1347995517132836, "learning_rate": 1e-06, "loss": -0.0024, "num_tokens": 240401876.0, "reward": 0.5361328125, "reward_std": 0.47579970955848694, "rewards/execution_accuracy_EX/mean": 0.51171875, "rewards/execution_accuracy_EX/std": 0.5008418560028076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9799525737762451, "sampling/importance_sampling_ratio/min": 0.005257649812847376, "sampling/sampling_logp_difference/max": 5.248071193695068, "sampling/sampling_logp_difference/mean": 0.14246472716331482, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2144.0, "completions/max_terminated_length": 2144.0, "completions/mean_length": 434.94921875, "completions/mean_terminated_length": 434.94921875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.12181931920349598, "epoch": 0.8548672566371681, "frac_reward_zero_std": 0.0, "grad_norm": 0.2763309347628719, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 241004567.0, "reward": 0.5101562738418579, "reward_std": 0.47569799423217773, "rewards/execution_accuracy_EX/mean": 0.484375, "rewards/execution_accuracy_EX/std": 0.5007347464561462, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9833311438560486, "sampling/importance_sampling_ratio/min": 0.0006765146972611547, "sampling/sampling_logp_difference/max": 7.298556327819824, "sampling/sampling_logp_difference/mean": 0.1296975165605545, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1828.0, "completions/mean_length": 377.12890625, "completions/mean_terminated_length": 362.54510498046875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.0899679409340024, "epoch": 0.856637168141593, "frac_reward_zero_std": 0.0, "grad_norm": 0.20983163973775368, "learning_rate": 1e-06, "loss": -0.0165, "num_tokens": 241374248.0, "reward": 0.5990234613418579, "reward_std": 0.4703242778778076, "rewards/execution_accuracy_EX/mean": 0.578125, "rewards/execution_accuracy_EX/std": 0.49482619762420654, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9800814390182495, "sampling/importance_sampling_ratio/min": 0.008661837317049503, "sampling/sampling_logp_difference/max": 4.748828411102295, "sampling/sampling_logp_difference/mean": 0.11837954819202423, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1728.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 448.2109375, "completions/mean_terminated_length": 448.2109375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.11048067081719637, "epoch": 0.8584070796460177, "frac_reward_zero_std": 0.0, "grad_norm": 0.3145437375507862, "learning_rate": 1e-06, "loss": 0.0143, "num_tokens": 241922686.0, "reward": 0.5992187261581421, "reward_std": 0.47008487582206726, "rewards/execution_accuracy_EX/mean": 0.578125, "rewards/execution_accuracy_EX/std": 0.49482619762420654, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9808949828147888, "sampling/importance_sampling_ratio/min": 0.006867280695587397, "sampling/sampling_logp_difference/max": 4.980987071990967, "sampling/sampling_logp_difference/mean": 0.12507322430610657, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1378.0, "completions/max_terminated_length": 1378.0, "completions/mean_length": 329.79296875, "completions/mean_terminated_length": 329.79296875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.09246771596372128, "epoch": 0.8601769911504424, "frac_reward_zero_std": 0.0, "grad_norm": 0.13229801693114943, "learning_rate": 1e-06, "loss": 0.0053, "num_tokens": 242302729.0, "reward": 0.7587890625, "reward_std": 0.4142923355102539, "rewards/execution_accuracy_EX/mean": 0.74609375, "rewards/execution_accuracy_EX/std": 0.4360972046852112, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9906672239303589, "sampling/importance_sampling_ratio/mean": 0.9774895906448364, "sampling/importance_sampling_ratio/min": 0.005810766946524382, "sampling/sampling_logp_difference/max": 5.148042678833008, "sampling/sampling_logp_difference/mean": 0.11818039417266846, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2082.0, "completions/max_terminated_length": 2082.0, "completions/mean_length": 453.2421875, "completions/mean_terminated_length": 453.2421875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.11817977298051119, "epoch": 0.8619469026548673, "frac_reward_zero_std": 0.0, "grad_norm": 0.10729397252234574, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 242791847.0, "reward": 0.6548827886581421, "reward_std": 0.45779263973236084, "rewards/execution_accuracy_EX/mean": 0.63671875, "rewards/execution_accuracy_EX/std": 0.48188701272010803, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9795247316360474, "sampling/importance_sampling_ratio/min": 0.001930958591401577, "sampling/sampling_logp_difference/max": 6.249738693237305, "sampling/sampling_logp_difference/mean": 0.13629119098186493, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 1224.0, "completions/mean_length": 368.82421875, "completions/mean_terminated_length": 339.47637939453125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.08221569657325745, "epoch": 0.863716814159292, "frac_reward_zero_std": 0.0, "grad_norm": 0.2331255969849816, "learning_rate": 1e-06, "loss": -0.0151, "num_tokens": 243339690.0, "reward": 0.7435546517372131, "reward_std": 0.4230230450630188, "rewards/execution_accuracy_EX/mean": 0.73046875, "rewards/execution_accuracy_EX/std": 0.44458550214767456, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "sampling/importance_sampling_ratio/max": 1.854588270187378, "sampling/importance_sampling_ratio/mean": 0.9802987575531006, "sampling/importance_sampling_ratio/min": 0.005265557672828436, "sampling/sampling_logp_difference/max": 5.246568202972412, "sampling/sampling_logp_difference/mean": 0.10977109521627426, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2213.0, "completions/max_terminated_length": 2213.0, "completions/mean_length": 406.40234375, "completions/mean_terminated_length": 406.40234375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.09937719628214836, "epoch": 0.8654867256637168, "frac_reward_zero_std": 0.0, "grad_norm": 0.23779578331354234, "learning_rate": 1e-06, "loss": -0.0033, "num_tokens": 243947809.0, "reward": 0.6957031488418579, "reward_std": 0.44413506984710693, "rewards/execution_accuracy_EX/mean": 0.6796875, "rewards/execution_accuracy_EX/std": 0.4675106406211853, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9814603328704834, "sampling/importance_sampling_ratio/min": 0.0031865073833614588, "sampling/sampling_logp_difference/max": 5.7488298416137695, "sampling/sampling_logp_difference/mean": 0.11837709695100784, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1587.0, "completions/max_terminated_length": 1587.0, "completions/mean_length": 359.12109375, "completions/mean_terminated_length": 359.12109375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.08976826258003712, "epoch": 0.8672566371681416, "frac_reward_zero_std": 0.0, "grad_norm": 0.4324991634947946, "learning_rate": 1e-06, "loss": -0.0321, "num_tokens": 244433616.0, "reward": 0.7736327648162842, "reward_std": 0.40552324056625366, "rewards/execution_accuracy_EX/mean": 0.76171875, "rewards/execution_accuracy_EX/std": 0.4268665909767151, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9764032959938049, "sampling/importance_sampling_ratio/min": 0.0052512455731630325, "sampling/sampling_logp_difference/max": 5.2492899894714355, "sampling/sampling_logp_difference/mean": 0.11896872520446777, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2296.0, "completions/max_terminated_length": 2296.0, "completions/mean_length": 350.80859375, "completions/mean_terminated_length": 350.80859375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.08920551370829344, "epoch": 0.8690265486725663, "frac_reward_zero_std": 0.0, "grad_norm": 0.15734730094162222, "learning_rate": 1e-06, "loss": 0.0099, "num_tokens": 244870687.0, "reward": 0.576953113079071, "reward_std": 0.47307515144348145, "rewards/execution_accuracy_EX/mean": 0.5546875, "rewards/execution_accuracy_EX/std": 0.49797385931015015, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9843152165412903, "sampling/importance_sampling_ratio/min": 0.005254056304693222, "sampling/sampling_logp_difference/max": 5.248754978179932, "sampling/sampling_logp_difference/mean": 0.10570266097784042, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1604.0, "completions/max_terminated_length": 1604.0, "completions/mean_length": 403.25390625, "completions/mean_terminated_length": 403.25390625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.09854715596884489, "epoch": 0.8707964601769912, "frac_reward_zero_std": 0.0, "grad_norm": 0.15796668590192425, "learning_rate": 1e-06, "loss": -0.0038, "num_tokens": 245341056.0, "reward": 0.818164050579071, "reward_std": 0.3744697868824005, "rewards/execution_accuracy_EX/mean": 0.80859375, "rewards/execution_accuracy_EX/std": 0.39417871832847595, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9790687561035156, "sampling/importance_sampling_ratio/min": 0.0052512455731630325, "sampling/sampling_logp_difference/max": 5.2492899894714355, "sampling/sampling_logp_difference/mean": 0.12462789565324783, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1840.0, "completions/max_terminated_length": 1840.0, "completions/mean_length": 431.53125, "completions/mean_terminated_length": 431.53125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.11714785825461149, "epoch": 0.8725663716814159, "frac_reward_zero_std": 0.0, "grad_norm": 0.30365585894340785, "learning_rate": 1e-06, "loss": -0.0083, "num_tokens": 246064200.0, "reward": 0.539843738079071, "reward_std": 0.47569799423217773, "rewards/execution_accuracy_EX/mean": 0.515625, "rewards/execution_accuracy_EX/std": 0.5007347464561462, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9790074825286865, "sampling/importance_sampling_ratio/min": 0.004090499132871628, "sampling/sampling_logp_difference/max": 5.499088287353516, "sampling/sampling_logp_difference/mean": 0.13717404007911682, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1528.0, "completions/max_terminated_length": 1528.0, "completions/mean_length": 467.015625, "completions/mean_terminated_length": 467.015625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.11044671479612589, "epoch": 0.8743362831858407, "frac_reward_zero_std": 0.0, "grad_norm": 0.1579492562378529, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 246674044.0, "reward": 0.5287109613418579, "reward_std": 0.47591593861579895, "rewards/execution_accuracy_EX/mean": 0.50390625, "rewards/execution_accuracy_EX/std": 0.5009641647338867, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9786334037780762, "sampling/importance_sampling_ratio/min": 0.005253662820905447, "sampling/sampling_logp_difference/max": 5.2488298416137695, "sampling/sampling_logp_difference/mean": 0.13233034312725067, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3893.0, "completions/max_terminated_length": 3893.0, "completions/mean_length": 575.8984375, "completions/mean_terminated_length": 575.8984375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.11832599341869354, "epoch": 0.8761061946902655, "frac_reward_zero_std": 0.0, "grad_norm": 0.23791347944061128, "learning_rate": 1e-06, "loss": -0.0171, "num_tokens": 247231794.0, "reward": 0.6214843988418579, "reward_std": 0.46600863337516785, "rewards/execution_accuracy_EX/mean": 0.6015625, "rewards/execution_accuracy_EX/std": 0.4905354380607605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9844000339508057, "sampling/importance_sampling_ratio/min": 0.006744096055626869, "sampling/sampling_logp_difference/max": 4.999087810516357, "sampling/sampling_logp_difference/mean": 0.12517523765563965, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1979.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 431.796875, "completions/mean_terminated_length": 431.796875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.10006839781999588, "epoch": 0.8778761061946903, "frac_reward_zero_std": 0.0, "grad_norm": 0.33967296231070576, "learning_rate": 1e-06, "loss": 0.0326, "num_tokens": 247851294.0, "reward": 0.6177734136581421, "reward_std": 0.46676453948020935, "rewards/execution_accuracy_EX/mean": 0.59765625, "rewards/execution_accuracy_EX/std": 0.4913311004638672, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9763818979263306, "sampling/importance_sampling_ratio/min": 0.0020691403187811375, "sampling/sampling_logp_difference/max": 6.180622100830078, "sampling/sampling_logp_difference/mean": 0.13006316125392914, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1658.0, "completions/max_terminated_length": 1658.0, "completions/mean_length": 367.765625, "completions/mean_terminated_length": 367.765625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.0875820703804493, "epoch": 0.879646017699115, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 248338866.0, "reward": 0.5843749642372131, "reward_std": 0.47219762206077576, "rewards/execution_accuracy_EX/mean": 0.5625, "rewards/execution_accuracy_EX/std": 0.49705013632774353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.939441204071045, "sampling/importance_sampling_ratio/mean": 0.9807998538017273, "sampling/importance_sampling_ratio/min": 0.0031850412487983704, "sampling/sampling_logp_difference/max": 5.7492899894714355, "sampling/sampling_logp_difference/mean": 0.11449206620454788, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 417.390625, "completions/mean_terminated_length": 402.9647216796875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.09304992202669382, "epoch": 0.8814159292035398, "frac_reward_zero_std": 0.0, "grad_norm": 0.2856579719446031, "learning_rate": 1e-06, "loss": 0.0045, "num_tokens": 248861206.0, "reward": 0.6880859136581421, "reward_std": 0.4472186863422394, "rewards/execution_accuracy_EX/mean": 0.671875, "rewards/execution_accuracy_EX/std": 0.47045037150382996, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9826182723045349, "sampling/importance_sampling_ratio/min": 0.0025853586848825216, "sampling/sampling_logp_difference/max": 5.95789098739624, "sampling/sampling_logp_difference/mean": 0.11140456795692444, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3125.0, "completions/max_terminated_length": 3125.0, "completions/mean_length": 478.19140625, "completions/mean_terminated_length": 478.19140625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.1192424027249217, "epoch": 0.8831858407079646, "frac_reward_zero_std": 0.0, "grad_norm": 0.31405386678781605, "learning_rate": 1e-06, "loss": 0.0243, "num_tokens": 249331079.0, "reward": 0.6214843392372131, "reward_std": 0.46600863337516785, "rewards/execution_accuracy_EX/mean": 0.6015625, "rewards/execution_accuracy_EX/std": 0.4905354380607605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9545390605926514, "sampling/importance_sampling_ratio/mean": 0.9833201169967651, "sampling/importance_sampling_ratio/min": 0.004090553615242243, "sampling/sampling_logp_difference/max": 5.499074935913086, "sampling/sampling_logp_difference/mean": 0.1283523440361023, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3463.0, "completions/max_terminated_length": 3463.0, "completions/mean_length": 472.64453125, "completions/mean_terminated_length": 472.64453125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.11922439560294151, "epoch": 0.8849557522123894, "frac_reward_zero_std": 0.0, "grad_norm": 0.3128883702994043, "learning_rate": 1e-06, "loss": 0.0175, "num_tokens": 249848524.0, "reward": 0.5249999761581421, "reward_std": 0.47593042254447937, "rewards/execution_accuracy_EX/mean": 0.5, "rewards/execution_accuracy_EX/std": 0.5009794235229492, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9780963659286499, "sampling/importance_sampling_ratio/min": 0.002482479205355048, "sampling/sampling_logp_difference/max": 5.998497486114502, "sampling/sampling_logp_difference/mean": 0.13651864230632782, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3222.0, "completions/max_terminated_length": 3222.0, "completions/mean_length": 538.19140625, "completions/mean_terminated_length": 538.19140625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.1208310667425394, "epoch": 0.8867256637168142, "frac_reward_zero_std": 0.0, "grad_norm": 0.4005324520398836, "learning_rate": 1e-06, "loss": 0.0494, "num_tokens": 250441805.0, "reward": 0.781054675579071, "reward_std": 0.40085992217063904, "rewards/execution_accuracy_EX/mean": 0.76953125, "rewards/execution_accuracy_EX/std": 0.4219578504562378, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.980903148651123, "sampling/importance_sampling_ratio/min": 0.00409121485427022, "sampling/sampling_logp_difference/max": 5.498913288116455, "sampling/sampling_logp_difference/mean": 0.13536778092384338, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3866.0, "completions/mean_length": 592.1875, "completions/mean_terminated_length": 508.0960388183594, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.12260824535042048, "epoch": 0.8884955752212389, "frac_reward_zero_std": 0.0, "grad_norm": 0.22612955695692616, "learning_rate": 1e-06, "loss": 0.0121, "num_tokens": 251306525.0, "reward": 0.5906250476837158, "reward_std": 0.4726126492023468, "rewards/execution_accuracy_EX/mean": 0.5703125, "rewards/execution_accuracy_EX/std": 0.4960011839866638, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9827229380607605, "sampling/importance_sampling_ratio/min": 0.0031930049881339073, "sampling/sampling_logp_difference/max": 5.746792793273926, "sampling/sampling_logp_difference/mean": 0.13382920622825623, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2912.0, "completions/mean_length": 541.49609375, "completions/mean_terminated_length": 527.556884765625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.10825845878571272, "epoch": 0.8902654867256637, "frac_reward_zero_std": 0.0, "grad_norm": 0.3159933968187752, "learning_rate": 1e-06, "loss": 0.0274, "num_tokens": 251727196.0, "reward": 0.662109375, "reward_std": 0.45588722825050354, "rewards/execution_accuracy_EX/mean": 0.64453125, "rewards/execution_accuracy_EX/std": 0.4795927405357361, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9812757968902588, "sampling/importance_sampling_ratio/min": 0.005253662820905447, "sampling/sampling_logp_difference/max": 5.2488298416137695, "sampling/sampling_logp_difference/mean": 0.12318531423807144, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1989.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 486.94921875, "completions/mean_terminated_length": 486.94921875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.109563447535038, "epoch": 0.8920353982300885, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 252416847.0, "reward": 0.762499988079071, "reward_std": 0.41216787695884705, "rewards/execution_accuracy_EX/mean": 0.75, "rewards/execution_accuracy_EX/std": 0.4338609278202057, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9816642999649048, "sampling/importance_sampling_ratio/min": 0.00527913635596633, "sampling/sampling_logp_difference/max": 5.243992805480957, "sampling/sampling_logp_difference/mean": 0.12530678510665894, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2113.0, "completions/max_terminated_length": 2113.0, "completions/mean_length": 512.69140625, "completions/mean_terminated_length": 512.69140625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.1084189796820283, "epoch": 0.8938053097345132, "frac_reward_zero_std": 0.0, "grad_norm": 0.09095908302862653, "learning_rate": 1e-06, "loss": -0.0029, "num_tokens": 253025600.0, "reward": 0.6400390863418579, "reward_std": 0.4617617428302765, "rewards/execution_accuracy_EX/mean": 0.62109375, "rewards/execution_accuracy_EX/std": 0.4860650300979614, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9807025194168091, "sampling/importance_sampling_ratio/min": 0.004090502858161926, "sampling/sampling_logp_difference/max": 5.499087333679199, "sampling/sampling_logp_difference/mean": 0.12630710005760193, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1307.0, "completions/max_terminated_length": 1307.0, "completions/mean_length": 376.03125, "completions/mean_terminated_length": 376.03125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.09384919796139002, "epoch": 0.8955752212389381, "frac_reward_zero_std": 0.0, "grad_norm": 0.23427264693869862, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 253559592.0, "reward": 0.6177734136581421, "reward_std": 0.46676453948020935, "rewards/execution_accuracy_EX/mean": 0.59765625, "rewards/execution_accuracy_EX/std": 0.4913311004638672, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9788470268249512, "sampling/importance_sampling_ratio/min": 0.001591972541064024, "sampling/sampling_logp_difference/max": 6.442781448364258, "sampling/sampling_logp_difference/mean": 0.11854994297027588, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1295.0, "completions/max_terminated_length": 1295.0, "completions/mean_length": 388.46875, "completions/mean_terminated_length": 388.46875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.09089147672057152, "epoch": 0.8973451327433628, "frac_reward_zero_std": 0.0, "grad_norm": 0.5619963055559248, "learning_rate": 1e-06, "loss": 0.0169, "num_tokens": 254248544.0, "reward": 0.8441406488418579, "reward_std": 0.35250478982925415, "rewards/execution_accuracy_EX/mean": 0.8359375, "rewards/execution_accuracy_EX/std": 0.3710577189922333, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9788107872009277, "sampling/importance_sampling_ratio/min": 0.004090497270226479, "sampling/sampling_logp_difference/max": 5.499088764190674, "sampling/sampling_logp_difference/mean": 0.11855372041463852, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2497.0, "completions/max_terminated_length": 2497.0, "completions/mean_length": 489.6875, "completions/mean_terminated_length": 489.6875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.1044982336461544, "epoch": 0.8991150442477877, "frac_reward_zero_std": 0.0, "grad_norm": 0.2921129016530104, "learning_rate": 1e-06, "loss": -0.0345, "num_tokens": 254817392.0, "reward": 0.5880858898162842, "reward_std": 0.4717142879962921, "rewards/execution_accuracy_EX/mean": 0.56640625, "rewards/execution_accuracy_EX/std": 0.4965413510799408, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9816838502883911, "sampling/importance_sampling_ratio/min": 0.0040977573953568935, "sampling/sampling_logp_difference/max": 5.497315406799316, "sampling/sampling_logp_difference/mean": 0.12222027778625488, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2209.0, "completions/mean_length": 439.22265625, "completions/mean_terminated_length": 424.88238525390625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.09216785151511431, "epoch": 0.9008849557522124, "frac_reward_zero_std": 0.0, "grad_norm": 0.32409460565615733, "learning_rate": 1e-06, "loss": -0.0298, "num_tokens": 255554553.0, "reward": 0.8179687261581421, "reward_std": 0.3748847544193268, "rewards/execution_accuracy_EX/mean": 0.80859375, "rewards/execution_accuracy_EX/std": 0.39417871832847595, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9796115159988403, "sampling/importance_sampling_ratio/min": 0.0031879206653684378, "sampling/sampling_logp_difference/max": 5.748386383056641, "sampling/sampling_logp_difference/mean": 0.11551597714424133, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1539.0, "completions/max_terminated_length": 1539.0, "completions/mean_length": 416.89453125, "completions/mean_terminated_length": 416.89453125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.0894682826474309, "epoch": 0.9026548672566371, "frac_reward_zero_std": 0.0, "grad_norm": 0.267773109103576, "learning_rate": 1e-06, "loss": -0.0114, "num_tokens": 256062830.0, "reward": 0.699414074420929, "reward_std": 0.44268524646759033, "rewards/execution_accuracy_EX/mean": 0.68359375, "rewards/execution_accuracy_EX/std": 0.4659844934940338, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9791895151138306, "sampling/importance_sampling_ratio/min": 0.004094167612493038, "sampling/sampling_logp_difference/max": 5.498191833496094, "sampling/sampling_logp_difference/mean": 0.1187569797039032, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 546.890625, "completions/mean_terminated_length": 532.9725952148438, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.111790481954813, "epoch": 0.904424778761062, "frac_reward_zero_std": 0.0, "grad_norm": 0.1727699268771358, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 256732882.0, "reward": 0.810546875, "reward_std": 0.3805904686450958, "rewards/execution_accuracy_EX/mean": 0.80078125, "rewards/execution_accuracy_EX/std": 0.40019527077674866, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9822320938110352, "sampling/importance_sampling_ratio/min": 0.0033309035934507847, "sampling/sampling_logp_difference/max": 5.704511642456055, "sampling/sampling_logp_difference/mean": 0.1255459487438202, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2446.0, "completions/max_terminated_length": 2446.0, "completions/mean_length": 473.73046875, "completions/mean_terminated_length": 473.73046875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.094749940559268, "epoch": 0.9061946902654867, "frac_reward_zero_std": 0.0, "grad_norm": 0.49153464022870086, "learning_rate": 1e-06, "loss": 0.0109, "num_tokens": 257315437.0, "reward": 0.8923828601837158, "reward_std": 0.30167925357818604, "rewards/execution_accuracy_EX/mean": 0.88671875, "rewards/execution_accuracy_EX/std": 0.31755712628364563, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9789038896560669, "sampling/importance_sampling_ratio/min": 0.00409361720085144, "sampling/sampling_logp_difference/max": 5.498326301574707, "sampling/sampling_logp_difference/mean": 0.12143756449222565, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1649.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 408.375, "completions/mean_terminated_length": 408.375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.08351874630898237, "epoch": 0.9079646017699115, "frac_reward_zero_std": 0.0, "grad_norm": 0.4131325644119373, "learning_rate": 1e-06, "loss": -0.0162, "num_tokens": 257769533.0, "reward": 0.8404296636581421, "reward_std": 0.3558422923088074, "rewards/execution_accuracy_EX/mean": 0.83203125, "rewards/execution_accuracy_EX/std": 0.3745708465576172, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9636545181274414, "sampling/importance_sampling_ratio/mean": 0.9805830717086792, "sampling/importance_sampling_ratio/min": 0.0052512455731630325, "sampling/sampling_logp_difference/max": 5.2492899894714355, "sampling/sampling_logp_difference/mean": 0.10943485051393509, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1727.0, "completions/max_terminated_length": 1727.0, "completions/mean_length": 404.546875, "completions/mean_terminated_length": 404.546875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.08706046268343925, "epoch": 0.9097345132743363, "frac_reward_zero_std": 0.0, "grad_norm": 0.17693289212742777, "learning_rate": 1e-06, "loss": -0.0084, "num_tokens": 258189177.0, "reward": 0.7699218988418579, "reward_std": 0.40778404474258423, "rewards/execution_accuracy_EX/mean": 0.7578125, "rewards/execution_accuracy_EX/std": 0.4292463958263397, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9794632196426392, "sampling/importance_sampling_ratio/min": 0.004096901509910822, "sampling/sampling_logp_difference/max": 5.497524261474609, "sampling/sampling_logp_difference/mean": 0.1164979562163353, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2443.0, "completions/max_terminated_length": 2443.0, "completions/mean_length": 425.27734375, "completions/mean_terminated_length": 425.27734375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.08132558595389128, "epoch": 0.911504424778761, "frac_reward_zero_std": 0.0, "grad_norm": 0.32445579593943996, "learning_rate": 1e-06, "loss": -0.0264, "num_tokens": 258727360.0, "reward": 0.38398435711860657, "reward_std": 0.4544737637042999, "rewards/execution_accuracy_EX/mean": 0.3515625, "rewards/execution_accuracy_EX/std": 0.47839346528053284, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9808209538459778, "sampling/importance_sampling_ratio/min": 0.002113393973559141, "sampling/sampling_logp_difference/max": 6.159460067749023, "sampling/sampling_logp_difference/mean": 0.10712788999080658, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3404.0, "completions/max_terminated_length": 3404.0, "completions/mean_length": 517.51171875, "completions/mean_terminated_length": 517.51171875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.09406900592148304, "epoch": 0.9132743362831859, "frac_reward_zero_std": 0.0, "grad_norm": 0.17404203103502056, "learning_rate": 1e-06, "loss": 0.0117, "num_tokens": 259274915.0, "reward": 0.703125, "reward_std": 0.44119933247566223, "rewards/execution_accuracy_EX/mean": 0.6875, "rewards/execution_accuracy_EX/std": 0.4644203782081604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9837716221809387, "sampling/importance_sampling_ratio/min": 0.003188925562426448, "sampling/sampling_logp_difference/max": 5.748071193695068, "sampling/sampling_logp_difference/mean": 0.10998736321926117, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2474.0, "completions/mean_length": 600.546875, "completions/mean_terminated_length": 586.8392333984375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.11646796762943268, "epoch": 0.9150442477876106, "frac_reward_zero_std": 0.0, "grad_norm": 0.15815404113748285, "learning_rate": 1e-06, "loss": -0.0135, "num_tokens": 260107151.0, "reward": 0.5804687738418579, "reward_std": 0.47288164496421814, "rewards/execution_accuracy_EX/mean": 0.55859375, "rewards/execution_accuracy_EX/std": 0.4975275993347168, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9820814728736877, "sampling/importance_sampling_ratio/min": 0.0005535886157304049, "sampling/sampling_logp_difference/max": 7.499088764190674, "sampling/sampling_logp_difference/mean": 0.1287885457277298, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1655.0, "completions/max_terminated_length": 1655.0, "completions/mean_length": 444.37890625, "completions/mean_terminated_length": 444.37890625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.08392216078937054, "epoch": 0.9168141592920354, "frac_reward_zero_std": 0.0, "grad_norm": 0.2918197477798191, "learning_rate": 1e-06, "loss": 0.0296, "num_tokens": 260586608.0, "reward": 0.6251952648162842, "reward_std": 0.4652217924594879, "rewards/execution_accuracy_EX/mean": 0.60546875, "rewards/execution_accuracy_EX/std": 0.48970720171928406, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9797810316085815, "sampling/importance_sampling_ratio/min": 0.004092922434210777, "sampling/sampling_logp_difference/max": 5.498496055603027, "sampling/sampling_logp_difference/mean": 0.11067145317792892, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3870.0, "completions/mean_length": 763.07421875, "completions/mean_terminated_length": 710.170654296875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.11816421616822481, "epoch": 0.9185840707964602, "frac_reward_zero_std": 0.0, "grad_norm": 0.14280175240062037, "learning_rate": 1e-06, "loss": -0.0063, "num_tokens": 261095315.0, "reward": 0.598437488079071, "reward_std": 0.4710412621498108, "rewards/execution_accuracy_EX/mean": 0.578125, "rewards/execution_accuracy_EX/std": 0.49482619762420654, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9847060441970825, "sampling/importance_sampling_ratio/min": 0.005259410012513399, "sampling/sampling_logp_difference/max": 5.24773645401001, "sampling/sampling_logp_difference/mean": 0.1283852905035019, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1585.0, "completions/max_terminated_length": 1585.0, "completions/mean_length": 456.22265625, "completions/mean_terminated_length": 456.22265625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.0838457578793168, "epoch": 0.9203539823008849, "frac_reward_zero_std": 0.0, "grad_norm": 0.18157275520876287, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 261569372.0, "reward": 0.688281238079071, "reward_std": 0.44692784547805786, "rewards/execution_accuracy_EX/mean": 0.671875, "rewards/execution_accuracy_EX/std": 0.47045037150382996, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9820669889450073, "sampling/importance_sampling_ratio/min": 0.002480123657733202, "sampling/sampling_logp_difference/max": 5.999446868896484, "sampling/sampling_logp_difference/mean": 0.10748156905174255, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2600.0, "completions/max_terminated_length": 2600.0, "completions/mean_length": 598.69921875, "completions/mean_terminated_length": 598.69921875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.09875625185668468, "epoch": 0.9221238938053097, "frac_reward_zero_std": 0.0, "grad_norm": 0.21427103776619694, "learning_rate": 1e-06, "loss": 0.0092, "num_tokens": 262117679.0, "reward": 0.5138671398162842, "reward_std": 0.47579970955848694, "rewards/execution_accuracy_EX/mean": 0.48828125, "rewards/execution_accuracy_EX/std": 0.5008418560028076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9818670153617859, "sampling/importance_sampling_ratio/min": 0.0031865073833614588, "sampling/sampling_logp_difference/max": 5.7488298416137695, "sampling/sampling_logp_difference/mean": 0.11636389791965485, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2034.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 535.44140625, "completions/mean_terminated_length": 535.44140625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.09476997330784798, "epoch": 0.9238938053097345, "frac_reward_zero_std": 0.0, "grad_norm": 0.14134909702894238, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 262652528.0, "reward": 0.5249999761581421, "reward_std": 0.47593045234680176, "rewards/execution_accuracy_EX/mean": 0.5, "rewards/execution_accuracy_EX/std": 0.5009794235229492, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9805616140365601, "sampling/importance_sampling_ratio/min": 0.004092916380614042, "sampling/sampling_logp_difference/max": 5.498497486114502, "sampling/sampling_logp_difference/mean": 0.11794306337833405, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2073.0, "completions/max_terminated_length": 2073.0, "completions/mean_length": 624.69921875, "completions/mean_terminated_length": 624.69921875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.10736214555799961, "epoch": 0.9256637168141593, "frac_reward_zero_std": 0.0, "grad_norm": 0.29183791822517186, "learning_rate": 1e-06, "loss": 0.0139, "num_tokens": 263290339.0, "reward": 0.9146484136581421, "reward_std": 0.2721920311450958, "rewards/execution_accuracy_EX/mean": 0.91015625, "rewards/execution_accuracy_EX/std": 0.2865179479122162, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9826004505157471, "sampling/importance_sampling_ratio/min": 0.0024888834450393915, "sampling/sampling_logp_difference/max": 5.9959211349487305, "sampling/sampling_logp_difference/mean": 0.12244834750890732, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1350.0, "completions/max_terminated_length": 1350.0, "completions/mean_length": 479.49609375, "completions/mean_terminated_length": 479.49609375, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.09032739698886871, "epoch": 0.9274336283185841, "frac_reward_zero_std": 0.0, "grad_norm": 0.09007119235338681, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 263705634.0, "reward": 0.6400390267372131, "reward_std": 0.4617617428302765, "rewards/execution_accuracy_EX/mean": 0.62109375, "rewards/execution_accuracy_EX/std": 0.4860650300979614, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.977393627166748, "sampling/importance_sampling_ratio/min": 0.005295821465551853, "sampling/sampling_logp_difference/max": 5.240837097167969, "sampling/sampling_logp_difference/mean": 0.12161089479923248, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2266.0, "completions/max_terminated_length": 2266.0, "completions/mean_length": 605.0390625, "completions/mean_terminated_length": 605.0390625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.09797837119549513, "epoch": 0.9292035398230089, "frac_reward_zero_std": 0.0, "grad_norm": 0.3552627690830833, "learning_rate": 1e-06, "loss": 0.0264, "num_tokens": 264163196.0, "reward": 0.7847656011581421, "reward_std": 0.39845579862594604, "rewards/execution_accuracy_EX/mean": 0.7734375, "rewards/execution_accuracy_EX/std": 0.41942715644836426, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9794970750808716, "sampling/importance_sampling_ratio/min": 0.00121711113024503, "sampling/sampling_logp_difference/max": 6.711275100708008, "sampling/sampling_logp_difference/mean": 0.12153811752796173, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2605.0, "completions/mean_length": 738.27734375, "completions/mean_terminated_length": 725.10986328125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.11737409885972738, "epoch": 0.9309734513274336, "frac_reward_zero_std": 0.0, "grad_norm": 0.06868158913692704, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 264607667.0, "reward": 0.69921875, "reward_std": 0.44298383593559265, "rewards/execution_accuracy_EX/mean": 0.68359375, "rewards/execution_accuracy_EX/std": 0.4659844934940338, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.983741819858551, "sampling/importance_sampling_ratio/min": 0.004089032299816608, "sampling/sampling_logp_difference/max": 5.499446868896484, "sampling/sampling_logp_difference/mean": 0.12724415957927704, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1629.0, "completions/max_terminated_length": 1629.0, "completions/mean_length": 482.62109375, "completions/mean_terminated_length": 482.62109375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.08801850583404303, "epoch": 0.9327433628318584, "frac_reward_zero_std": 0.0, "grad_norm": 0.1981702316256674, "learning_rate": 1e-06, "loss": -0.0028, "num_tokens": 265250706.0, "reward": 0.6214843988418579, "reward_std": 0.46600866317749023, "rewards/execution_accuracy_EX/mean": 0.6015625, "rewards/execution_accuracy_EX/std": 0.4905354380607605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9793202877044678, "sampling/importance_sampling_ratio/min": 0.005254073534160852, "sampling/sampling_logp_difference/max": 5.248751640319824, "sampling/sampling_logp_difference/mean": 0.11619452387094498, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 604.30078125, "completions/mean_terminated_length": 590.6078491210938, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.11058936733752489, "epoch": 0.9345132743362832, "frac_reward_zero_std": 0.0, "grad_norm": 0.13104503028291592, "learning_rate": 1e-06, "loss": -0.0043, "num_tokens": 265764943.0, "reward": 0.5285155773162842, "reward_std": 0.4761233627796173, "rewards/execution_accuracy_EX/mean": 0.50390625, "rewards/execution_accuracy_EX/std": 0.5009641647338867, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9812273979187012, "sampling/importance_sampling_ratio/min": 0.0012502801837399602, "sampling/sampling_logp_difference/max": 6.684387683868408, "sampling/sampling_logp_difference/mean": 0.1292923390865326, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2930.0, "completions/max_terminated_length": 2930.0, "completions/mean_length": 801.4140625, "completions/mean_terminated_length": 801.4140625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.1286866944283247, "epoch": 0.9362831858407079, "frac_reward_zero_std": 0.0, "grad_norm": 0.24626629949481477, "learning_rate": 1e-06, "loss": -0.0098, "num_tokens": 266414441.0, "reward": 0.513867199420929, "reward_std": 0.47579970955848694, "rewards/execution_accuracy_EX/mean": 0.48828125, "rewards/execution_accuracy_EX/std": 0.5008418560028076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9831207990646362, "sampling/importance_sampling_ratio/min": 0.005253662820905447, "sampling/sampling_logp_difference/max": 5.2488298416137695, "sampling/sampling_logp_difference/mean": 0.1360650360584259, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2637.0, "completions/max_terminated_length": 2637.0, "completions/mean_length": 689.53125, "completions/mean_terminated_length": 689.53125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.12619277369230986, "epoch": 0.9380530973451328, "frac_reward_zero_std": 0.0, "grad_norm": 0.20366507624595218, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 266961649.0, "reward": 0.6214843392372131, "reward_std": 0.46600866317749023, "rewards/execution_accuracy_EX/mean": 0.6015625, "rewards/execution_accuracy_EX/std": 0.4905354380607605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.978034496307373, "sampling/importance_sampling_ratio/min": 0.005257649812847376, "sampling/sampling_logp_difference/max": 5.248071193695068, "sampling/sampling_logp_difference/mean": 0.14427325129508972, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1736.0, "completions/max_terminated_length": 1736.0, "completions/mean_length": 580.36328125, "completions/mean_terminated_length": 580.36328125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.10494234599173069, "epoch": 0.9398230088495575, "frac_reward_zero_std": 0.0, "grad_norm": 0.27675007888714714, "learning_rate": 1e-06, "loss": -0.0153, "num_tokens": 267450094.0, "reward": 0.703125, "reward_std": 0.44119933247566223, "rewards/execution_accuracy_EX/mean": 0.6875, "rewards/execution_accuracy_EX/std": 0.4644203782081604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9789376258850098, "sampling/importance_sampling_ratio/min": 0.0023584607988595963, "sampling/sampling_logp_difference/max": 6.049746036529541, "sampling/sampling_logp_difference/mean": 0.12963271141052246, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2089.0, "completions/max_terminated_length": 2089.0, "completions/mean_length": 578.01953125, "completions/mean_terminated_length": 578.01953125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.09198547527194023, "epoch": 0.9415929203539823, "frac_reward_zero_std": 0.0, "grad_norm": 0.31963955960204044, "learning_rate": 1e-06, "loss": -0.0078, "num_tokens": 267878435.0, "reward": 0.669726550579071, "reward_std": 0.4533011019229889, "rewards/execution_accuracy_EX/mean": 0.65234375, "rewards/execution_accuracy_EX/std": 0.4771590530872345, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9281052350997925, "sampling/importance_sampling_ratio/mean": 0.9801375269889832, "sampling/importance_sampling_ratio/min": 0.00674409931525588, "sampling/sampling_logp_difference/max": 4.999087333679199, "sampling/sampling_logp_difference/mean": 0.11358733475208282, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1985.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 469.66796875, "completions/mean_terminated_length": 469.66796875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.09485425241291523, "epoch": 0.9433628318584071, "frac_reward_zero_std": 0.0, "grad_norm": 0.06560598067038206, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 268358302.0, "reward": 0.7587890625, "reward_std": 0.4142923355102539, "rewards/execution_accuracy_EX/mean": 0.74609375, "rewards/execution_accuracy_EX/std": 0.4360972046852112, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9779180288314819, "sampling/importance_sampling_ratio/min": 0.005253662820905447, "sampling/sampling_logp_difference/max": 5.2488298416137695, "sampling/sampling_logp_difference/mean": 0.12235035747289658, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 487.91015625, "completions/mean_terminated_length": 459.5, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.09254768490791321, "epoch": 0.9451327433628318, "frac_reward_zero_std": 0.0, "grad_norm": 0.26639882083187877, "learning_rate": 1e-06, "loss": -0.0084, "num_tokens": 268833959.0, "reward": 0.8326171636581421, "reward_std": 0.3631838858127594, "rewards/execution_accuracy_EX/mean": 0.82421875, "rewards/execution_accuracy_EX/std": 0.3813795745372772, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9783578515052795, "sampling/importance_sampling_ratio/min": 0.003186581889167428, "sampling/sampling_logp_difference/max": 5.748806476593018, "sampling/sampling_logp_difference/mean": 0.12303325533866882, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2641.0, "completions/mean_length": 568.8203125, "completions/mean_terminated_length": 554.98828125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.10234727151691914, "epoch": 0.9469026548672567, "frac_reward_zero_std": 0.0, "grad_norm": 0.47445261674977657, "learning_rate": 1e-06, "loss": -0.0627, "num_tokens": 269286217.0, "reward": 0.881054699420929, "reward_std": 0.31533122062683105, "rewards/execution_accuracy_EX/mean": 0.875, "rewards/execution_accuracy_EX/std": 0.33136674761772156, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.981724202632904, "sampling/importance_sampling_ratio/min": 0.005258482415229082, "sampling/sampling_logp_difference/max": 5.247912883758545, "sampling/sampling_logp_difference/mean": 0.1213020533323288, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1720.0, "completions/max_terminated_length": 1720.0, "completions/mean_length": 506.16796875, "completions/mean_terminated_length": 506.16796875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.10132628306746483, "epoch": 0.9486725663716814, "frac_reward_zero_std": 0.0, "grad_norm": 0.20337304514034285, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 269933188.0, "reward": 0.740234375, "reward_std": 0.4242667853832245, "rewards/execution_accuracy_EX/mean": 0.7265625, "rewards/execution_accuracy_EX/std": 0.446596622467041, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9774594902992249, "sampling/importance_sampling_ratio/min": 0.002208918798714876, "sampling/sampling_logp_difference/max": 6.1152520179748535, "sampling/sampling_logp_difference/mean": 0.12845507264137268, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2129.0, "completions/max_terminated_length": 2129.0, "completions/mean_length": 602.69921875, "completions/mean_terminated_length": 602.69921875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.11411858536303043, "epoch": 0.9504424778761061, "frac_reward_zero_std": 0.0, "grad_norm": 0.26247268454516753, "learning_rate": 1e-06, "loss": 0.0151, "num_tokens": 270587543.0, "reward": 0.781054675579071, "reward_std": 0.40085992217063904, "rewards/execution_accuracy_EX/mean": 0.76953125, "rewards/execution_accuracy_EX/std": 0.4219578504562378, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9825751781463623, "sampling/importance_sampling_ratio/min": 0.002506313845515251, "sampling/sampling_logp_difference/max": 5.9889421463012695, "sampling/sampling_logp_difference/mean": 0.12803950905799866, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1466.0, "completions/max_terminated_length": 1466.0, "completions/mean_length": 551.98046875, "completions/mean_terminated_length": 551.98046875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.10755674354732037, "epoch": 0.952212389380531, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 271284898.0, "reward": 0.6437499523162842, "reward_std": 0.46081769466400146, "rewards/execution_accuracy_EX/mean": 0.625, "rewards/execution_accuracy_EX/std": 0.4850712716579437, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9782460927963257, "sampling/importance_sampling_ratio/min": 0.0031874829437583685, "sampling/sampling_logp_difference/max": 5.748523712158203, "sampling/sampling_logp_difference/mean": 0.1307043731212616, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3235.0, "completions/mean_length": 585.19921875, "completions/mean_terminated_length": 571.431396484375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.1014731377363205, "epoch": 0.9539823008849557, "frac_reward_zero_std": 0.0, "grad_norm": 0.2460155561648805, "learning_rate": 1e-06, "loss": 0.0216, "num_tokens": 271868741.0, "reward": 0.6732421517372131, "reward_std": 0.4523758888244629, "rewards/execution_accuracy_EX/mean": 0.65625, "rewards/execution_accuracy_EX/std": 0.47588926553726196, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.980535626411438, "sampling/importance_sampling_ratio/min": 0.002563734073191881, "sampling/sampling_logp_difference/max": 5.966290473937988, "sampling/sampling_logp_difference/mean": 0.12229540944099426, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2161.0, "completions/max_terminated_length": 2161.0, "completions/mean_length": 574.2734375, "completions/mean_terminated_length": 574.2734375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.10646900534629822, "epoch": 0.9557522123893806, "frac_reward_zero_std": 0.0, "grad_norm": 0.32292854049371705, "learning_rate": 1e-06, "loss": 0.0083, "num_tokens": 272659355.0, "reward": 0.47675782442092896, "reward_std": 0.47346949577331543, "rewards/execution_accuracy_EX/mean": 0.44921875, "rewards/execution_accuracy_EX/std": 0.49838894605636597, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.894546389579773, "sampling/importance_sampling_ratio/mean": 0.97877037525177, "sampling/importance_sampling_ratio/min": 0.0002635188866406679, "sampling/sampling_logp_difference/max": 8.241385459899902, "sampling/sampling_logp_difference/mean": 0.12778767943382263, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2168.0, "completions/mean_length": 489.64453125, "completions/mean_terminated_length": 475.5019836425781, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.09318062476813793, "epoch": 0.9575221238938053, "frac_reward_zero_std": 0.0, "grad_norm": 0.1821087107227332, "learning_rate": 1e-06, "loss": -0.0179, "num_tokens": 273140576.0, "reward": 0.6546875238418579, "reward_std": 0.4580622911453247, "rewards/execution_accuracy_EX/mean": 0.63671875, "rewards/execution_accuracy_EX/std": 0.48188701272010803, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.977733314037323, "sampling/importance_sampling_ratio/min": 0.005289058666676283, "sampling/sampling_logp_difference/max": 5.242115020751953, "sampling/sampling_logp_difference/mean": 0.12172285467386246, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2160.0, "completions/max_terminated_length": 2160.0, "completions/mean_length": 654.5, "completions/mean_terminated_length": 654.5, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.12306590657681227, "epoch": 0.95929203539823, "frac_reward_zero_std": 0.0, "grad_norm": 0.19859294388031962, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 273753856.0, "reward": 0.517578125, "reward_std": 0.47587236762046814, "rewards/execution_accuracy_EX/mean": 0.4921875, "rewards/execution_accuracy_EX/std": 0.5009182691574097, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.981598973274231, "sampling/importance_sampling_ratio/min": 0.0031870543025434017, "sampling/sampling_logp_difference/max": 5.748658180236816, "sampling/sampling_logp_difference/mean": 0.13284003734588623, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1447.0, "completions/max_terminated_length": 1447.0, "completions/mean_length": 513.44140625, "completions/mean_terminated_length": 513.44140625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.09991770703345537, "epoch": 0.9610619469026549, "frac_reward_zero_std": 0.0, "grad_norm": 0.15342334977114047, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 274275281.0, "reward": 0.62890625, "reward_std": 0.46440389752388, "rewards/execution_accuracy_EX/mean": 0.609375, "rewards/execution_accuracy_EX/std": 0.48884621262550354, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9770808219909668, "sampling/importance_sampling_ratio/min": 0.0067402091808617115, "sampling/sampling_logp_difference/max": 4.999664306640625, "sampling/sampling_logp_difference/mean": 0.1259656548500061, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2306.0, "completions/max_terminated_length": 2306.0, "completions/mean_length": 742.94921875, "completions/mean_terminated_length": 742.94921875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.12249947339296341, "epoch": 0.9628318584070796, "frac_reward_zero_std": 0.0, "grad_norm": 0.25436754142506995, "learning_rate": 1e-06, "loss": 0.0066, "num_tokens": 274932052.0, "reward": 0.6548827886581421, "reward_std": 0.45779263973236084, "rewards/execution_accuracy_EX/mean": 0.63671875, "rewards/execution_accuracy_EX/std": 0.48188701272010803, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9806671142578125, "sampling/importance_sampling_ratio/min": 0.004190544597804546, "sampling/sampling_logp_difference/max": 5.474924564361572, "sampling/sampling_logp_difference/mean": 0.1349233239889145, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2188.0, "completions/mean_length": 581.171875, "completions/mean_terminated_length": 567.3882446289062, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.10665471758693457, "epoch": 0.9646017699115044, "frac_reward_zero_std": 0.0, "grad_norm": 0.08188788651492554, "learning_rate": 1e-06, "loss": -0.0046, "num_tokens": 275359216.0, "reward": 0.6955077648162842, "reward_std": 0.44443103671073914, "rewards/execution_accuracy_EX/mean": 0.6796875, "rewards/execution_accuracy_EX/std": 0.4675106406211853, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9830989837646484, "sampling/importance_sampling_ratio/min": 0.0024902531877160072, "sampling/sampling_logp_difference/max": 5.995370864868164, "sampling/sampling_logp_difference/mean": 0.1171613335609436, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3124.0, "completions/max_terminated_length": 3124.0, "completions/mean_length": 672.296875, "completions/mean_terminated_length": 672.296875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.12414712738245726, "epoch": 0.9663716814159292, "frac_reward_zero_std": 0.0, "grad_norm": 0.17047261584505055, "learning_rate": 1e-06, "loss": 0.0328, "num_tokens": 275936252.0, "reward": 0.5992187261581421, "reward_std": 0.4700848460197449, "rewards/execution_accuracy_EX/mean": 0.578125, "rewards/execution_accuracy_EX/std": 0.49482619762420654, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9840304851531982, "sampling/importance_sampling_ratio/min": 0.005327270831912756, "sampling/sampling_logp_difference/max": 5.2349162101745605, "sampling/sampling_logp_difference/mean": 0.13033321499824524, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2653.0, "completions/max_terminated_length": 2653.0, "completions/mean_length": 722.80859375, "completions/mean_terminated_length": 722.80859375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.12042015977203846, "epoch": 0.968141592920354, "frac_reward_zero_std": 0.0, "grad_norm": 0.23625603716043173, "learning_rate": 1e-06, "loss": -0.0107, "num_tokens": 276612939.0, "reward": 0.576953113079071, "reward_std": 0.47307512164115906, "rewards/execution_accuracy_EX/mean": 0.5546875, "rewards/execution_accuracy_EX/std": 0.49797385931015015, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9823607206344604, "sampling/importance_sampling_ratio/min": 0.004093400668352842, "sampling/sampling_logp_difference/max": 5.498379230499268, "sampling/sampling_logp_difference/mean": 0.1306840181350708, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2369.0, "completions/max_terminated_length": 2369.0, "completions/mean_length": 755.6484375, "completions/mean_terminated_length": 755.6484375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.13839434459805489, "epoch": 0.9699115044247788, "frac_reward_zero_std": 0.0, "grad_norm": 0.24603509269471777, "learning_rate": 1e-06, "loss": -0.0081, "num_tokens": 277319649.0, "reward": 0.6771484017372131, "reward_std": 0.4508545994758606, "rewards/execution_accuracy_EX/mean": 0.66015625, "rewards/execution_accuracy_EX/std": 0.47458380460739136, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9849244356155396, "sampling/importance_sampling_ratio/min": 0.0009713406325317919, "sampling/sampling_logp_difference/max": 6.936833381652832, "sampling/sampling_logp_difference/mean": 0.1350674033164978, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1409.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 559.14453125, "completions/mean_terminated_length": 559.14453125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.11165465787053108, "epoch": 0.9716814159292035, "frac_reward_zero_std": 0.0, "grad_norm": 0.21138017777095772, "learning_rate": 1e-06, "loss": -0.0064, "num_tokens": 277891766.0, "reward": 0.7216796875, "reward_std": 0.4332149624824524, "rewards/execution_accuracy_EX/mean": 0.70703125, "rewards/execution_accuracy_EX/std": 0.45601576566696167, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9781904220581055, "sampling/importance_sampling_ratio/min": 0.0031994825694710016, "sampling/sampling_logp_difference/max": 5.7447662353515625, "sampling/sampling_logp_difference/mean": 0.13184812664985657, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 1937.0, "completions/mean_length": 555.58984375, "completions/mean_terminated_length": 527.7125854492188, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.10886009782552719, "epoch": 0.9734513274336283, "frac_reward_zero_std": 0.0, "grad_norm": 0.24652408550642546, "learning_rate": 1e-06, "loss": 0.004, "num_tokens": 278449309.0, "reward": 0.6544921398162842, "reward_std": 0.45833173394203186, "rewards/execution_accuracy_EX/mean": 0.63671875, "rewards/execution_accuracy_EX/std": 0.48188701272010803, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9799903035163879, "sampling/importance_sampling_ratio/min": 0.00248888460919261, "sampling/sampling_logp_difference/max": 5.995920658111572, "sampling/sampling_logp_difference/mean": 0.12649500370025635, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2451.0, "completions/max_terminated_length": 2451.0, "completions/mean_length": 728.11328125, "completions/mean_terminated_length": 728.11328125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.1434102226048708, "epoch": 0.9752212389380531, "frac_reward_zero_std": 0.0, "grad_norm": 0.2282316784079456, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 278950234.0, "reward": 0.5695312023162842, "reward_std": 0.473834365606308, "rewards/execution_accuracy_EX/mean": 0.546875, "rewards/execution_accuracy_EX/std": 0.4987730085849762, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9825781583786011, "sampling/importance_sampling_ratio/min": 0.00443238252773881, "sampling/sampling_logp_difference/max": 5.41881799697876, "sampling/sampling_logp_difference/mean": 0.1440458595752716, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1593.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 569.9140625, "completions/mean_terminated_length": 569.9140625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.11375557817518711, "epoch": 0.9769911504424779, "frac_reward_zero_std": 0.0, "grad_norm": 0.19190393031905698, "learning_rate": 1e-06, "loss": 0.0052, "num_tokens": 279539940.0, "reward": 0.6957031488418579, "reward_std": 0.44413506984710693, "rewards/execution_accuracy_EX/mean": 0.6796875, "rewards/execution_accuracy_EX/std": 0.4675106406211853, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9792619943618774, "sampling/importance_sampling_ratio/min": 0.0035934499464929104, "sampling/sampling_logp_difference/max": 5.628642559051514, "sampling/sampling_logp_difference/mean": 0.13107334077358246, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1953.0, "completions/max_terminated_length": 1953.0, "completions/mean_length": 695.171875, "completions/mean_terminated_length": 695.171875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.14454811066389084, "epoch": 0.9787610619469026, "frac_reward_zero_std": 0.0, "grad_norm": 0.2952323511378513, "learning_rate": 1e-06, "loss": 0.0395, "num_tokens": 280272768.0, "reward": 0.591796875, "reward_std": 0.471201092004776, "rewards/execution_accuracy_EX/mean": 0.5703125, "rewards/execution_accuracy_EX/std": 0.4960011839866638, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9839590787887573, "sampling/importance_sampling_ratio/min": 0.005254973191767931, "sampling/sampling_logp_difference/max": 5.248580455780029, "sampling/sampling_logp_difference/mean": 0.1397646814584732, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1536.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 598.0390625, "completions/mean_terminated_length": 598.0390625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.1200046269223094, "epoch": 0.9805309734513274, "frac_reward_zero_std": 0.0, "grad_norm": 0.28352458701611716, "learning_rate": 1e-06, "loss": -0.0101, "num_tokens": 280682490.0, "reward": 0.7847656011581421, "reward_std": 0.39845579862594604, "rewards/execution_accuracy_EX/mean": 0.7734375, "rewards/execution_accuracy_EX/std": 0.41942715644836426, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9784917831420898, "sampling/importance_sampling_ratio/min": 0.004114833660423756, "sampling/sampling_logp_difference/max": 5.493156909942627, "sampling/sampling_logp_difference/mean": 0.1344582438468933, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2138.0, "completions/max_terminated_length": 2138.0, "completions/mean_length": 722.98046875, "completions/mean_terminated_length": 722.98046875, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.15025673247873783, "epoch": 0.9823008849557522, "frac_reward_zero_std": 0.0, "grad_norm": 0.3030341174630012, "learning_rate": 1e-06, "loss": -0.0091, "num_tokens": 281238661.0, "reward": 0.48417967557907104, "reward_std": 0.4741697311401367, "rewards/execution_accuracy_EX/mean": 0.45703125, "rewards/execution_accuracy_EX/std": 0.4991260766983032, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.982776403427124, "sampling/importance_sampling_ratio/min": 0.0040953196585178375, "sampling/sampling_logp_difference/max": 5.497910499572754, "sampling/sampling_logp_difference/mean": 0.14415386319160461, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1895.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 615.2578125, "completions/mean_terminated_length": 615.2578125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.1237062867730856, "epoch": 0.984070796460177, "frac_reward_zero_std": 0.0, "grad_norm": 0.18230060179909763, "learning_rate": 1e-06, "loss": 0.0085, "num_tokens": 281776839.0, "reward": 0.6400390863418579, "reward_std": 0.4617617428302765, "rewards/execution_accuracy_EX/mean": 0.62109375, "rewards/execution_accuracy_EX/std": 0.4860650300979614, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9814517498016357, "sampling/importance_sampling_ratio/min": 0.01112808845937252, "sampling/sampling_logp_difference/max": 4.4982829093933105, "sampling/sampling_logp_difference/mean": 0.13024307787418365, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1571.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 520.00390625, "completions/mean_terminated_length": 520.00390625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.10962794907391071, "epoch": 0.9858407079646018, "frac_reward_zero_std": 0.0, "grad_norm": 0.09861330351470331, "learning_rate": 1e-06, "loss": 0.0021, "num_tokens": 282454296.0, "reward": 0.6994140148162842, "reward_std": 0.44268524646759033, "rewards/execution_accuracy_EX/mean": 0.68359375, "rewards/execution_accuracy_EX/std": 0.4659844934940338, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.979133665561676, "sampling/importance_sampling_ratio/min": 0.005249778274446726, "sampling/sampling_logp_difference/max": 5.249569416046143, "sampling/sampling_logp_difference/mean": 0.12833350896835327, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3972.0, "completions/mean_length": 766.83984375, "completions/mean_terminated_length": 753.7843627929688, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.14016254618763924, "epoch": 0.9876106194690265, "frac_reward_zero_std": 0.0, "grad_norm": 0.23413755278570705, "learning_rate": 1e-06, "loss": -0.0086, "num_tokens": 283027583.0, "reward": 0.7548828125, "reward_std": 0.4167163670063019, "rewards/execution_accuracy_EX/mean": 0.7421875, "rewards/execution_accuracy_EX/std": 0.4382871091365814, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9850025773048401, "sampling/importance_sampling_ratio/min": 0.003982211463153362, "sampling/sampling_logp_difference/max": 5.525918006896973, "sampling/sampling_logp_difference/mean": 0.1391599178314209, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1676.0, "completions/max_terminated_length": 1676.0, "completions/mean_length": 554.875, "completions/mean_terminated_length": 554.875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.11856131162494421, "epoch": 0.9893805309734514, "frac_reward_zero_std": 0.0, "grad_norm": 0.30695062544975965, "learning_rate": 1e-06, "loss": 0.0248, "num_tokens": 283446031.0, "reward": 0.8441406488418579, "reward_std": 0.35250481963157654, "rewards/execution_accuracy_EX/mean": 0.8359375, "rewards/execution_accuracy_EX/std": 0.3710577189922333, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8930208683013916, "sampling/importance_sampling_ratio/mean": 0.9805971384048462, "sampling/importance_sampling_ratio/min": 0.0014977871906012297, "sampling/sampling_logp_difference/max": 6.5037665367126465, "sampling/sampling_logp_difference/mean": 0.13229113817214966, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2161.0, "completions/max_terminated_length": 2161.0, "completions/mean_length": 801.5859375, "completions/mean_terminated_length": 801.5859375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.15025562047958374, "epoch": 0.9911504424778761, "frac_reward_zero_std": 0.0, "grad_norm": 0.27251349764168376, "learning_rate": 1e-06, "loss": 0.0254, "num_tokens": 284133477.0, "reward": 0.5695312023162842, "reward_std": 0.4738343358039856, "rewards/execution_accuracy_EX/mean": 0.546875, "rewards/execution_accuracy_EX/std": 0.4987730085849762, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9856324195861816, "sampling/importance_sampling_ratio/min": 0.0041136350482702255, "sampling/sampling_logp_difference/max": 5.493448257446289, "sampling/sampling_logp_difference/mean": 0.13937297463417053, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2816.0, "completions/max_terminated_length": 2816.0, "completions/mean_length": 560.18359375, "completions/mean_terminated_length": 560.18359375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.13532736897468567, "epoch": 0.9929203539823008, "frac_reward_zero_std": 0.0, "grad_norm": 0.33926341804167603, "learning_rate": 1e-06, "loss": -0.0064, "num_tokens": 284477332.0, "reward": 0.7884765267372131, "reward_std": 0.3960021138191223, "rewards/execution_accuracy_EX/mean": 0.77734375, "rewards/execution_accuracy_EX/std": 0.41684433817863464, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9841941595077515, "sampling/importance_sampling_ratio/min": 0.006754739210009575, "sampling/sampling_logp_difference/max": 4.99751091003418, "sampling/sampling_logp_difference/mean": 0.13882380723953247, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2215.0, "completions/max_terminated_length": 2215.0, "completions/mean_length": 633.6640625, "completions/mean_terminated_length": 633.6640625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.13252269942313433, "epoch": 0.9946902654867257, "frac_reward_zero_std": 0.0, "grad_norm": 0.20670189789834498, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 284988622.0, "reward": 0.5435546636581421, "reward_std": 0.4755672216415405, "rewards/execution_accuracy_EX/mean": 0.51953125, "rewards/execution_accuracy_EX/std": 0.5005971193313599, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.98419189453125, "sampling/importance_sampling_ratio/min": 0.005253507290035486, "sampling/sampling_logp_difference/max": 5.248859405517578, "sampling/sampling_logp_difference/mean": 0.1294565200805664, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2789.0, "completions/max_terminated_length": 2789.0, "completions/mean_length": 704.359375, "completions/mean_terminated_length": 704.359375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.15773088298738003, "epoch": 0.9964601769911504, "frac_reward_zero_std": 0.0, "grad_norm": 0.21451391274463097, "learning_rate": 1e-06, "loss": -0.0256, "num_tokens": 285461802.0, "reward": 0.6064453125, "reward_std": 0.46869391202926636, "rewards/execution_accuracy_EX/mean": 0.5859375, "rewards/execution_accuracy_EX/std": 0.4935242533683777, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9833812117576599, "sampling/importance_sampling_ratio/min": 3.445271431701258e-05, "sampling/sampling_logp_difference/max": 10.275922775268555, "sampling/sampling_logp_difference/mean": 0.1499510258436203, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2317.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 687.1015625, "completions/mean_terminated_length": 687.1015625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.1400073505938053, "epoch": 0.9982300884955753, "frac_reward_zero_std": 0.0, "grad_norm": 0.21422768031719658, "learning_rate": 1e-06, "loss": 0.0139, "num_tokens": 286150612.0, "reward": 0.5990234613418579, "reward_std": 0.4699280560016632, "rewards/execution_accuracy_EX/mean": 0.578125, "rewards/execution_accuracy_EX/std": 0.49482619762420654, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9824703931808472, "sampling/importance_sampling_ratio/min": 0.0008885476854629815, "sampling/sampling_logp_difference/max": 7.0259222984313965, "sampling/sampling_logp_difference/mean": 0.13782978057861328, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3693.0, "completions/max_terminated_length": 3693.0, "completions/mean_length": 695.51171875, "completions/mean_terminated_length": 695.51171875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.1185274813324213, "epoch": 1.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.03277521648262241, "learning_rate": 1e-06, "loss": -0.0021, "num_tokens": 286630103.0, "reward": 0.8765624761581421, "reward_std": 0.3139525055885315, "rewards/execution_accuracy_EX/mean": 0.875, "rewards/execution_accuracy_EX/std": 0.33136674761772156, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2920515835285187, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9840861558914185, "sampling/importance_sampling_ratio/min": 0.0001331465900875628, "sampling/sampling_logp_difference/max": 8.924059867858887, "sampling/sampling_logp_difference/mean": 0.12314605712890625, "step": 565 }, { "epoch": 1.0, "step": 565, "total_flos": 0.0, "train_loss": -8.269557677852237e-05, "train_runtime": 34845.6401, "train_samples_per_second": 0.26, "train_steps_per_second": 0.016 } ], "logging_steps": 1, "max_steps": 565, "num_input_tokens_seen": 286630103, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }