{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.555555555555555, "eval_steps": 500, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 227.375, "epoch": 0.2222222222222222, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.5e-07, "loss": -0.0, "reward": 3.8474007854238153, "reward_std": 0.3844260007608682, "rewards/concensus_correctness_reward_func": 1.0111249908804893, "rewards/consensus_reward_func": 1.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.6774007824715227, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.7213750029914081, "step": 2 }, { "completion_length": 146.0, "epoch": 0.4444444444444444, "grad_norm": 0.01624213345348835, "kl": 0.004959340076311491, "learning_rate": 4.994647308096508e-07, "loss": 0.0, "reward": 7.895968705415726, "reward_std": 0.027621358633041382, "rewards/concensus_correctness_reward_func": 2.540499985218048, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.625, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.24609375, "step": 4 }, { "completion_length": 143.53125, "epoch": 0.6666666666666666, "grad_norm": 3.286956787109375, "kl": 0.15800717496313155, "learning_rate": 4.951963201008075e-07, "loss": 0.0002, "reward": 6.735341787338257, "reward_std": 0.07137290760874748, "rewards/concensus_correctness_reward_func": 1.9907500222325325, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.9633418284356594, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.234375, "step": 6 }, { "completion_length": 169.28125, "epoch": 0.8888888888888888, "grad_norm": 9.426475524902344, "kl": 0.1612053846474737, "learning_rate": 4.867325323737765e-07, "loss": 0.0002, "reward": 6.488640710711479, "reward_std": 0.4351232862100005, "rewards/concensus_correctness_reward_func": 1.8333124974742532, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9131408147513866, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 1.1796875, "step": 8 }, { "completion_length": 151.3125, "epoch": 1.1111111111111112, "grad_norm": 9.256837844848633, "kl": 0.26464237936306745, "learning_rate": 4.7421818538317203e-07, "loss": 0.0003, "reward": 6.803590506315231, "reward_std": 0.046551278690458275, "rewards/concensus_correctness_reward_func": 2.0048749819397926, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9432468060404062, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.24609375, "step": 10 }, { "completion_length": 163.53125, "epoch": 1.3333333333333333, "grad_norm": 12.423164367675781, "kl": 0.2060670027276501, "learning_rate": 4.578674030756363e-07, "loss": 0.0002, "reward": 6.566256910562515, "reward_std": 0.0431142863817513, "rewards/concensus_correctness_reward_func": 1.9311249926686287, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.951538197696209, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.23046875, "step": 12 }, { "completion_length": 176.03125, "epoch": 1.5555555555555556, "grad_norm": 10.203335762023926, "kl": 77.29249261360383, "learning_rate": 4.379599518697443e-07, "loss": 0.0773, "reward": 6.705900013446808, "reward_std": 0.1846226155757904, "rewards/concensus_correctness_reward_func": 1.8241250235587358, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.9520875588059425, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 1.2265625, "step": 14 }, { "completion_length": 141.71875, "epoch": 1.7777777777777777, "grad_norm": 23.79648780822754, "kl": 0.6482734929886647, "learning_rate": 4.1483645377501717e-07, "loss": 0.0006, "reward": 7.299397885799408, "reward_std": 0.20589433051645756, "rewards/concensus_correctness_reward_func": 2.252999983727932, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.882335371337831, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.2109375, "step": 16 }, { "completion_length": 148.125, "epoch": 2.0, "grad_norm": 6.815433979034424, "kl": 0.30268154910299927, "learning_rate": 3.8889255825490053e-07, "loss": 0.0003, "reward": 7.134015083312988, "reward_std": 0.21759085834491998, "rewards/concensus_correctness_reward_func": 2.165499985218048, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9997650384902954, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.234375, "step": 18 }, { "completion_length": 164.0625, "epoch": 2.2222222222222223, "grad_norm": 12.549731254577637, "kl": 0.5176574731012806, "learning_rate": 3.605721725547503e-07, "loss": 0.0005, "reward": 7.055621013045311, "reward_std": 0.22157809417694807, "rewards/concensus_correctness_reward_func": 2.1124999839812517, "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.9431209936738014, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 1.25, "step": 20 }, { "completion_length": 157.125, "epoch": 2.4444444444444446, "grad_norm": 6287.96923828125, "kl": 293.20064174674917, "learning_rate": 3.3035986632579036e-07, "loss": 0.2932, "reward": 6.572823256254196, "reward_std": 0.1607245712657459, "rewards/concensus_correctness_reward_func": 1.8748750016093254, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.8971671164035797, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.22265625, "step": 22 }, { "completion_length": 166.0625, "epoch": 2.6666666666666665, "grad_norm": 4.574695110321045, "kl": 0.3619378576404415, "learning_rate": 2.987725805040321e-07, "loss": 0.0004, "reward": 6.958562463521957, "reward_std": 0.10120465932413936, "rewards/concensus_correctness_reward_func": 2.053562507033348, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9987500011920929, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.25, "step": 24 }, { "completion_length": 137.4375, "epoch": 2.888888888888889, "grad_norm": 0.773999810218811, "kl": 0.24499184830347076, "learning_rate": 2.663507823075358e-07, "loss": 0.0002, "reward": 6.848305284976959, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 2.098499983549118, "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9998052977025509, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 26 }, { "completion_length": 166.53125, "epoch": 3.111111111111111, "grad_norm": 13.381827354431152, "kl": 9.179280891527014, "learning_rate": 2.336492176924642e-07, "loss": 0.0092, "reward": 6.872644901275635, "reward_std": 0.18807451496832073, "rewards/concensus_correctness_reward_func": 1.9561875090003014, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9281762056052685, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 1.23828125, "step": 28 }, { "completion_length": 206.34375, "epoch": 3.3333333333333335, "grad_norm": 37.314693450927734, "kl": 3.1774847840424627, "learning_rate": 2.0122741949596793e-07, "loss": 0.0032, "reward": 6.091063514351845, "reward_std": 0.9392880103550851, "rewards/concensus_correctness_reward_func": 1.7318125031888485, "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.8940947782248259, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 1.1370312497019768, "step": 30 }, { "completion_length": 171.65625, "epoch": 3.5555555555555554, "grad_norm": 21.487810134887695, "kl": 0.8747884714975953, "learning_rate": 1.6964013367420965e-07, "loss": 0.0009, "reward": 6.821558505296707, "reward_std": 0.3932701610028744, "rewards/concensus_correctness_reward_func": 1.949687484651804, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9773397818207741, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 1.23828125, "step": 32 }, { "completion_length": 146.5625, "epoch": 3.7777777777777777, "grad_norm": 12.162491798400879, "kl": 0.22211330354912207, "learning_rate": 1.3942782744524973e-07, "loss": 0.0002, "reward": 6.839292526245117, "reward_std": 0.19596855714917183, "rewards/concensus_correctness_reward_func": 1.9779999945312738, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9433237910270691, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 1.23046875, "step": 34 }, { "completion_length": 137.15625, "epoch": 4.0, "grad_norm": 0.11948321759700775, "kl": 0.14218732749577612, "learning_rate": 1.1110744174509951e-07, "loss": 0.0001, "reward": 7.290905028581619, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 2.2913749888539314, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9995300769805908, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 36 }, { "completion_length": 141.96875, "epoch": 4.222222222222222, "grad_norm": 12.181346893310547, "kl": 0.2080342986737378, "learning_rate": 8.516354622498278e-08, "loss": 0.0002, "reward": 7.364260822534561, "reward_std": 0.18401533365249634, "rewards/concensus_correctness_reward_func": 2.285125009715557, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5625, "rewards/question_recreation_reward_func": 0.9679796509444714, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 1.2361562550067902, "step": 38 }, { "completion_length": 150.1875, "epoch": 4.444444444444445, "grad_norm": 8.948195457458496, "kl": 0.1475777947343886, "learning_rate": 6.204004813025567e-08, "loss": 0.0001, "reward": 6.86873996257782, "reward_std": 0.06226851209066808, "rewards/concensus_correctness_reward_func": 2.038250006735325, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.955489981919527, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 40 }, { "completion_length": 145.5625, "epoch": 4.666666666666667, "grad_norm": 10.765847206115723, "kl": 0.08919750896166079, "learning_rate": 4.213259692436366e-08, "loss": 0.0001, "reward": 6.426816821098328, "reward_std": 0.2883776929229498, "rewards/concensus_correctness_reward_func": 1.8681249767541885, "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9436293505132198, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.2244375050067902, "step": 42 }, { "completion_length": 175.4375, "epoch": 4.888888888888889, "grad_norm": 10.602899551391602, "kl": 0.6231412263587117, "learning_rate": 2.5781814616827933e-08, "loss": 0.0006, "reward": 6.617682933807373, "reward_std": 0.45105215105286334, "rewards/concensus_correctness_reward_func": 1.8765625040978193, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.8896204419434071, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.210875004529953, "step": 44 }, { "completion_length": 175.9375, "epoch": 5.111111111111111, "grad_norm": 5.944246768951416, "kl": 0.1490701389266178, "learning_rate": 1.3267467626223605e-08, "loss": 0.0001, "reward": 7.154290199279785, "reward_std": 0.08214430417865515, "rewards/concensus_correctness_reward_func": 2.118624970316887, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9653527066111565, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.2421875, "step": 46 }, { "completion_length": 165.09375, "epoch": 5.333333333333333, "grad_norm": 12.869515419006348, "kl": 0.09843256112071685, "learning_rate": 4.803679899192392e-09, "loss": 0.0001, "reward": 6.846924722194672, "reward_std": 0.059840242145583034, "rewards/concensus_correctness_reward_func": 1.9362499937415123, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9575498029589653, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.234375, "step": 48 }, { "completion_length": 144.71875, "epoch": 5.555555555555555, "grad_norm": 10.745670318603516, "kl": 0.6310826435219496, "learning_rate": 5.352691903491303e-10, "loss": 0.0006, "reward": 7.271151572465897, "reward_std": 0.029827387348632328, "rewards/concensus_correctness_reward_func": 2.292750008404255, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.999339148402214, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.2446874976158142, "step": 50 }, { "epoch": 5.555555555555555, "step": 50, "total_flos": 0.0, "train_loss": 0.015557308013740112, "train_runtime": 385.6941, "train_samples_per_second": 2.074, "train_steps_per_second": 0.13 } ], "logging_steps": 2, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }