{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994666666666666, "eval_steps": 500, "global_step": 937, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.319148936170213e-08, "logits/chosen": 0.044245004653930664, "logits/rejected": 0.27747881412506104, "logps/chosen": -434.4978942871094, "logps/rejected": -265.4134521484375, "loss": 0.3766, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 5.319148936170213e-07, "logits/chosen": 0.12082168459892273, "logits/rejected": 0.2618510127067566, "logps/chosen": -347.41839599609375, "logps/rejected": -272.1236877441406, "loss": 0.3424, "rewards/accuracies": 0.3888888955116272, "rewards/chosen": 8.487684681313112e-06, "rewards/margins": -5.95807796344161e-05, "rewards/rejected": 6.80684534017928e-05, "step": 10 }, { "epoch": 0.02, "learning_rate": 1.0638297872340427e-06, "logits/chosen": 0.08590418845415115, "logits/rejected": 0.28546762466430664, "logps/chosen": -355.88232421875, "logps/rejected": -301.07745361328125, "loss": 0.3767, "rewards/accuracies": 0.4375, "rewards/chosen": -4.2495485104154795e-06, "rewards/margins": -9.791481716092676e-05, "rewards/rejected": 9.366526501253247e-05, "step": 20 }, { "epoch": 0.03, "learning_rate": 1.595744680851064e-06, "logits/chosen": 0.10498227924108505, "logits/rejected": 0.26788848638534546, "logps/chosen": -336.4917297363281, "logps/rejected": -282.240478515625, "loss": 0.3335, "rewards/accuracies": 0.375, "rewards/chosen": -9.1651327238651e-06, "rewards/margins": -3.620162169681862e-05, "rewards/rejected": 2.7036474421038292e-05, "step": 30 }, { "epoch": 0.04, "learning_rate": 2.1276595744680853e-06, "logits/chosen": 0.08821406960487366, "logits/rejected": 0.29493486881256104, "logps/chosen": -360.25006103515625, "logps/rejected": -293.743408203125, "loss": 0.3491, "rewards/accuracies": 0.53125, "rewards/chosen": 0.00022433153935708106, "rewards/margins": 0.0002744749072007835, "rewards/rejected": -5.014336056774482e-05, "step": 40 }, { "epoch": 0.05, "learning_rate": 2.6595744680851065e-06, "logits/chosen": 0.08539856225252151, "logits/rejected": 0.26630836725234985, "logps/chosen": -331.37237548828125, "logps/rejected": -277.5819091796875, "loss": 0.3224, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.000468116020783782, "rewards/margins": 0.00041844701627269387, "rewards/rejected": 4.966904452885501e-05, "step": 50 }, { "epoch": 0.06, "learning_rate": 3.191489361702128e-06, "logits/chosen": 0.12674951553344727, "logits/rejected": 0.2512568533420563, "logps/chosen": -375.99163818359375, "logps/rejected": -294.652587890625, "loss": 0.3268, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.00099504878744483, "rewards/margins": 0.001509207533672452, "rewards/rejected": -0.0005141586298123002, "step": 60 }, { "epoch": 0.07, "learning_rate": 3.723404255319149e-06, "logits/chosen": 0.10996042191982269, "logits/rejected": 0.24452340602874756, "logps/chosen": -370.0396423339844, "logps/rejected": -296.1395568847656, "loss": 0.3308, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.0021593121346086264, "rewards/margins": 0.0029010153375566006, "rewards/rejected": -0.0007417030283249915, "step": 70 }, { "epoch": 0.09, "learning_rate": 4.255319148936171e-06, "logits/chosen": 0.092557892203331, "logits/rejected": 0.22025224566459656, "logps/chosen": -354.04058837890625, "logps/rejected": -272.7898864746094, "loss": 0.3202, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.0036561214365065098, "rewards/margins": 0.0056843506172299385, "rewards/rejected": -0.0020282294135540724, "step": 80 }, { "epoch": 0.1, "learning_rate": 4.787234042553192e-06, "logits/chosen": 0.1198895201086998, "logits/rejected": 0.26321595907211304, "logps/chosen": -333.9029541015625, "logps/rejected": -283.934814453125, "loss": 0.3338, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.0048600672744214535, "rewards/margins": 0.007962927222251892, "rewards/rejected": -0.0031028599478304386, "step": 90 }, { "epoch": 0.11, "learning_rate": 4.999375059004058e-06, "logits/chosen": 0.10690341144800186, "logits/rejected": 0.22842469811439514, "logps/chosen": -286.8883972167969, "logps/rejected": -270.21636962890625, "loss": 0.3183, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.0035039782524108887, "rewards/margins": 0.011926149018108845, "rewards/rejected": -0.008422170765697956, "step": 100 }, { "epoch": 0.12, "learning_rate": 4.9955571065548795e-06, "logits/chosen": 0.11143064498901367, "logits/rejected": 0.3082962930202484, "logps/chosen": -356.7191162109375, "logps/rejected": -320.5379638671875, "loss": 0.3428, "rewards/accuracies": 0.6875, "rewards/chosen": 0.002516907872632146, "rewards/margins": 0.0226505808532238, "rewards/rejected": -0.020133674144744873, "step": 110 }, { "epoch": 0.13, "learning_rate": 4.9882736864879e-06, "logits/chosen": 0.15117068588733673, "logits/rejected": 0.2508260905742645, "logps/chosen": -333.42791748046875, "logps/rejected": -307.9425964355469, "loss": 0.3065, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.005369984079152346, "rewards/margins": 0.0370098277926445, "rewards/rejected": -0.04237980768084526, "step": 120 }, { "epoch": 0.14, "learning_rate": 4.977534912960124e-06, "logits/chosen": 0.14773675799369812, "logits/rejected": 0.333048552274704, "logps/chosen": -437.2005310058594, "logps/rejected": -409.57098388671875, "loss": 0.3111, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.06779468804597855, "rewards/margins": 0.05138751119375229, "rewards/rejected": -0.11918219178915024, "step": 130 }, { "epoch": 0.15, "learning_rate": 4.963355698422092e-06, "logits/chosen": 0.24007591605186462, "logits/rejected": 0.379871666431427, "logps/chosen": -504.1204528808594, "logps/rejected": -520.1692504882812, "loss": 0.2847, "rewards/accuracies": 0.71875, "rewards/chosen": -0.130012646317482, "rewards/margins": 0.10443723201751709, "rewards/rejected": -0.23444989323616028, "step": 140 }, { "epoch": 0.16, "learning_rate": 4.945755732909625e-06, "logits/chosen": 0.18426091969013214, "logits/rejected": 0.3188936114311218, "logps/chosen": -540.79638671875, "logps/rejected": -607.8611450195312, "loss": 0.229, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.19553306698799133, "rewards/margins": 0.12807996571063995, "rewards/rejected": -0.3236130475997925, "step": 150 }, { "epoch": 0.17, "learning_rate": 4.924759456701167e-06, "logits/chosen": 0.17972585558891296, "logits/rejected": 0.3777519166469574, "logps/chosen": -586.1607666015625, "logps/rejected": -647.55517578125, "loss": 0.2666, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2424735128879547, "rewards/margins": 0.13558030128479004, "rewards/rejected": -0.37805378437042236, "step": 160 }, { "epoch": 0.18, "learning_rate": 4.900396026378671e-06, "logits/chosen": 0.23550477623939514, "logits/rejected": 0.288230299949646, "logps/chosen": -510.2723083496094, "logps/rejected": -617.4466552734375, "loss": 0.2584, "rewards/accuracies": 0.71875, "rewards/chosen": -0.17818708717823029, "rewards/margins": 0.13861213624477386, "rewards/rejected": -0.31679922342300415, "step": 170 }, { "epoch": 0.19, "learning_rate": 4.872699274339169e-06, "logits/chosen": 0.2329758107662201, "logits/rejected": 0.3874257206916809, "logps/chosen": -516.7825927734375, "logps/rejected": -531.4446411132812, "loss": 0.2942, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.14970800280570984, "rewards/margins": 0.11483931541442871, "rewards/rejected": -0.26454734802246094, "step": 180 }, { "epoch": 0.2, "learning_rate": 4.8417076618132434e-06, "logits/chosen": 0.26790374517440796, "logits/rejected": 0.3111529052257538, "logps/chosen": -448.8946838378906, "logps/rejected": -521.92236328125, "loss": 0.2939, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.14995862543582916, "rewards/margins": 0.09569428861141205, "rewards/rejected": -0.2456529140472412, "step": 190 }, { "epoch": 0.21, "learning_rate": 4.807464225455655e-06, "logits/chosen": 0.2344820201396942, "logits/rejected": 0.32830706238746643, "logps/chosen": -490.4266662597656, "logps/rejected": -567.8482666015625, "loss": 0.2961, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.15942268073558807, "rewards/margins": 0.11248819530010223, "rewards/rejected": -0.2719108760356903, "step": 200 }, { "epoch": 0.22, "learning_rate": 4.770016517582283e-06, "logits/chosen": 0.20260851085186005, "logits/rejected": 0.3340357840061188, "logps/chosen": -467.0673828125, "logps/rejected": -551.0185546875, "loss": 0.2705, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.1531032770872116, "rewards/margins": 0.1318296641111374, "rewards/rejected": -0.284932941198349, "step": 210 }, { "epoch": 0.23, "learning_rate": 4.7294165401363616e-06, "logits/chosen": 0.2734786868095398, "logits/rejected": 0.419080913066864, "logps/chosen": -520.6450805664062, "logps/rejected": -588.216796875, "loss": 0.2437, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.143020898103714, "rewards/margins": 0.15081192553043365, "rewards/rejected": -0.29383280873298645, "step": 220 }, { "epoch": 0.25, "learning_rate": 4.68572067247573e-06, "logits/chosen": 0.2819536328315735, "logits/rejected": 0.35720211267471313, "logps/chosen": -458.4796447753906, "logps/rejected": -594.9754028320312, "loss": 0.255, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.15209491550922394, "rewards/margins": 0.16111071407794952, "rewards/rejected": -0.3132055997848511, "step": 230 }, { "epoch": 0.26, "learning_rate": 4.638989593081364e-06, "logits/chosen": 0.24540099501609802, "logits/rejected": 0.4436759948730469, "logps/chosen": -600.3782958984375, "logps/rejected": -671.3284301757812, "loss": 0.2501, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.22624830901622772, "rewards/margins": 0.17018134891986847, "rewards/rejected": -0.3964296281337738, "step": 240 }, { "epoch": 0.27, "learning_rate": 4.5892881952959015e-06, "logits/chosen": 0.3086146414279938, "logits/rejected": 0.47908204793930054, "logps/chosen": -587.990234375, "logps/rejected": -724.9801025390625, "loss": 0.2206, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.2429129183292389, "rewards/margins": 0.1858929842710495, "rewards/rejected": -0.4288058876991272, "step": 250 }, { "epoch": 0.28, "learning_rate": 4.536685497209182e-06, "logits/chosen": 0.326163113117218, "logits/rejected": 0.4275685250759125, "logps/chosen": -625.9121704101562, "logps/rejected": -707.8150634765625, "loss": 0.2552, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.2774854898452759, "rewards/margins": 0.15565654635429382, "rewards/rejected": -0.4331420361995697, "step": 260 }, { "epoch": 0.29, "learning_rate": 4.481254545815943e-06, "logits/chosen": 0.2850882411003113, "logits/rejected": 0.41088026762008667, "logps/chosen": -549.1213989257812, "logps/rejected": -665.7386474609375, "loss": 0.222, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.22084204852581024, "rewards/margins": 0.1726454496383667, "rewards/rejected": -0.39348751306533813, "step": 270 }, { "epoch": 0.3, "learning_rate": 4.42307231557875e-06, "logits/chosen": 0.27213165163993835, "logits/rejected": 0.40975937247276306, "logps/chosen": -601.2066040039062, "logps/rejected": -666.4470825195312, "loss": 0.23, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2208627462387085, "rewards/margins": 0.1570657640695572, "rewards/rejected": -0.3779284954071045, "step": 280 }, { "epoch": 0.31, "learning_rate": 4.3622196015370305e-06, "logits/chosen": 0.2959277331829071, "logits/rejected": 0.4144068658351898, "logps/chosen": -615.7034912109375, "logps/rejected": -677.7984619140625, "loss": 0.2944, "rewards/accuracies": 0.625, "rewards/chosen": -0.28542643785476685, "rewards/margins": 0.12878485023975372, "rewards/rejected": -0.4142112731933594, "step": 290 }, { "epoch": 0.32, "learning_rate": 4.298780907110648e-06, "logits/chosen": 0.28264912962913513, "logits/rejected": 0.4257001280784607, "logps/chosen": -620.7975463867188, "logps/rejected": -741.2415771484375, "loss": 0.2519, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.27887052297592163, "rewards/margins": 0.18175610899925232, "rewards/rejected": -0.46062660217285156, "step": 300 }, { "epoch": 0.33, "learning_rate": 4.23284432675381e-06, "logits/chosen": 0.22528712451457977, "logits/rejected": 0.4026212692260742, "logps/chosen": -702.7637939453125, "logps/rejected": -753.7025146484375, "loss": 0.2726, "rewards/accuracies": 0.75, "rewards/chosen": -0.3129212558269501, "rewards/margins": 0.15961073338985443, "rewards/rejected": -0.4725319743156433, "step": 310 }, { "epoch": 0.34, "learning_rate": 4.164501423622277e-06, "logits/chosen": 0.23186448216438293, "logits/rejected": 0.4424917697906494, "logps/chosen": -649.0548095703125, "logps/rejected": -796.4383544921875, "loss": 0.1962, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2649792432785034, "rewards/margins": 0.2140999585390091, "rewards/rejected": -0.4790791869163513, "step": 320 }, { "epoch": 0.35, "learning_rate": 4.0938471024237355e-06, "logits/chosen": 0.25485163927078247, "logits/rejected": 0.3516165018081665, "logps/chosen": -578.9785766601562, "logps/rejected": -730.439453125, "loss": 0.225, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.26622021198272705, "rewards/margins": 0.17464616894721985, "rewards/rejected": -0.4408663809299469, "step": 330 }, { "epoch": 0.36, "learning_rate": 4.020979477627907e-06, "logits/chosen": 0.2578740119934082, "logits/rejected": 0.3376823961734772, "logps/chosen": -637.73828125, "logps/rejected": -729.6270751953125, "loss": 0.2377, "rewards/accuracies": 0.71875, "rewards/chosen": -0.26909753680229187, "rewards/margins": 0.15882256627082825, "rewards/rejected": -0.4279201626777649, "step": 340 }, { "epoch": 0.37, "learning_rate": 3.9459997372194105e-06, "logits/chosen": 0.2474416196346283, "logits/rejected": 0.39516907930374146, "logps/chosen": -584.4064331054688, "logps/rejected": -655.7855224609375, "loss": 0.2418, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.237878680229187, "rewards/margins": 0.15419434010982513, "rewards/rejected": -0.39207297563552856, "step": 350 }, { "epoch": 0.38, "learning_rate": 3.869012002182573e-06, "logits/chosen": 0.21178965270519257, "logits/rejected": 0.3619839549064636, "logps/chosen": -579.821533203125, "logps/rejected": -719.3638916015625, "loss": 0.1948, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2462320625782013, "rewards/margins": 0.19054508209228516, "rewards/rejected": -0.43677717447280884, "step": 360 }, { "epoch": 0.39, "learning_rate": 3.7901231819133104e-06, "logits/chosen": 0.23618245124816895, "logits/rejected": 0.3496639132499695, "logps/chosen": -612.3772583007812, "logps/rejected": -759.8063354492188, "loss": 0.21, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.28270620107650757, "rewards/margins": 0.206351637840271, "rewards/rejected": -0.48905783891677856, "step": 370 }, { "epoch": 0.41, "learning_rate": 3.709442825758875e-06, "logits/chosen": 0.25568825006484985, "logits/rejected": 0.4055866599082947, "logps/chosen": -689.156982421875, "logps/rejected": -858.4132690429688, "loss": 0.2195, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3555639684200287, "rewards/margins": 0.2100004404783249, "rewards/rejected": -0.5655643939971924, "step": 380 }, { "epoch": 0.42, "learning_rate": 3.6270829708916113e-06, "logits/chosen": 0.2990753650665283, "logits/rejected": 0.4023717939853668, "logps/chosen": -691.2890014648438, "logps/rejected": -849.8240356445312, "loss": 0.2338, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.32989585399627686, "rewards/margins": 0.20805878937244415, "rewards/rejected": -0.5379546880722046, "step": 390 }, { "epoch": 0.43, "learning_rate": 3.543157986727991e-06, "logits/chosen": 0.2829650044441223, "logits/rejected": 0.4152089059352875, "logps/chosen": -629.3040771484375, "logps/rejected": -772.66064453125, "loss": 0.2258, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.2793223261833191, "rewards/margins": 0.22787272930145264, "rewards/rejected": -0.5071950554847717, "step": 400 }, { "epoch": 0.44, "learning_rate": 3.4577844161089614e-06, "logits/chosen": 0.2335791140794754, "logits/rejected": 0.35757821798324585, "logps/chosen": -566.3199462890625, "logps/rejected": -715.8108520507812, "loss": 0.2621, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2617657482624054, "rewards/margins": 0.18395525217056274, "rewards/rejected": -0.44572100043296814, "step": 410 }, { "epoch": 0.45, "learning_rate": 3.3710808134621577e-06, "logits/chosen": 0.21092458069324493, "logits/rejected": 0.38082176446914673, "logps/chosen": -635.6312255859375, "logps/rejected": -758.2023315429688, "loss": 0.2283, "rewards/accuracies": 0.6875, "rewards/chosen": -0.27461183071136475, "rewards/margins": 0.20775285363197327, "rewards/rejected": -0.4823646545410156, "step": 420 }, { "epoch": 0.46, "learning_rate": 3.2831675801707126e-06, "logits/chosen": 0.19468629360198975, "logits/rejected": 0.376936674118042, "logps/chosen": -635.4735717773438, "logps/rejected": -755.5081787109375, "loss": 0.2945, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.3128860294818878, "rewards/margins": 0.13630416989326477, "rewards/rejected": -0.449190229177475, "step": 430 }, { "epoch": 0.47, "learning_rate": 3.194166797377289e-06, "logits/chosen": 0.21043789386749268, "logits/rejected": 0.29150551557540894, "logps/chosen": -612.1212158203125, "logps/rejected": -746.0543212890625, "loss": 0.2548, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2695164978504181, "rewards/margins": 0.16553311049938202, "rewards/rejected": -0.4350495934486389, "step": 440 }, { "epoch": 0.48, "learning_rate": 3.104202056455501e-06, "logits/chosen": 0.20291252434253693, "logits/rejected": 0.3731236457824707, "logps/chosen": -630.1588134765625, "logps/rejected": -718.7623291015625, "loss": 0.2554, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.2742314636707306, "rewards/margins": 0.15337610244750977, "rewards/rejected": -0.42760753631591797, "step": 450 }, { "epoch": 0.49, "learning_rate": 3.013398287384144e-06, "logits/chosen": 0.16522249579429626, "logits/rejected": 0.3707042336463928, "logps/chosen": -645.4251708984375, "logps/rejected": -777.720947265625, "loss": 0.2542, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.28105100989341736, "rewards/margins": 0.20097048580646515, "rewards/rejected": -0.4820214807987213, "step": 460 }, { "epoch": 0.5, "learning_rate": 2.9218815852625717e-06, "logits/chosen": 0.22193937003612518, "logits/rejected": 0.28407761454582214, "logps/chosen": -629.5750732421875, "logps/rejected": -736.9579467773438, "loss": 0.2498, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2655861973762512, "rewards/margins": 0.18186791241168976, "rewards/rejected": -0.4474540650844574, "step": 470 }, { "epoch": 0.51, "learning_rate": 2.829779035208113e-06, "logits/chosen": 0.19652318954467773, "logits/rejected": 0.339162677526474, "logps/chosen": -617.4735107421875, "logps/rejected": -749.2583618164062, "loss": 0.2276, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.2735213041305542, "rewards/margins": 0.1885720044374466, "rewards/rejected": -0.4620933532714844, "step": 480 }, { "epoch": 0.52, "learning_rate": 2.737218535878705e-06, "logits/chosen": 0.21091029047966003, "logits/rejected": 0.25410202145576477, "logps/chosen": -610.21484375, "logps/rejected": -735.4616088867188, "loss": 0.249, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.307112455368042, "rewards/margins": 0.1565089374780655, "rewards/rejected": -0.4636213183403015, "step": 490 }, { "epoch": 0.53, "learning_rate": 2.64432862186579e-06, "logits/chosen": 0.1500285118818283, "logits/rejected": 0.23882384598255157, "logps/chosen": -594.6583251953125, "logps/rejected": -767.1025390625, "loss": 0.239, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.27507561445236206, "rewards/margins": 0.21139319241046906, "rewards/rejected": -0.4864687919616699, "step": 500 }, { "epoch": 0.54, "learning_rate": 2.551238285204126e-06, "logits/chosen": 0.1958198994398117, "logits/rejected": 0.31719350814819336, "logps/chosen": -642.8175048828125, "logps/rejected": -761.66796875, "loss": 0.2385, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.2955992817878723, "rewards/margins": 0.16950063407421112, "rewards/rejected": -0.465099960565567, "step": 510 }, { "epoch": 0.55, "learning_rate": 2.4580767962463688e-06, "logits/chosen": 0.20254361629486084, "logits/rejected": 0.33361297845840454, "logps/chosen": -586.4736328125, "logps/rejected": -720.5748901367188, "loss": 0.2246, "rewards/accuracies": 0.75, "rewards/chosen": -0.2558278441429138, "rewards/margins": 0.19546028971672058, "rewards/rejected": -0.4512881636619568, "step": 520 }, { "epoch": 0.57, "learning_rate": 2.3649735241511546e-06, "logits/chosen": 0.1802414357662201, "logits/rejected": 0.29412373900413513, "logps/chosen": -620.1038818359375, "logps/rejected": -744.6738891601562, "loss": 0.2586, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.28716325759887695, "rewards/margins": 0.17247943580150604, "rewards/rejected": -0.4596427381038666, "step": 530 }, { "epoch": 0.58, "learning_rate": 2.2720577572339914e-06, "logits/chosen": 0.15679052472114563, "logits/rejected": 0.31615105271339417, "logps/chosen": -628.2031860351562, "logps/rejected": -721.74462890625, "loss": 0.2295, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.28228306770324707, "rewards/margins": 0.17121247947216034, "rewards/rejected": -0.4534955620765686, "step": 540 }, { "epoch": 0.59, "learning_rate": 2.1794585234303995e-06, "logits/chosen": 0.145944282412529, "logits/rejected": 0.282092809677124, "logps/chosen": -628.9989624023438, "logps/rejected": -768.2745361328125, "loss": 0.2681, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.3033018112182617, "rewards/margins": 0.18239369988441467, "rewards/rejected": -0.4856954514980316, "step": 550 }, { "epoch": 0.6, "learning_rate": 2.0873044111206407e-06, "logits/chosen": 0.13779011368751526, "logits/rejected": 0.2679106891155243, "logps/chosen": -597.4417724609375, "logps/rejected": -686.5075073242188, "loss": 0.2452, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.2510097622871399, "rewards/margins": 0.1661645472049713, "rewards/rejected": -0.4171743392944336, "step": 560 }, { "epoch": 0.61, "learning_rate": 1.9957233905648293e-06, "logits/chosen": 0.1775943785905838, "logits/rejected": 0.24674908816814423, "logps/chosen": -569.0006103515625, "logps/rejected": -673.6175537109375, "loss": 0.2581, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.24182912707328796, "rewards/margins": 0.16644051671028137, "rewards/rejected": -0.40826964378356934, "step": 570 }, { "epoch": 0.62, "learning_rate": 1.904842636196402e-06, "logits/chosen": 0.14105424284934998, "logits/rejected": 0.25376424193382263, "logps/chosen": -622.407958984375, "logps/rejected": -726.170166015625, "loss": 0.2563, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2561160922050476, "rewards/margins": 0.17688779532909393, "rewards/rejected": -0.43300390243530273, "step": 580 }, { "epoch": 0.63, "learning_rate": 1.814788350020726e-06, "logits/chosen": 0.14403316378593445, "logits/rejected": 0.2685418128967285, "logps/chosen": -623.8209838867188, "logps/rejected": -749.579833984375, "loss": 0.2268, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.26271942257881165, "rewards/margins": 0.18578986823558807, "rewards/rejected": -0.4485093057155609, "step": 590 }, { "epoch": 0.64, "learning_rate": 1.725685586364051e-06, "logits/chosen": 0.14330841600894928, "logits/rejected": 0.2794835567474365, "logps/chosen": -633.2759399414062, "logps/rejected": -730.0501098632812, "loss": 0.2244, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.23276320099830627, "rewards/margins": 0.20662634074687958, "rewards/rejected": -0.43938955664634705, "step": 600 }, { "epoch": 0.65, "learning_rate": 1.6376580782162172e-06, "logits/chosen": 0.15507976710796356, "logits/rejected": 0.24705450236797333, "logps/chosen": -593.1032104492188, "logps/rejected": -688.7366333007812, "loss": 0.2542, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2516687214374542, "rewards/margins": 0.15713506937026978, "rewards/rejected": -0.4088037610054016, "step": 610 }, { "epoch": 0.66, "learning_rate": 1.550828065408227e-06, "logits/chosen": 0.11955428123474121, "logits/rejected": 0.26153725385665894, "logps/chosen": -596.6570434570312, "logps/rejected": -714.2159423828125, "loss": 0.2373, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.2651021182537079, "rewards/margins": 0.16985554993152618, "rewards/rejected": -0.43495768308639526, "step": 620 }, { "epoch": 0.67, "learning_rate": 1.4653161248633053e-06, "logits/chosen": 0.055382657796144485, "logits/rejected": 0.16070407629013062, "logps/chosen": -588.3499755859375, "logps/rejected": -749.0435791015625, "loss": 0.1873, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.2617392838001251, "rewards/margins": 0.20763972401618958, "rewards/rejected": -0.4693790078163147, "step": 630 }, { "epoch": 0.68, "learning_rate": 1.381241003157162e-06, "logits/chosen": 0.16330929100513458, "logits/rejected": 0.25931382179260254, "logps/chosen": -658.1799926757812, "logps/rejected": -720.4083862304688, "loss": 0.2555, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2981792390346527, "rewards/margins": 0.15325401723384857, "rewards/rejected": -0.4514332711696625, "step": 640 }, { "epoch": 0.69, "learning_rate": 1.298719451619979e-06, "logits/chosen": 0.08289843797683716, "logits/rejected": 0.25149843096733093, "logps/chosen": -598.2789306640625, "logps/rejected": -685.8959350585938, "loss": 0.2311, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.27237817645072937, "rewards/margins": 0.15401718020439148, "rewards/rejected": -0.42639535665512085, "step": 650 }, { "epoch": 0.7, "learning_rate": 1.2178660642091036e-06, "logits/chosen": 0.1427321434020996, "logits/rejected": 0.21098168194293976, "logps/chosen": -609.9461669921875, "logps/rejected": -746.7469482421875, "loss": 0.2398, "rewards/accuracies": 0.75, "rewards/chosen": -0.27476319670677185, "rewards/margins": 0.1802576333284378, "rewards/rejected": -0.45502081513404846, "step": 660 }, { "epoch": 0.71, "learning_rate": 1.1387931183775821e-06, "logits/chosen": 0.11782477051019669, "logits/rejected": 0.27250903844833374, "logps/chosen": -621.0406494140625, "logps/rejected": -755.3724365234375, "loss": 0.2159, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2698996365070343, "rewards/margins": 0.18436777591705322, "rewards/rejected": -0.4542674124240875, "step": 670 }, { "epoch": 0.73, "learning_rate": 1.061610419159532e-06, "logits/chosen": 0.1588212251663208, "logits/rejected": 0.23087497055530548, "logps/chosen": -632.2388305664062, "logps/rejected": -750.19677734375, "loss": 0.2295, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.27237120270729065, "rewards/margins": 0.1973101943731308, "rewards/rejected": -0.46968141198158264, "step": 680 }, { "epoch": 0.74, "learning_rate": 9.864251466888364e-07, "logits/chosen": 0.18594476580619812, "logits/rejected": 0.2678997814655304, "logps/chosen": -632.8572998046875, "logps/rejected": -726.9435424804688, "loss": 0.2412, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2735942006111145, "rewards/margins": 0.1667368859052658, "rewards/rejected": -0.4403310716152191, "step": 690 }, { "epoch": 0.75, "learning_rate": 9.133417073629288e-07, "logits/chosen": 0.12347595393657684, "logits/rejected": 0.23116078972816467, "logps/chosen": -619.8026123046875, "logps/rejected": -769.6065063476562, "loss": 0.224, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.26983651518821716, "rewards/margins": 0.19323134422302246, "rewards/rejected": -0.4630679190158844, "step": 700 }, { "epoch": 0.76, "learning_rate": 8.424615888583332e-07, "logits/chosen": 0.09643986076116562, "logits/rejected": 0.23500403761863708, "logps/chosen": -629.8443603515625, "logps/rejected": -750.6370849609375, "loss": 0.2241, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.30303314328193665, "rewards/margins": 0.175406351685524, "rewards/rejected": -0.47843948006629944, "step": 710 }, { "epoch": 0.77, "learning_rate": 7.738832191993092e-07, "logits/chosen": 0.13513672351837158, "logits/rejected": 0.25501731038093567, "logps/chosen": -655.4788208007812, "logps/rejected": -755.9259033203125, "loss": 0.2427, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.29584428668022156, "rewards/margins": 0.19102725386619568, "rewards/rejected": -0.48687154054641724, "step": 720 }, { "epoch": 0.78, "learning_rate": 7.077018300752917e-07, "logits/chosen": 0.13387581706047058, "logits/rejected": 0.3162155747413635, "logps/chosen": -683.8049926757812, "logps/rejected": -774.6620483398438, "loss": 0.2255, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3081107437610626, "rewards/margins": 0.196131631731987, "rewards/rejected": -0.5042424201965332, "step": 730 }, { "epoch": 0.79, "learning_rate": 6.440093245969342e-07, "logits/chosen": 0.1714894026517868, "logits/rejected": 0.2547430396080017, "logps/chosen": -685.7855224609375, "logps/rejected": -804.7208251953125, "loss": 0.2481, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.35218387842178345, "rewards/margins": 0.1600029170513153, "rewards/rejected": -0.5121868848800659, "step": 740 }, { "epoch": 0.8, "learning_rate": 5.828941496744075e-07, "logits/chosen": 0.1680620014667511, "logits/rejected": 0.22125045955181122, "logps/chosen": -645.6912841796875, "logps/rejected": -773.2764892578125, "loss": 0.2372, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3005642592906952, "rewards/margins": 0.19485433399677277, "rewards/rejected": -0.49541860818862915, "step": 750 }, { "epoch": 0.81, "learning_rate": 5.244411731951671e-07, "logits/chosen": 0.1580239236354828, "logits/rejected": 0.2352636307477951, "logps/chosen": -608.561767578125, "logps/rejected": -763.673583984375, "loss": 0.1923, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2971411347389221, "rewards/margins": 0.21372155845165253, "rewards/rejected": -0.5108626484870911, "step": 760 }, { "epoch": 0.82, "learning_rate": 4.6873156617173594e-07, "logits/chosen": 0.14726164937019348, "logits/rejected": 0.3369542360305786, "logps/chosen": -687.8447265625, "logps/rejected": -795.39892578125, "loss": 0.2234, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.320822536945343, "rewards/margins": 0.19546931982040405, "rewards/rejected": -0.5162919163703918, "step": 770 }, { "epoch": 0.83, "learning_rate": 4.1584269002318653e-07, "logits/chosen": 0.12434519827365875, "logits/rejected": 0.3057369291782379, "logps/chosen": -636.1790161132812, "logps/rejected": -772.4307250976562, "loss": 0.2355, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2854897379875183, "rewards/margins": 0.22439303994178772, "rewards/rejected": -0.5098827481269836, "step": 780 }, { "epoch": 0.84, "learning_rate": 3.658479891468258e-07, "logits/chosen": 0.09943689405918121, "logits/rejected": 0.29146888852119446, "logps/chosen": -649.2669677734375, "logps/rejected": -783.7434692382812, "loss": 0.2545, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3191539943218231, "rewards/margins": 0.1720370650291443, "rewards/rejected": -0.49119099974632263, "step": 790 }, { "epoch": 0.85, "learning_rate": 3.18816888929272e-07, "logits/chosen": 0.18064996600151062, "logits/rejected": 0.20455940067768097, "logps/chosen": -644.3905029296875, "logps/rejected": -734.707275390625, "loss": 0.2728, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.31058043241500854, "rewards/margins": 0.15593388676643372, "rewards/rejected": -0.46651434898376465, "step": 800 }, { "epoch": 0.86, "learning_rate": 2.748146993385484e-07, "logits/chosen": 0.14735907316207886, "logits/rejected": 0.25722530484199524, "logps/chosen": -685.1055908203125, "logps/rejected": -850.9083251953125, "loss": 0.2172, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.3123786449432373, "rewards/margins": 0.21349501609802246, "rewards/rejected": -0.525873601436615, "step": 810 }, { "epoch": 0.87, "learning_rate": 2.3390252423108077e-07, "logits/chosen": 0.2024962604045868, "logits/rejected": 0.24823208153247833, "logps/chosen": -589.3237915039062, "logps/rejected": -780.3211059570312, "loss": 0.1994, "rewards/accuracies": 0.78125, "rewards/chosen": -0.27285903692245483, "rewards/margins": 0.2184310257434845, "rewards/rejected": -0.49129003286361694, "step": 820 }, { "epoch": 0.89, "learning_rate": 1.961371764995243e-07, "logits/chosen": 0.1840062439441681, "logits/rejected": 0.2939746677875519, "logps/chosen": -656.8392333984375, "logps/rejected": -794.8123779296875, "loss": 0.2099, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.29705482721328735, "rewards/margins": 0.19384784996509552, "rewards/rejected": -0.4909026622772217, "step": 830 }, { "epoch": 0.9, "learning_rate": 1.61571099179261e-07, "logits/chosen": 0.13038434088230133, "logits/rejected": 0.29925939440727234, "logps/chosen": -636.65576171875, "logps/rejected": -767.6978759765625, "loss": 0.1888, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.3050754964351654, "rewards/margins": 0.18477264046669006, "rewards/rejected": -0.48984819650650024, "step": 840 }, { "epoch": 0.91, "learning_rate": 1.3025229262312367e-07, "logits/chosen": 0.1295488178730011, "logits/rejected": 0.2491920441389084, "logps/chosen": -636.8143920898438, "logps/rejected": -763.8482666015625, "loss": 0.22, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.30084288120269775, "rewards/margins": 0.19896382093429565, "rewards/rejected": -0.4998067319393158, "step": 850 }, { "epoch": 0.92, "learning_rate": 1.0222424784546853e-07, "logits/chosen": 0.16568364202976227, "logits/rejected": 0.24035637080669403, "logps/chosen": -644.7538452148438, "logps/rejected": -731.6928100585938, "loss": 0.2069, "rewards/accuracies": 0.75, "rewards/chosen": -0.2750314176082611, "rewards/margins": 0.19259348511695862, "rewards/rejected": -0.4676249027252197, "step": 860 }, { "epoch": 0.93, "learning_rate": 7.752588612816553e-08, "logits/chosen": 0.16033487021923065, "logits/rejected": 0.29462411999702454, "logps/chosen": -650.9522705078125, "logps/rejected": -800.7526245117188, "loss": 0.2255, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3084677755832672, "rewards/margins": 0.2076472043991089, "rewards/rejected": -0.5161150097846985, "step": 870 }, { "epoch": 0.94, "learning_rate": 5.619150497236991e-08, "logits/chosen": 0.16950352489948273, "logits/rejected": 0.2941722869873047, "logps/chosen": -653.6286010742188, "logps/rejected": -795.2503662109375, "loss": 0.1774, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.28428733348846436, "rewards/margins": 0.21010974049568176, "rewards/rejected": -0.4943970739841461, "step": 880 }, { "epoch": 0.95, "learning_rate": 3.825073047112743e-08, "logits/chosen": 0.10091813653707504, "logits/rejected": 0.2567567527294159, "logps/chosen": -651.4176025390625, "logps/rejected": -775.3624877929688, "loss": 0.1963, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.30356308817863464, "rewards/margins": 0.20189008116722107, "rewards/rejected": -0.5054532289505005, "step": 890 }, { "epoch": 0.96, "learning_rate": 2.372847616895685e-08, "logits/chosen": 0.1418648213148117, "logits/rejected": 0.2986980974674225, "logps/chosen": -694.026611328125, "logps/rejected": -802.0440673828125, "loss": 0.2128, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.2955307364463806, "rewards/margins": 0.2045350968837738, "rewards/rejected": -0.5000658631324768, "step": 900 }, { "epoch": 0.97, "learning_rate": 1.264490846553279e-08, "logits/chosen": 0.11403369903564453, "logits/rejected": 0.2827546298503876, "logps/chosen": -661.4728393554688, "logps/rejected": -776.9918823242188, "loss": 0.2075, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.29672855138778687, "rewards/margins": 0.2093483954668045, "rewards/rejected": -0.5060769319534302, "step": 910 }, { "epoch": 0.98, "learning_rate": 5.015418611516165e-09, "logits/chosen": 0.14227314293384552, "logits/rejected": 0.30497756600379944, "logps/chosen": -682.4346923828125, "logps/rejected": -821.7689208984375, "loss": 0.2051, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3154338002204895, "rewards/margins": 0.21862968802452087, "rewards/rejected": -0.534063458442688, "step": 920 }, { "epoch": 0.99, "learning_rate": 8.506013354186993e-10, "logits/chosen": 0.09600396454334259, "logits/rejected": 0.24382761120796204, "logps/chosen": -640.611328125, "logps/rejected": -818.2303466796875, "loss": 0.1915, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.2940900921821594, "rewards/margins": 0.23137716948986053, "rewards/rejected": -0.5254672169685364, "step": 930 }, { "epoch": 1.0, "step": 937, "total_flos": 0.0, "train_loss": 0.2500629702398082, "train_runtime": 8446.1639, "train_samples_per_second": 1.776, "train_steps_per_second": 0.111 } ], "logging_steps": 10, "max_steps": 937, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }