{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 156, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.064, "grad_norm": 48.569252014160156, "learning_rate": 6.249999999999999e-07, "logits/chosen": -1.8510873317718506, "logits/rejected": -0.29376277327537537, "logps/chosen": -214.10960388183594, "logps/rejected": -737.373291015625, "loss": 0.6961, "rewards/accuracies": 0.4375, "rewards/chosen": 0.006722441408783197, "rewards/margins": 0.026040678843855858, "rewards/rejected": -0.019318239763379097, "step": 10 }, { "epoch": 0.128, "grad_norm": 14.449111938476562, "learning_rate": 9.979871469976195e-07, "logits/chosen": -1.8497416973114014, "logits/rejected": -0.2751065790653229, "logps/chosen": -240.32241821289062, "logps/rejected": -844.6730346679688, "loss": 0.4739, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": 0.029739724472165108, "rewards/margins": 0.6849702596664429, "rewards/rejected": -0.6552305817604065, "step": 20 }, { "epoch": 0.192, "grad_norm": 0.7451657056808472, "learning_rate": 9.755282581475767e-07, "logits/chosen": -2.152054786682129, "logits/rejected": -0.8167506456375122, "logps/chosen": -241.89797973632812, "logps/rejected": -830.0802612304688, "loss": 0.1085, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -0.33556511998176575, "rewards/margins": 4.663994312286377, "rewards/rejected": -4.99955940246582, "step": 30 }, { "epoch": 0.256, "grad_norm": 0.9800211191177368, "learning_rate": 9.29224396800933e-07, "logits/chosen": -2.6119227409362793, "logits/rejected": -1.6330392360687256, "logps/chosen": -261.77215576171875, "logps/rejected": -951.4508666992188, "loss": 0.0372, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.5927298069000244, "rewards/margins": 14.331648826599121, "rewards/rejected": -16.924379348754883, "step": 40 }, { "epoch": 0.32, "grad_norm": 0.08055471628904343, "learning_rate": 8.613974319136957e-07, "logits/chosen": -2.8284103870391846, "logits/rejected": -2.016091823577881, "logps/chosen": -267.1216735839844, "logps/rejected": -1083.85693359375, "loss": 0.0246, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.838659286499023, "rewards/margins": 24.708829879760742, "rewards/rejected": -29.547487258911133, "step": 50 }, { "epoch": 0.384, "grad_norm": 0.07454250752925873, "learning_rate": 7.754484907260512e-07, "logits/chosen": -2.8392865657806396, "logits/rejected": -2.14508056640625, "logps/chosen": -296.3800964355469, "logps/rejected": -1137.64892578125, "loss": 0.02, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.760532379150391, "rewards/margins": 29.1066951751709, "rewards/rejected": -35.86723327636719, "step": 60 }, { "epoch": 0.448, "grad_norm": 0.10226113349199295, "learning_rate": 6.756874120406714e-07, "logits/chosen": -2.863455295562744, "logits/rejected": -2.1374523639678955, "logps/chosen": -282.3721008300781, "logps/rejected": -1143.27587890625, "loss": 0.0304, "rewards/accuracies": 0.9874999523162842, "rewards/chosen": -6.247722148895264, "rewards/margins": 30.036664962768555, "rewards/rejected": -36.28438949584961, "step": 70 }, { "epoch": 0.512, "grad_norm": 0.06721244752407074, "learning_rate": 5.671166329088277e-07, "logits/chosen": -2.756829023361206, "logits/rejected": -2.045252799987793, "logps/chosen": -291.5985412597656, "logps/rejected": -1122.9361572265625, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": -5.791654586791992, "rewards/margins": 27.7723331451416, "rewards/rejected": -33.563987731933594, "step": 80 }, { "epoch": 0.576, "grad_norm": 0.07295263558626175, "learning_rate": 4.5518034554828327e-07, "logits/chosen": -2.751217842102051, "logits/rejected": -2.025172233581543, "logps/chosen": -286.55694580078125, "logps/rejected": -1081.6651611328125, "loss": 0.0353, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.1540751457214355, "rewards/margins": 25.992502212524414, "rewards/rejected": -31.146577835083008, "step": 90 }, { "epoch": 0.64, "grad_norm": 0.08157353848218918, "learning_rate": 3.454915028125263e-07, "logits/chosen": -2.746480703353882, "logits/rejected": -1.9662470817565918, "logps/chosen": -271.390869140625, "logps/rejected": -1062.62109375, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -4.289627552032471, "rewards/margins": 24.77456283569336, "rewards/rejected": -29.06418800354004, "step": 100 }, { "epoch": 0.704, "grad_norm": 0.1165793165564537, "learning_rate": 2.4355036129704696e-07, "logits/chosen": -2.7295165061950684, "logits/rejected": -1.951841950416565, "logps/chosen": -265.2632141113281, "logps/rejected": -1057.536376953125, "loss": 0.0252, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.240163803100586, "rewards/margins": 23.72610092163086, "rewards/rejected": -27.966266632080078, "step": 110 }, { "epoch": 0.768, "grad_norm": 0.06965842097997665, "learning_rate": 1.5446867550656767e-07, "logits/chosen": -2.7134926319122314, "logits/rejected": -1.9151828289031982, "logps/chosen": -272.508056640625, "logps/rejected": -1058.3094482421875, "loss": 0.0668, "rewards/accuracies": 0.9906249642372131, "rewards/chosen": -4.029051303863525, "rewards/margins": 23.791399002075195, "rewards/rejected": -27.82044792175293, "step": 120 }, { "epoch": 0.832, "grad_norm": 0.06551803648471832, "learning_rate": 8.271337313934867e-08, "logits/chosen": -2.6546225547790527, "logits/rejected": -1.8665531873703003, "logps/chosen": -285.0191650390625, "logps/rejected": -1041.3951416015625, "loss": 0.036, "rewards/accuracies": 0.9812500476837158, "rewards/chosen": -3.9543297290802, "rewards/margins": 22.66595458984375, "rewards/rejected": -26.620285034179688, "step": 130 }, { "epoch": 0.896, "grad_norm": 0.07770426571369171, "learning_rate": 3.188256468013139e-08, "logits/chosen": -2.7073161602020264, "logits/rejected": -1.8695634603500366, "logps/chosen": -265.3280029296875, "logps/rejected": -1062.6976318359375, "loss": 0.0197, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.531825304031372, "rewards/margins": 23.626256942749023, "rewards/rejected": -27.158079147338867, "step": 140 }, { "epoch": 0.96, "grad_norm": 0.06992675364017487, "learning_rate": 4.5251191160326495e-09, "logits/chosen": -2.6707656383514404, "logits/rejected": -1.817920446395874, "logps/chosen": -286.8282165527344, "logps/rejected": -1117.4290771484375, "loss": 0.0668, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.840498924255371, "rewards/margins": 23.849292755126953, "rewards/rejected": -27.68979263305664, "step": 150 }, { "epoch": 0.9984, "step": 156, "total_flos": 1.1115841451898962e+18, "train_loss": 0.10875998093531682, "train_runtime": 5515.1168, "train_samples_per_second": 0.907, "train_steps_per_second": 0.028 } ], "logging_steps": 10, "max_steps": 156, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1115841451898962e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }