{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 4407, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006809669731018045, "grad_norm": 17.744218826293945, "learning_rate": 2.7247956403269755e-08, "loss": 1.7534, "mean_token_accuracy": 0.6183247864246368, "step": 1 }, { "epoch": 0.001361933946203609, "grad_norm": 16.628562927246094, "learning_rate": 5.449591280653951e-08, "loss": 1.7337, "mean_token_accuracy": 0.616142064332962, "step": 2 }, { "epoch": 0.0020429009193054137, "grad_norm": 15.059070587158203, "learning_rate": 8.174386920980926e-08, "loss": 1.6696, "mean_token_accuracy": 0.62123242020607, "step": 3 }, { "epoch": 0.002723867892407218, "grad_norm": 17.773881912231445, "learning_rate": 1.0899182561307902e-07, "loss": 1.6612, "mean_token_accuracy": 0.637041300535202, "step": 4 }, { "epoch": 0.0034048348655090228, "grad_norm": 14.422760009765625, "learning_rate": 1.3623978201634878e-07, "loss": 1.7556, "mean_token_accuracy": 0.5891638994216919, "step": 5 }, { "epoch": 0.0040858018386108275, "grad_norm": 16.48126983642578, "learning_rate": 1.6348773841961852e-07, "loss": 1.6865, "mean_token_accuracy": 0.6126416325569153, "step": 6 }, { "epoch": 0.004766768811712632, "grad_norm": 15.791881561279297, "learning_rate": 1.907356948228883e-07, "loss": 1.6264, "mean_token_accuracy": 0.6345905959606171, "step": 7 }, { "epoch": 0.005447735784814436, "grad_norm": 14.880996704101562, "learning_rate": 2.1798365122615804e-07, "loss": 1.557, "mean_token_accuracy": 0.6506516337394714, "step": 8 }, { "epoch": 0.006128702757916241, "grad_norm": 17.625614166259766, "learning_rate": 2.4523160762942784e-07, "loss": 1.5469, "mean_token_accuracy": 0.6494502425193787, "step": 9 }, { "epoch": 0.0068096697310180455, "grad_norm": 15.422273635864258, "learning_rate": 2.7247956403269756e-07, "loss": 1.6215, "mean_token_accuracy": 0.6198446750640869, "step": 10 }, { "epoch": 0.00749063670411985, "grad_norm": 14.67065715789795, "learning_rate": 2.9972752043596733e-07, "loss": 1.7533, "mean_token_accuracy": 0.5920585691928864, "step": 11 }, { "epoch": 0.008171603677221655, "grad_norm": 15.088273048400879, "learning_rate": 3.2697547683923705e-07, "loss": 1.5777, "mean_token_accuracy": 0.6386546790599823, "step": 12 }, { "epoch": 0.008852570650323459, "grad_norm": 14.746537208557129, "learning_rate": 3.5422343324250687e-07, "loss": 1.6408, "mean_token_accuracy": 0.6166130900382996, "step": 13 }, { "epoch": 0.009533537623425264, "grad_norm": 13.136030197143555, "learning_rate": 3.814713896457766e-07, "loss": 1.6583, "mean_token_accuracy": 0.6069006621837616, "step": 14 }, { "epoch": 0.010214504596527068, "grad_norm": 14.007953643798828, "learning_rate": 4.0871934604904636e-07, "loss": 1.661, "mean_token_accuracy": 0.6201167106628418, "step": 15 }, { "epoch": 0.010895471569628872, "grad_norm": 16.278541564941406, "learning_rate": 4.359673024523161e-07, "loss": 1.5962, "mean_token_accuracy": 0.6324383914470673, "step": 16 }, { "epoch": 0.011576438542730678, "grad_norm": 13.380023002624512, "learning_rate": 4.6321525885558585e-07, "loss": 1.6741, "mean_token_accuracy": 0.6079131364822388, "step": 17 }, { "epoch": 0.012257405515832482, "grad_norm": 16.455663681030273, "learning_rate": 4.904632152588557e-07, "loss": 1.5948, "mean_token_accuracy": 0.6489735245704651, "step": 18 }, { "epoch": 0.012938372488934287, "grad_norm": 12.887166976928711, "learning_rate": 5.177111716621253e-07, "loss": 1.5565, "mean_token_accuracy": 0.6351551115512848, "step": 19 }, { "epoch": 0.013619339462036091, "grad_norm": 12.680163383483887, "learning_rate": 5.449591280653951e-07, "loss": 1.5194, "mean_token_accuracy": 0.6452173292636871, "step": 20 }, { "epoch": 0.014300306435137897, "grad_norm": 13.48643970489502, "learning_rate": 5.722070844686649e-07, "loss": 1.5752, "mean_token_accuracy": 0.6124923229217529, "step": 21 }, { "epoch": 0.0149812734082397, "grad_norm": 12.560365676879883, "learning_rate": 5.994550408719347e-07, "loss": 1.4626, "mean_token_accuracy": 0.643330454826355, "step": 22 }, { "epoch": 0.015662240381341504, "grad_norm": 10.672174453735352, "learning_rate": 6.267029972752043e-07, "loss": 1.42, "mean_token_accuracy": 0.6483983397483826, "step": 23 }, { "epoch": 0.01634320735444331, "grad_norm": 9.607712745666504, "learning_rate": 6.539509536784741e-07, "loss": 1.47, "mean_token_accuracy": 0.6365831196308136, "step": 24 }, { "epoch": 0.017024174327545116, "grad_norm": 11.06523323059082, "learning_rate": 6.81198910081744e-07, "loss": 1.4064, "mean_token_accuracy": 0.6517190635204315, "step": 25 }, { "epoch": 0.017705141300646918, "grad_norm": 9.704989433288574, "learning_rate": 7.084468664850137e-07, "loss": 1.3956, "mean_token_accuracy": 0.6300347447395325, "step": 26 }, { "epoch": 0.018386108273748723, "grad_norm": 8.22385311126709, "learning_rate": 7.356948228882835e-07, "loss": 1.3535, "mean_token_accuracy": 0.6378402411937714, "step": 27 }, { "epoch": 0.01906707524685053, "grad_norm": 7.674307823181152, "learning_rate": 7.629427792915532e-07, "loss": 1.44, "mean_token_accuracy": 0.6249757707118988, "step": 28 }, { "epoch": 0.01974804221995233, "grad_norm": 8.440549850463867, "learning_rate": 7.90190735694823e-07, "loss": 1.3743, "mean_token_accuracy": 0.6349103450775146, "step": 29 }, { "epoch": 0.020429009193054137, "grad_norm": 7.452355861663818, "learning_rate": 8.174386920980927e-07, "loss": 1.3389, "mean_token_accuracy": 0.6474592089653015, "step": 30 }, { "epoch": 0.021109976166155942, "grad_norm": 6.534862041473389, "learning_rate": 8.446866485013625e-07, "loss": 1.3911, "mean_token_accuracy": 0.619406670331955, "step": 31 }, { "epoch": 0.021790943139257744, "grad_norm": 6.393230438232422, "learning_rate": 8.719346049046322e-07, "loss": 1.2479, "mean_token_accuracy": 0.6726954877376556, "step": 32 }, { "epoch": 0.02247191011235955, "grad_norm": 6.447970867156982, "learning_rate": 8.991825613079019e-07, "loss": 1.3521, "mean_token_accuracy": 0.6355825066566467, "step": 33 }, { "epoch": 0.023152877085461356, "grad_norm": 6.504948139190674, "learning_rate": 9.264305177111717e-07, "loss": 1.3118, "mean_token_accuracy": 0.6351439654827118, "step": 34 }, { "epoch": 0.02383384405856316, "grad_norm": 7.023467540740967, "learning_rate": 9.536784741144415e-07, "loss": 1.209, "mean_token_accuracy": 0.6648355722427368, "step": 35 }, { "epoch": 0.024514811031664963, "grad_norm": 5.188060283660889, "learning_rate": 9.809264305177114e-07, "loss": 1.361, "mean_token_accuracy": 0.6334840953350067, "step": 36 }, { "epoch": 0.02519577800476677, "grad_norm": 6.371267318725586, "learning_rate": 1.008174386920981e-06, "loss": 1.2929, "mean_token_accuracy": 0.6366410851478577, "step": 37 }, { "epoch": 0.025876744977868574, "grad_norm": 5.656979084014893, "learning_rate": 1.0354223433242507e-06, "loss": 1.2938, "mean_token_accuracy": 0.6391248106956482, "step": 38 }, { "epoch": 0.026557711950970377, "grad_norm": 4.702414035797119, "learning_rate": 1.0626702997275206e-06, "loss": 1.2412, "mean_token_accuracy": 0.6551194190979004, "step": 39 }, { "epoch": 0.027238678924072182, "grad_norm": 4.150415420532227, "learning_rate": 1.0899182561307902e-06, "loss": 1.3817, "mean_token_accuracy": 0.6272943019866943, "step": 40 }, { "epoch": 0.027919645897173988, "grad_norm": 4.169822692871094, "learning_rate": 1.1171662125340601e-06, "loss": 1.2176, "mean_token_accuracy": 0.6531647145748138, "step": 41 }, { "epoch": 0.028600612870275793, "grad_norm": 3.845724582672119, "learning_rate": 1.1444141689373298e-06, "loss": 1.4454, "mean_token_accuracy": 0.6163051724433899, "step": 42 }, { "epoch": 0.029281579843377595, "grad_norm": 4.076864242553711, "learning_rate": 1.1716621253405994e-06, "loss": 1.164, "mean_token_accuracy": 0.6724361777305603, "step": 43 }, { "epoch": 0.0299625468164794, "grad_norm": 3.7217981815338135, "learning_rate": 1.1989100817438693e-06, "loss": 1.1788, "mean_token_accuracy": 0.6656664311885834, "step": 44 }, { "epoch": 0.030643513789581207, "grad_norm": 3.4473254680633545, "learning_rate": 1.2261580381471392e-06, "loss": 1.1474, "mean_token_accuracy": 0.6589868068695068, "step": 45 }, { "epoch": 0.03132448076268301, "grad_norm": 3.5306637287139893, "learning_rate": 1.2534059945504087e-06, "loss": 1.1829, "mean_token_accuracy": 0.6635509431362152, "step": 46 }, { "epoch": 0.032005447735784814, "grad_norm": 3.613558053970337, "learning_rate": 1.2806539509536785e-06, "loss": 1.2617, "mean_token_accuracy": 0.6460641622543335, "step": 47 }, { "epoch": 0.03268641470888662, "grad_norm": 3.3054418563842773, "learning_rate": 1.3079019073569482e-06, "loss": 1.3102, "mean_token_accuracy": 0.6310848295688629, "step": 48 }, { "epoch": 0.033367381681988426, "grad_norm": 2.985588550567627, "learning_rate": 1.335149863760218e-06, "loss": 1.1888, "mean_token_accuracy": 0.6684805452823639, "step": 49 }, { "epoch": 0.03404834865509023, "grad_norm": 2.9784772396087646, "learning_rate": 1.362397820163488e-06, "loss": 1.1804, "mean_token_accuracy": 0.6657974421977997, "step": 50 }, { "epoch": 0.03472931562819203, "grad_norm": 2.862241268157959, "learning_rate": 1.3896457765667576e-06, "loss": 1.1715, "mean_token_accuracy": 0.6661404371261597, "step": 51 }, { "epoch": 0.035410282601293835, "grad_norm": 3.2732224464416504, "learning_rate": 1.4168937329700275e-06, "loss": 1.0741, "mean_token_accuracy": 0.6870141923427582, "step": 52 }, { "epoch": 0.03609124957439564, "grad_norm": 3.078209161758423, "learning_rate": 1.4441416893732972e-06, "loss": 1.1713, "mean_token_accuracy": 0.6462270617485046, "step": 53 }, { "epoch": 0.03677221654749745, "grad_norm": 2.8789873123168945, "learning_rate": 1.471389645776567e-06, "loss": 1.1162, "mean_token_accuracy": 0.6841877400875092, "step": 54 }, { "epoch": 0.03745318352059925, "grad_norm": 2.792794704437256, "learning_rate": 1.4986376021798365e-06, "loss": 1.1887, "mean_token_accuracy": 0.6529568731784821, "step": 55 }, { "epoch": 0.03813415049370106, "grad_norm": 3.4411230087280273, "learning_rate": 1.5258855585831064e-06, "loss": 1.1235, "mean_token_accuracy": 0.6761602163314819, "step": 56 }, { "epoch": 0.03881511746680286, "grad_norm": 4.2955098152160645, "learning_rate": 1.553133514986376e-06, "loss": 1.0569, "mean_token_accuracy": 0.694092720746994, "step": 57 }, { "epoch": 0.03949608443990466, "grad_norm": 3.0012683868408203, "learning_rate": 1.580381471389646e-06, "loss": 1.0756, "mean_token_accuracy": 0.684587687253952, "step": 58 }, { "epoch": 0.04017705141300647, "grad_norm": 2.8529465198516846, "learning_rate": 1.6076294277929156e-06, "loss": 1.1404, "mean_token_accuracy": 0.6569716334342957, "step": 59 }, { "epoch": 0.04085801838610827, "grad_norm": 2.7067220211029053, "learning_rate": 1.6348773841961855e-06, "loss": 1.0632, "mean_token_accuracy": 0.6862024962902069, "step": 60 }, { "epoch": 0.04153898535921008, "grad_norm": 2.6567788124084473, "learning_rate": 1.6621253405994553e-06, "loss": 1.0963, "mean_token_accuracy": 0.6705297231674194, "step": 61 }, { "epoch": 0.042219952332311884, "grad_norm": 2.992030382156372, "learning_rate": 1.689373297002725e-06, "loss": 1.084, "mean_token_accuracy": 0.6536529958248138, "step": 62 }, { "epoch": 0.04290091930541369, "grad_norm": 2.889591932296753, "learning_rate": 1.7166212534059949e-06, "loss": 1.0632, "mean_token_accuracy": 0.6923673152923584, "step": 63 }, { "epoch": 0.04358188627851549, "grad_norm": 2.6617159843444824, "learning_rate": 1.7438692098092643e-06, "loss": 1.1106, "mean_token_accuracy": 0.6748254001140594, "step": 64 }, { "epoch": 0.044262853251617294, "grad_norm": 2.793790340423584, "learning_rate": 1.7711171662125342e-06, "loss": 1.0039, "mean_token_accuracy": 0.6846573054790497, "step": 65 }, { "epoch": 0.0449438202247191, "grad_norm": 2.592761754989624, "learning_rate": 1.7983651226158039e-06, "loss": 1.1079, "mean_token_accuracy": 0.6690859496593475, "step": 66 }, { "epoch": 0.045624787197820905, "grad_norm": 2.716850519180298, "learning_rate": 1.8256130790190738e-06, "loss": 1.1051, "mean_token_accuracy": 0.6672638952732086, "step": 67 }, { "epoch": 0.04630575417092271, "grad_norm": 2.7497122287750244, "learning_rate": 1.8528610354223434e-06, "loss": 1.1368, "mean_token_accuracy": 0.6631091833114624, "step": 68 }, { "epoch": 0.04698672114402452, "grad_norm": 2.491223096847534, "learning_rate": 1.8801089918256133e-06, "loss": 1.0529, "mean_token_accuracy": 0.684629499912262, "step": 69 }, { "epoch": 0.04766768811712632, "grad_norm": 2.295053720474243, "learning_rate": 1.907356948228883e-06, "loss": 1.1802, "mean_token_accuracy": 0.6504453420639038, "step": 70 }, { "epoch": 0.04834865509022812, "grad_norm": 2.760028839111328, "learning_rate": 1.9346049046321526e-06, "loss": 1.0697, "mean_token_accuracy": 0.6640202403068542, "step": 71 }, { "epoch": 0.049029622063329927, "grad_norm": 2.484802007675171, "learning_rate": 1.9618528610354227e-06, "loss": 1.0432, "mean_token_accuracy": 0.6775356531143188, "step": 72 }, { "epoch": 0.04971058903643173, "grad_norm": 2.4951775074005127, "learning_rate": 1.9891008174386924e-06, "loss": 1.1407, "mean_token_accuracy": 0.670388251543045, "step": 73 }, { "epoch": 0.05039155600953354, "grad_norm": 2.446967601776123, "learning_rate": 2.016348773841962e-06, "loss": 1.0918, "mean_token_accuracy": 0.6763316988945007, "step": 74 }, { "epoch": 0.05107252298263534, "grad_norm": 3.105476140975952, "learning_rate": 2.0435967302452317e-06, "loss": 0.9959, "mean_token_accuracy": 0.7007286846637726, "step": 75 }, { "epoch": 0.05175348995573715, "grad_norm": 2.4898130893707275, "learning_rate": 2.0708446866485014e-06, "loss": 1.0704, "mean_token_accuracy": 0.7026256322860718, "step": 76 }, { "epoch": 0.052434456928838954, "grad_norm": 2.9134206771850586, "learning_rate": 2.098092643051771e-06, "loss": 1.0035, "mean_token_accuracy": 0.710890382528305, "step": 77 }, { "epoch": 0.05311542390194075, "grad_norm": 2.9803786277770996, "learning_rate": 2.125340599455041e-06, "loss": 1.028, "mean_token_accuracy": 0.6887361705303192, "step": 78 }, { "epoch": 0.05379639087504256, "grad_norm": 3.036717414855957, "learning_rate": 2.152588555858311e-06, "loss": 1.0579, "mean_token_accuracy": 0.6866554319858551, "step": 79 }, { "epoch": 0.054477357848144364, "grad_norm": 2.7029073238372803, "learning_rate": 2.1798365122615805e-06, "loss": 1.017, "mean_token_accuracy": 0.6999807059764862, "step": 80 }, { "epoch": 0.05515832482124617, "grad_norm": 2.434605360031128, "learning_rate": 2.2070844686648506e-06, "loss": 1.0108, "mean_token_accuracy": 0.6969626247882843, "step": 81 }, { "epoch": 0.055839291794347976, "grad_norm": 2.5236761569976807, "learning_rate": 2.2343324250681202e-06, "loss": 1.0022, "mean_token_accuracy": 0.7031078636646271, "step": 82 }, { "epoch": 0.05652025876744978, "grad_norm": 2.652069330215454, "learning_rate": 2.26158038147139e-06, "loss": 1.0497, "mean_token_accuracy": 0.6771947145462036, "step": 83 }, { "epoch": 0.05720122574055159, "grad_norm": 2.6862378120422363, "learning_rate": 2.2888283378746596e-06, "loss": 1.106, "mean_token_accuracy": 0.6691548824310303, "step": 84 }, { "epoch": 0.057882192713653385, "grad_norm": 2.58535099029541, "learning_rate": 2.3160762942779292e-06, "loss": 1.0854, "mean_token_accuracy": 0.6829398572444916, "step": 85 }, { "epoch": 0.05856315968675519, "grad_norm": 2.4868502616882324, "learning_rate": 2.343324250681199e-06, "loss": 1.1616, "mean_token_accuracy": 0.659432590007782, "step": 86 }, { "epoch": 0.059244126659857, "grad_norm": 2.488142728805542, "learning_rate": 2.370572207084469e-06, "loss": 1.0109, "mean_token_accuracy": 0.6848400235176086, "step": 87 }, { "epoch": 0.0599250936329588, "grad_norm": 2.5840916633605957, "learning_rate": 2.3978201634877386e-06, "loss": 1.0448, "mean_token_accuracy": 0.6854309141635895, "step": 88 }, { "epoch": 0.06060606060606061, "grad_norm": 2.370704174041748, "learning_rate": 2.4250681198910083e-06, "loss": 1.0505, "mean_token_accuracy": 0.6885493695735931, "step": 89 }, { "epoch": 0.06128702757916241, "grad_norm": 2.6293487548828125, "learning_rate": 2.4523160762942784e-06, "loss": 1.1105, "mean_token_accuracy": 0.6853353083133698, "step": 90 }, { "epoch": 0.06196799455226421, "grad_norm": 2.385545015335083, "learning_rate": 2.479564032697548e-06, "loss": 1.0591, "mean_token_accuracy": 0.6934334933757782, "step": 91 }, { "epoch": 0.06264896152536602, "grad_norm": 2.794443368911743, "learning_rate": 2.5068119891008173e-06, "loss": 0.9523, "mean_token_accuracy": 0.7201730608940125, "step": 92 }, { "epoch": 0.06332992849846783, "grad_norm": 2.5390946865081787, "learning_rate": 2.534059945504088e-06, "loss": 0.969, "mean_token_accuracy": 0.7006398737430573, "step": 93 }, { "epoch": 0.06401089547156963, "grad_norm": 2.4496519565582275, "learning_rate": 2.561307901907357e-06, "loss": 1.0456, "mean_token_accuracy": 0.6858505606651306, "step": 94 }, { "epoch": 0.06469186244467143, "grad_norm": 2.6312835216522217, "learning_rate": 2.5885558583106267e-06, "loss": 0.9594, "mean_token_accuracy": 0.7106945514678955, "step": 95 }, { "epoch": 0.06537282941777324, "grad_norm": 2.875840425491333, "learning_rate": 2.6158038147138964e-06, "loss": 0.9966, "mean_token_accuracy": 0.7031852006912231, "step": 96 }, { "epoch": 0.06605379639087504, "grad_norm": 2.6366593837738037, "learning_rate": 2.6430517711171665e-06, "loss": 0.9768, "mean_token_accuracy": 0.703648179769516, "step": 97 }, { "epoch": 0.06673476336397685, "grad_norm": 2.369938373565674, "learning_rate": 2.670299727520436e-06, "loss": 1.0657, "mean_token_accuracy": 0.6897700428962708, "step": 98 }, { "epoch": 0.06741573033707865, "grad_norm": 2.4735870361328125, "learning_rate": 2.697547683923706e-06, "loss": 0.9768, "mean_token_accuracy": 0.6987937390804291, "step": 99 }, { "epoch": 0.06809669731018046, "grad_norm": 2.575582981109619, "learning_rate": 2.724795640326976e-06, "loss": 1.0308, "mean_token_accuracy": 0.6795278191566467, "step": 100 }, { "epoch": 0.06877766428328226, "grad_norm": 2.6614322662353516, "learning_rate": 2.7520435967302456e-06, "loss": 0.9383, "mean_token_accuracy": 0.7131595313549042, "step": 101 }, { "epoch": 0.06945863125638406, "grad_norm": 2.7696969509124756, "learning_rate": 2.7792915531335152e-06, "loss": 1.0013, "mean_token_accuracy": 0.7046048641204834, "step": 102 }, { "epoch": 0.07013959822948587, "grad_norm": 2.4248838424682617, "learning_rate": 2.806539509536785e-06, "loss": 1.0794, "mean_token_accuracy": 0.6787083148956299, "step": 103 }, { "epoch": 0.07082056520258767, "grad_norm": 2.5439233779907227, "learning_rate": 2.833787465940055e-06, "loss": 0.9977, "mean_token_accuracy": 0.7063562870025635, "step": 104 }, { "epoch": 0.07150153217568948, "grad_norm": 2.5257863998413086, "learning_rate": 2.8610354223433247e-06, "loss": 0.9977, "mean_token_accuracy": 0.684896856546402, "step": 105 }, { "epoch": 0.07218249914879128, "grad_norm": 2.5758590698242188, "learning_rate": 2.8882833787465943e-06, "loss": 1.0042, "mean_token_accuracy": 0.7016769349575043, "step": 106 }, { "epoch": 0.0728634661218931, "grad_norm": 2.643832206726074, "learning_rate": 2.9155313351498636e-06, "loss": 0.9964, "mean_token_accuracy": 0.697417289018631, "step": 107 }, { "epoch": 0.0735444330949949, "grad_norm": 2.4691383838653564, "learning_rate": 2.942779291553134e-06, "loss": 0.9487, "mean_token_accuracy": 0.7178450524806976, "step": 108 }, { "epoch": 0.07422540006809669, "grad_norm": 2.6147122383117676, "learning_rate": 2.9700272479564033e-06, "loss": 0.964, "mean_token_accuracy": 0.7047536969184875, "step": 109 }, { "epoch": 0.0749063670411985, "grad_norm": 2.308856725692749, "learning_rate": 2.997275204359673e-06, "loss": 1.0433, "mean_token_accuracy": 0.696091890335083, "step": 110 }, { "epoch": 0.0755873340143003, "grad_norm": 2.14028000831604, "learning_rate": 3.024523160762943e-06, "loss": 1.1188, "mean_token_accuracy": 0.6687117218971252, "step": 111 }, { "epoch": 0.07626830098740212, "grad_norm": 2.546440601348877, "learning_rate": 3.0517711171662127e-06, "loss": 1.0665, "mean_token_accuracy": 0.672695130109787, "step": 112 }, { "epoch": 0.07694926796050391, "grad_norm": 2.512263059616089, "learning_rate": 3.0790190735694824e-06, "loss": 1.0152, "mean_token_accuracy": 0.6926726996898651, "step": 113 }, { "epoch": 0.07763023493360573, "grad_norm": 2.5159096717834473, "learning_rate": 3.106267029972752e-06, "loss": 0.9483, "mean_token_accuracy": 0.716807097196579, "step": 114 }, { "epoch": 0.07831120190670753, "grad_norm": 2.494687557220459, "learning_rate": 3.133514986376022e-06, "loss": 1.1383, "mean_token_accuracy": 0.6673609614372253, "step": 115 }, { "epoch": 0.07899216887980932, "grad_norm": 2.5116608142852783, "learning_rate": 3.160762942779292e-06, "loss": 0.9699, "mean_token_accuracy": 0.7078655362129211, "step": 116 }, { "epoch": 0.07967313585291114, "grad_norm": 2.5314524173736572, "learning_rate": 3.1880108991825615e-06, "loss": 1.0113, "mean_token_accuracy": 0.690326064825058, "step": 117 }, { "epoch": 0.08035410282601294, "grad_norm": 2.265902042388916, "learning_rate": 3.215258855585831e-06, "loss": 1.0477, "mean_token_accuracy": 0.6972506642341614, "step": 118 }, { "epoch": 0.08103506979911475, "grad_norm": 2.5491557121276855, "learning_rate": 3.2425068119891012e-06, "loss": 1.029, "mean_token_accuracy": 0.6906687319278717, "step": 119 }, { "epoch": 0.08171603677221655, "grad_norm": 2.4932515621185303, "learning_rate": 3.269754768392371e-06, "loss": 1.1385, "mean_token_accuracy": 0.672741562128067, "step": 120 }, { "epoch": 0.08239700374531835, "grad_norm": 2.7600481510162354, "learning_rate": 3.2970027247956406e-06, "loss": 1.0434, "mean_token_accuracy": 0.6859106123447418, "step": 121 }, { "epoch": 0.08307797071842016, "grad_norm": 2.338163375854492, "learning_rate": 3.3242506811989107e-06, "loss": 1.0289, "mean_token_accuracy": 0.6926295459270477, "step": 122 }, { "epoch": 0.08375893769152196, "grad_norm": 3.0703768730163574, "learning_rate": 3.3514986376021803e-06, "loss": 0.9286, "mean_token_accuracy": 0.7183296382427216, "step": 123 }, { "epoch": 0.08443990466462377, "grad_norm": 2.340935468673706, "learning_rate": 3.37874659400545e-06, "loss": 1.0804, "mean_token_accuracy": 0.6824978590011597, "step": 124 }, { "epoch": 0.08512087163772557, "grad_norm": 2.5963664054870605, "learning_rate": 3.4059945504087192e-06, "loss": 0.9423, "mean_token_accuracy": 0.7117791473865509, "step": 125 }, { "epoch": 0.08580183861082738, "grad_norm": 2.584036111831665, "learning_rate": 3.4332425068119898e-06, "loss": 0.8537, "mean_token_accuracy": 0.7335821688175201, "step": 126 }, { "epoch": 0.08648280558392918, "grad_norm": 2.415083408355713, "learning_rate": 3.460490463215259e-06, "loss": 1.0651, "mean_token_accuracy": 0.6860635578632355, "step": 127 }, { "epoch": 0.08716377255703098, "grad_norm": 2.392087697982788, "learning_rate": 3.4877384196185287e-06, "loss": 1.062, "mean_token_accuracy": 0.6778083443641663, "step": 128 }, { "epoch": 0.08784473953013279, "grad_norm": 2.430607795715332, "learning_rate": 3.5149863760217988e-06, "loss": 0.9574, "mean_token_accuracy": 0.7057070434093475, "step": 129 }, { "epoch": 0.08852570650323459, "grad_norm": 2.511870861053467, "learning_rate": 3.5422343324250684e-06, "loss": 1.0108, "mean_token_accuracy": 0.7002857327461243, "step": 130 }, { "epoch": 0.0892066734763364, "grad_norm": 2.5731821060180664, "learning_rate": 3.569482288828338e-06, "loss": 0.9415, "mean_token_accuracy": 0.7087344825267792, "step": 131 }, { "epoch": 0.0898876404494382, "grad_norm": 2.454852342605591, "learning_rate": 3.5967302452316077e-06, "loss": 1.0195, "mean_token_accuracy": 0.6891840398311615, "step": 132 }, { "epoch": 0.09056860742254001, "grad_norm": 2.158175230026245, "learning_rate": 3.623978201634878e-06, "loss": 1.0126, "mean_token_accuracy": 0.6920995116233826, "step": 133 }, { "epoch": 0.09124957439564181, "grad_norm": 2.2620928287506104, "learning_rate": 3.6512261580381475e-06, "loss": 0.9856, "mean_token_accuracy": 0.6989799737930298, "step": 134 }, { "epoch": 0.09193054136874361, "grad_norm": 2.561161518096924, "learning_rate": 3.678474114441417e-06, "loss": 0.957, "mean_token_accuracy": 0.7004507482051849, "step": 135 }, { "epoch": 0.09261150834184542, "grad_norm": 2.5761749744415283, "learning_rate": 3.705722070844687e-06, "loss": 1.1134, "mean_token_accuracy": 0.6762617528438568, "step": 136 }, { "epoch": 0.09329247531494722, "grad_norm": 2.4754040241241455, "learning_rate": 3.732970027247957e-06, "loss": 0.9496, "mean_token_accuracy": 0.7046872973442078, "step": 137 }, { "epoch": 0.09397344228804903, "grad_norm": 2.3019425868988037, "learning_rate": 3.7602179836512266e-06, "loss": 0.983, "mean_token_accuracy": 0.6907126307487488, "step": 138 }, { "epoch": 0.09465440926115083, "grad_norm": 2.622598171234131, "learning_rate": 3.7874659400544963e-06, "loss": 0.9334, "mean_token_accuracy": 0.7184892296791077, "step": 139 }, { "epoch": 0.09533537623425264, "grad_norm": 2.4993011951446533, "learning_rate": 3.814713896457766e-06, "loss": 1.0057, "mean_token_accuracy": 0.6826151311397552, "step": 140 }, { "epoch": 0.09601634320735444, "grad_norm": 2.4598031044006348, "learning_rate": 3.841961852861036e-06, "loss": 0.9786, "mean_token_accuracy": 0.7010819315910339, "step": 141 }, { "epoch": 0.09669731018045624, "grad_norm": 2.6761279106140137, "learning_rate": 3.869209809264305e-06, "loss": 0.897, "mean_token_accuracy": 0.7269688546657562, "step": 142 }, { "epoch": 0.09737827715355805, "grad_norm": 2.665964126586914, "learning_rate": 3.896457765667575e-06, "loss": 0.9526, "mean_token_accuracy": 0.7097197473049164, "step": 143 }, { "epoch": 0.09805924412665985, "grad_norm": 2.2451531887054443, "learning_rate": 3.9237057220708454e-06, "loss": 0.9658, "mean_token_accuracy": 0.7135612964630127, "step": 144 }, { "epoch": 0.09874021109976167, "grad_norm": 2.8205313682556152, "learning_rate": 3.950953678474115e-06, "loss": 0.9969, "mean_token_accuracy": 0.7105589807033539, "step": 145 }, { "epoch": 0.09942117807286346, "grad_norm": 2.2606446743011475, "learning_rate": 3.978201634877385e-06, "loss": 1.0601, "mean_token_accuracy": 0.6769811809062958, "step": 146 }, { "epoch": 0.10010214504596528, "grad_norm": 2.512885332107544, "learning_rate": 4.005449591280654e-06, "loss": 1.0474, "mean_token_accuracy": 0.6872524321079254, "step": 147 }, { "epoch": 0.10078311201906708, "grad_norm": 2.314659595489502, "learning_rate": 4.032697547683924e-06, "loss": 1.0248, "mean_token_accuracy": 0.694789320230484, "step": 148 }, { "epoch": 0.10146407899216887, "grad_norm": 2.1484744548797607, "learning_rate": 4.059945504087194e-06, "loss": 1.0551, "mean_token_accuracy": 0.6894827485084534, "step": 149 }, { "epoch": 0.10214504596527069, "grad_norm": 2.422956943511963, "learning_rate": 4.0871934604904634e-06, "loss": 0.9552, "mean_token_accuracy": 0.7020109593868256, "step": 150 }, { "epoch": 0.10282601293837249, "grad_norm": 2.276228904724121, "learning_rate": 4.1144414168937335e-06, "loss": 1.071, "mean_token_accuracy": 0.6777399182319641, "step": 151 }, { "epoch": 0.1035069799114743, "grad_norm": 2.2785444259643555, "learning_rate": 4.141689373297003e-06, "loss": 1.0352, "mean_token_accuracy": 0.6892713010311127, "step": 152 }, { "epoch": 0.1041879468845761, "grad_norm": 2.485365867614746, "learning_rate": 4.168937329700273e-06, "loss": 0.9749, "mean_token_accuracy": 0.7063421010971069, "step": 153 }, { "epoch": 0.10486891385767791, "grad_norm": 2.4318220615386963, "learning_rate": 4.196185286103542e-06, "loss": 1.0547, "mean_token_accuracy": 0.6888841986656189, "step": 154 }, { "epoch": 0.10554988083077971, "grad_norm": 2.4517734050750732, "learning_rate": 4.223433242506812e-06, "loss": 1.0071, "mean_token_accuracy": 0.7006440758705139, "step": 155 }, { "epoch": 0.1062308478038815, "grad_norm": 2.5244476795196533, "learning_rate": 4.250681198910082e-06, "loss": 0.9586, "mean_token_accuracy": 0.7127312421798706, "step": 156 }, { "epoch": 0.10691181477698332, "grad_norm": 2.47511625289917, "learning_rate": 4.2779291553133515e-06, "loss": 0.8524, "mean_token_accuracy": 0.7397986650466919, "step": 157 }, { "epoch": 0.10759278175008512, "grad_norm": 2.3621816635131836, "learning_rate": 4.305177111716622e-06, "loss": 0.9814, "mean_token_accuracy": 0.6979590952396393, "step": 158 }, { "epoch": 0.10827374872318693, "grad_norm": 2.432455062866211, "learning_rate": 4.332425068119892e-06, "loss": 1.0161, "mean_token_accuracy": 0.6944882571697235, "step": 159 }, { "epoch": 0.10895471569628873, "grad_norm": 2.5219204425811768, "learning_rate": 4.359673024523161e-06, "loss": 0.9949, "mean_token_accuracy": 0.6847666203975677, "step": 160 }, { "epoch": 0.10963568266939054, "grad_norm": 2.718780279159546, "learning_rate": 4.386920980926431e-06, "loss": 0.9776, "mean_token_accuracy": 0.7063867747783661, "step": 161 }, { "epoch": 0.11031664964249234, "grad_norm": 2.5605454444885254, "learning_rate": 4.414168937329701e-06, "loss": 1.0304, "mean_token_accuracy": 0.6841610372066498, "step": 162 }, { "epoch": 0.11099761661559414, "grad_norm": 2.226867914199829, "learning_rate": 4.44141689373297e-06, "loss": 1.0658, "mean_token_accuracy": 0.6781592071056366, "step": 163 }, { "epoch": 0.11167858358869595, "grad_norm": 2.5243077278137207, "learning_rate": 4.4686648501362404e-06, "loss": 0.9968, "mean_token_accuracy": 0.7050759494304657, "step": 164 }, { "epoch": 0.11235955056179775, "grad_norm": 2.2741949558258057, "learning_rate": 4.49591280653951e-06, "loss": 0.9635, "mean_token_accuracy": 0.7061051726341248, "step": 165 }, { "epoch": 0.11304051753489956, "grad_norm": 2.6713883876800537, "learning_rate": 4.52316076294278e-06, "loss": 0.9944, "mean_token_accuracy": 0.7002812922000885, "step": 166 }, { "epoch": 0.11372148450800136, "grad_norm": 2.417252779006958, "learning_rate": 4.55040871934605e-06, "loss": 0.944, "mean_token_accuracy": 0.7049722969532013, "step": 167 }, { "epoch": 0.11440245148110317, "grad_norm": 2.8692402839660645, "learning_rate": 4.577656675749319e-06, "loss": 0.9709, "mean_token_accuracy": 0.7091069519519806, "step": 168 }, { "epoch": 0.11508341845420497, "grad_norm": 2.4961907863616943, "learning_rate": 4.604904632152589e-06, "loss": 0.9386, "mean_token_accuracy": 0.7128314971923828, "step": 169 }, { "epoch": 0.11576438542730677, "grad_norm": 2.322838306427002, "learning_rate": 4.6321525885558584e-06, "loss": 0.9771, "mean_token_accuracy": 0.6940089166164398, "step": 170 }, { "epoch": 0.11644535240040858, "grad_norm": 2.4721970558166504, "learning_rate": 4.6594005449591285e-06, "loss": 0.9951, "mean_token_accuracy": 0.6994805634021759, "step": 171 }, { "epoch": 0.11712631937351038, "grad_norm": 2.55180025100708, "learning_rate": 4.686648501362398e-06, "loss": 1.027, "mean_token_accuracy": 0.675321489572525, "step": 172 }, { "epoch": 0.1178072863466122, "grad_norm": 2.5230815410614014, "learning_rate": 4.713896457765668e-06, "loss": 0.9502, "mean_token_accuracy": 0.7187055051326752, "step": 173 }, { "epoch": 0.118488253319714, "grad_norm": 2.5215113162994385, "learning_rate": 4.741144414168938e-06, "loss": 0.9234, "mean_token_accuracy": 0.7097808420658112, "step": 174 }, { "epoch": 0.11916922029281579, "grad_norm": 2.520535945892334, "learning_rate": 4.768392370572207e-06, "loss": 0.9665, "mean_token_accuracy": 0.7100746929645538, "step": 175 }, { "epoch": 0.1198501872659176, "grad_norm": 2.813154458999634, "learning_rate": 4.795640326975477e-06, "loss": 0.9709, "mean_token_accuracy": 0.7124538421630859, "step": 176 }, { "epoch": 0.1205311542390194, "grad_norm": 2.269540786743164, "learning_rate": 4.822888283378747e-06, "loss": 0.9112, "mean_token_accuracy": 0.7184945642948151, "step": 177 }, { "epoch": 0.12121212121212122, "grad_norm": 2.592312812805176, "learning_rate": 4.850136239782017e-06, "loss": 0.9121, "mean_token_accuracy": 0.7240852117538452, "step": 178 }, { "epoch": 0.12189308818522301, "grad_norm": 2.327709197998047, "learning_rate": 4.877384196185287e-06, "loss": 0.9142, "mean_token_accuracy": 0.7164303064346313, "step": 179 }, { "epoch": 0.12257405515832483, "grad_norm": 2.4850354194641113, "learning_rate": 4.904632152588557e-06, "loss": 1.0291, "mean_token_accuracy": 0.6823698282241821, "step": 180 }, { "epoch": 0.12325502213142663, "grad_norm": 2.5522592067718506, "learning_rate": 4.931880108991826e-06, "loss": 1.051, "mean_token_accuracy": 0.673934817314148, "step": 181 }, { "epoch": 0.12393598910452842, "grad_norm": 2.531865119934082, "learning_rate": 4.959128065395096e-06, "loss": 0.9938, "mean_token_accuracy": 0.692850649356842, "step": 182 }, { "epoch": 0.12461695607763024, "grad_norm": 2.4927761554718018, "learning_rate": 4.986376021798365e-06, "loss": 0.9936, "mean_token_accuracy": 0.698630690574646, "step": 183 }, { "epoch": 0.12529792305073204, "grad_norm": 2.607694149017334, "learning_rate": 5.013623978201635e-06, "loss": 1.0279, "mean_token_accuracy": 0.681338757276535, "step": 184 }, { "epoch": 0.12597889002383383, "grad_norm": 2.2768726348876953, "learning_rate": 5.040871934604905e-06, "loss": 0.9991, "mean_token_accuracy": 0.6902775466442108, "step": 185 }, { "epoch": 0.12665985699693566, "grad_norm": 2.7199325561523438, "learning_rate": 5.068119891008176e-06, "loss": 0.8921, "mean_token_accuracy": 0.730594664812088, "step": 186 }, { "epoch": 0.12734082397003746, "grad_norm": 2.3718113899230957, "learning_rate": 5.095367847411444e-06, "loss": 1.1041, "mean_token_accuracy": 0.6754767894744873, "step": 187 }, { "epoch": 0.12802179094313926, "grad_norm": 2.449296712875366, "learning_rate": 5.122615803814714e-06, "loss": 1.0576, "mean_token_accuracy": 0.6681184768676758, "step": 188 }, { "epoch": 0.12870275791624106, "grad_norm": 2.450751781463623, "learning_rate": 5.149863760217984e-06, "loss": 1.0117, "mean_token_accuracy": 0.6874967515468597, "step": 189 }, { "epoch": 0.12938372488934285, "grad_norm": 2.274921178817749, "learning_rate": 5.1771117166212534e-06, "loss": 0.9557, "mean_token_accuracy": 0.7093174159526825, "step": 190 }, { "epoch": 0.13006469186244468, "grad_norm": 2.471034526824951, "learning_rate": 5.2043596730245235e-06, "loss": 0.9651, "mean_token_accuracy": 0.6923124194145203, "step": 191 }, { "epoch": 0.13074565883554648, "grad_norm": 2.6597113609313965, "learning_rate": 5.231607629427793e-06, "loss": 0.9818, "mean_token_accuracy": 0.7001949548721313, "step": 192 }, { "epoch": 0.13142662580864828, "grad_norm": 2.5628018379211426, "learning_rate": 5.258855585831063e-06, "loss": 0.9273, "mean_token_accuracy": 0.7266919612884521, "step": 193 }, { "epoch": 0.13210759278175008, "grad_norm": 2.568950891494751, "learning_rate": 5.286103542234333e-06, "loss": 1.1235, "mean_token_accuracy": 0.6690371632575989, "step": 194 }, { "epoch": 0.13278855975485188, "grad_norm": 2.5272791385650635, "learning_rate": 5.313351498637602e-06, "loss": 0.8979, "mean_token_accuracy": 0.7243370413780212, "step": 195 }, { "epoch": 0.1334695267279537, "grad_norm": 2.6023268699645996, "learning_rate": 5.340599455040872e-06, "loss": 0.8665, "mean_token_accuracy": 0.7338777184486389, "step": 196 }, { "epoch": 0.1341504937010555, "grad_norm": 2.3100478649139404, "learning_rate": 5.367847411444142e-06, "loss": 0.9544, "mean_token_accuracy": 0.7191211879253387, "step": 197 }, { "epoch": 0.1348314606741573, "grad_norm": 2.636134386062622, "learning_rate": 5.395095367847412e-06, "loss": 0.9613, "mean_token_accuracy": 0.7035630643367767, "step": 198 }, { "epoch": 0.1355124276472591, "grad_norm": 2.2212631702423096, "learning_rate": 5.422343324250682e-06, "loss": 1.0467, "mean_token_accuracy": 0.6657994091510773, "step": 199 }, { "epoch": 0.13619339462036092, "grad_norm": 2.7496116161346436, "learning_rate": 5.449591280653952e-06, "loss": 0.9424, "mean_token_accuracy": 0.7123024761676788, "step": 200 }, { "epoch": 0.13687436159346272, "grad_norm": 2.5591156482696533, "learning_rate": 5.476839237057221e-06, "loss": 0.8457, "mean_token_accuracy": 0.7378284633159637, "step": 201 }, { "epoch": 0.13755532856656452, "grad_norm": 2.70920729637146, "learning_rate": 5.504087193460491e-06, "loss": 0.7901, "mean_token_accuracy": 0.7509130537509918, "step": 202 }, { "epoch": 0.13823629553966632, "grad_norm": 2.4922616481781006, "learning_rate": 5.53133514986376e-06, "loss": 1.0036, "mean_token_accuracy": 0.6984784603118896, "step": 203 }, { "epoch": 0.13891726251276812, "grad_norm": 2.6958937644958496, "learning_rate": 5.5585831062670305e-06, "loss": 1.0225, "mean_token_accuracy": 0.6768087446689606, "step": 204 }, { "epoch": 0.13959822948586995, "grad_norm": 2.3955788612365723, "learning_rate": 5.5858310626703006e-06, "loss": 0.9787, "mean_token_accuracy": 0.7038420140743256, "step": 205 }, { "epoch": 0.14027919645897174, "grad_norm": 2.412672758102417, "learning_rate": 5.61307901907357e-06, "loss": 0.9374, "mean_token_accuracy": 0.7079489827156067, "step": 206 }, { "epoch": 0.14096016343207354, "grad_norm": 2.303535223007202, "learning_rate": 5.64032697547684e-06, "loss": 0.9901, "mean_token_accuracy": 0.6918758153915405, "step": 207 }, { "epoch": 0.14164113040517534, "grad_norm": 2.575371503829956, "learning_rate": 5.66757493188011e-06, "loss": 0.928, "mean_token_accuracy": 0.7228056192398071, "step": 208 }, { "epoch": 0.14232209737827714, "grad_norm": 2.449110984802246, "learning_rate": 5.694822888283379e-06, "loss": 0.9307, "mean_token_accuracy": 0.7218072712421417, "step": 209 }, { "epoch": 0.14300306435137897, "grad_norm": 2.466862678527832, "learning_rate": 5.722070844686649e-06, "loss": 0.9489, "mean_token_accuracy": 0.7062229514122009, "step": 210 }, { "epoch": 0.14368403132448077, "grad_norm": 2.4145846366882324, "learning_rate": 5.749318801089919e-06, "loss": 0.9657, "mean_token_accuracy": 0.716933012008667, "step": 211 }, { "epoch": 0.14436499829758256, "grad_norm": 2.608534097671509, "learning_rate": 5.776566757493189e-06, "loss": 0.9286, "mean_token_accuracy": 0.7296473383903503, "step": 212 }, { "epoch": 0.14504596527068436, "grad_norm": 2.9116387367248535, "learning_rate": 5.803814713896459e-06, "loss": 0.935, "mean_token_accuracy": 0.7242195308208466, "step": 213 }, { "epoch": 0.1457269322437862, "grad_norm": 2.5964434146881104, "learning_rate": 5.831062670299727e-06, "loss": 0.9514, "mean_token_accuracy": 0.7178096175193787, "step": 214 }, { "epoch": 0.146407899216888, "grad_norm": 2.455221652984619, "learning_rate": 5.858310626702998e-06, "loss": 0.9458, "mean_token_accuracy": 0.718619167804718, "step": 215 }, { "epoch": 0.1470888661899898, "grad_norm": 2.763239622116089, "learning_rate": 5.885558583106268e-06, "loss": 0.9629, "mean_token_accuracy": 0.7131022214889526, "step": 216 }, { "epoch": 0.14776983316309159, "grad_norm": 2.4266304969787598, "learning_rate": 5.9128065395095365e-06, "loss": 0.991, "mean_token_accuracy": 0.6853416860103607, "step": 217 }, { "epoch": 0.14845080013619338, "grad_norm": 2.602375030517578, "learning_rate": 5.940054495912807e-06, "loss": 0.9104, "mean_token_accuracy": 0.7143982350826263, "step": 218 }, { "epoch": 0.1491317671092952, "grad_norm": 2.4283480644226074, "learning_rate": 5.9673024523160776e-06, "loss": 1.0449, "mean_token_accuracy": 0.7012826204299927, "step": 219 }, { "epoch": 0.149812734082397, "grad_norm": 2.677830457687378, "learning_rate": 5.994550408719346e-06, "loss": 0.9327, "mean_token_accuracy": 0.7144048810005188, "step": 220 }, { "epoch": 0.1504937010554988, "grad_norm": 2.710820436477661, "learning_rate": 6.021798365122616e-06, "loss": 0.9482, "mean_token_accuracy": 0.7169874012470245, "step": 221 }, { "epoch": 0.1511746680286006, "grad_norm": 2.7551326751708984, "learning_rate": 6.049046321525886e-06, "loss": 0.8981, "mean_token_accuracy": 0.7344523370265961, "step": 222 }, { "epoch": 0.1518556350017024, "grad_norm": 2.4803273677825928, "learning_rate": 6.076294277929155e-06, "loss": 0.9777, "mean_token_accuracy": 0.7069870829582214, "step": 223 }, { "epoch": 0.15253660197480423, "grad_norm": 2.3140079975128174, "learning_rate": 6.1035422343324255e-06, "loss": 1.1179, "mean_token_accuracy": 0.6708060503005981, "step": 224 }, { "epoch": 0.15321756894790603, "grad_norm": 2.5226223468780518, "learning_rate": 6.130790190735695e-06, "loss": 0.9824, "mean_token_accuracy": 0.7010678648948669, "step": 225 }, { "epoch": 0.15389853592100783, "grad_norm": 2.795421600341797, "learning_rate": 6.158038147138965e-06, "loss": 0.9542, "mean_token_accuracy": 0.7052801251411438, "step": 226 }, { "epoch": 0.15457950289410963, "grad_norm": 2.6137564182281494, "learning_rate": 6.185286103542235e-06, "loss": 0.9724, "mean_token_accuracy": 0.7098724842071533, "step": 227 }, { "epoch": 0.15526046986721145, "grad_norm": 2.7376022338867188, "learning_rate": 6.212534059945504e-06, "loss": 0.9, "mean_token_accuracy": 0.7245730459690094, "step": 228 }, { "epoch": 0.15594143684031325, "grad_norm": 2.656113624572754, "learning_rate": 6.239782016348774e-06, "loss": 0.9532, "mean_token_accuracy": 0.7064759433269501, "step": 229 }, { "epoch": 0.15662240381341505, "grad_norm": 2.446074962615967, "learning_rate": 6.267029972752044e-06, "loss": 0.9334, "mean_token_accuracy": 0.7031857073307037, "step": 230 }, { "epoch": 0.15730337078651685, "grad_norm": 2.4970176219940186, "learning_rate": 6.2942779291553136e-06, "loss": 1.1164, "mean_token_accuracy": 0.6511681973934174, "step": 231 }, { "epoch": 0.15798433775961865, "grad_norm": 2.610807180404663, "learning_rate": 6.321525885558584e-06, "loss": 0.9708, "mean_token_accuracy": 0.7075773775577545, "step": 232 }, { "epoch": 0.15866530473272047, "grad_norm": 3.2579939365386963, "learning_rate": 6.348773841961854e-06, "loss": 0.9027, "mean_token_accuracy": 0.719416469335556, "step": 233 }, { "epoch": 0.15934627170582227, "grad_norm": 2.651886463165283, "learning_rate": 6.376021798365123e-06, "loss": 0.9376, "mean_token_accuracy": 0.718554675579071, "step": 234 }, { "epoch": 0.16002723867892407, "grad_norm": 2.4120869636535645, "learning_rate": 6.403269754768393e-06, "loss": 0.9804, "mean_token_accuracy": 0.6983655095100403, "step": 235 }, { "epoch": 0.16070820565202587, "grad_norm": 2.4595322608947754, "learning_rate": 6.430517711171662e-06, "loss": 0.9201, "mean_token_accuracy": 0.7112562954425812, "step": 236 }, { "epoch": 0.16138917262512767, "grad_norm": 2.4260218143463135, "learning_rate": 6.457765667574932e-06, "loss": 0.976, "mean_token_accuracy": 0.7086548507213593, "step": 237 }, { "epoch": 0.1620701395982295, "grad_norm": 2.335083484649658, "learning_rate": 6.4850136239782025e-06, "loss": 0.8805, "mean_token_accuracy": 0.7343010008335114, "step": 238 }, { "epoch": 0.1627511065713313, "grad_norm": 2.3388257026672363, "learning_rate": 6.512261580381472e-06, "loss": 0.9398, "mean_token_accuracy": 0.7115568220615387, "step": 239 }, { "epoch": 0.1634320735444331, "grad_norm": 2.500811815261841, "learning_rate": 6.539509536784742e-06, "loss": 0.9647, "mean_token_accuracy": 0.7099231779575348, "step": 240 }, { "epoch": 0.1641130405175349, "grad_norm": 2.3447611331939697, "learning_rate": 6.566757493188012e-06, "loss": 0.9123, "mean_token_accuracy": 0.7282127737998962, "step": 241 }, { "epoch": 0.1647940074906367, "grad_norm": 2.5431621074676514, "learning_rate": 6.594005449591281e-06, "loss": 0.9492, "mean_token_accuracy": 0.7077975869178772, "step": 242 }, { "epoch": 0.16547497446373852, "grad_norm": 2.2503278255462646, "learning_rate": 6.621253405994551e-06, "loss": 0.8914, "mean_token_accuracy": 0.7168066203594208, "step": 243 }, { "epoch": 0.16615594143684032, "grad_norm": 2.6269919872283936, "learning_rate": 6.648501362397821e-06, "loss": 0.9241, "mean_token_accuracy": 0.7060259282588959, "step": 244 }, { "epoch": 0.16683690840994211, "grad_norm": 2.503176689147949, "learning_rate": 6.6757493188010906e-06, "loss": 0.9986, "mean_token_accuracy": 0.7027491331100464, "step": 245 }, { "epoch": 0.1675178753830439, "grad_norm": 2.456500291824341, "learning_rate": 6.702997275204361e-06, "loss": 0.9783, "mean_token_accuracy": 0.6997337937355042, "step": 246 }, { "epoch": 0.16819884235614574, "grad_norm": 2.400103807449341, "learning_rate": 6.730245231607629e-06, "loss": 1.1245, "mean_token_accuracy": 0.6615694165229797, "step": 247 }, { "epoch": 0.16887980932924754, "grad_norm": 2.743508815765381, "learning_rate": 6.7574931880109e-06, "loss": 0.9589, "mean_token_accuracy": 0.7100839614868164, "step": 248 }, { "epoch": 0.16956077630234934, "grad_norm": 2.486752510070801, "learning_rate": 6.78474114441417e-06, "loss": 0.9832, "mean_token_accuracy": 0.6934075355529785, "step": 249 }, { "epoch": 0.17024174327545114, "grad_norm": 2.3947010040283203, "learning_rate": 6.8119891008174385e-06, "loss": 0.9391, "mean_token_accuracy": 0.71944859623909, "step": 250 }, { "epoch": 0.17092271024855293, "grad_norm": 2.32716965675354, "learning_rate": 6.8392370572207086e-06, "loss": 1.099, "mean_token_accuracy": 0.6843545436859131, "step": 251 }, { "epoch": 0.17160367722165476, "grad_norm": 2.4989588260650635, "learning_rate": 6.8664850136239795e-06, "loss": 1.1542, "mean_token_accuracy": 0.67347651720047, "step": 252 }, { "epoch": 0.17228464419475656, "grad_norm": 2.3616156578063965, "learning_rate": 6.893732970027248e-06, "loss": 0.9829, "mean_token_accuracy": 0.6981527507305145, "step": 253 }, { "epoch": 0.17296561116785836, "grad_norm": 2.570284605026245, "learning_rate": 6.920980926430518e-06, "loss": 0.9055, "mean_token_accuracy": 0.7147378921508789, "step": 254 }, { "epoch": 0.17364657814096016, "grad_norm": 2.319035053253174, "learning_rate": 6.948228882833788e-06, "loss": 0.9799, "mean_token_accuracy": 0.711506724357605, "step": 255 }, { "epoch": 0.17432754511406195, "grad_norm": 2.4279024600982666, "learning_rate": 6.975476839237057e-06, "loss": 0.9019, "mean_token_accuracy": 0.7186145484447479, "step": 256 }, { "epoch": 0.17500851208716378, "grad_norm": 2.4472837448120117, "learning_rate": 7.002724795640327e-06, "loss": 1.1417, "mean_token_accuracy": 0.6674116253852844, "step": 257 }, { "epoch": 0.17568947906026558, "grad_norm": 2.6238808631896973, "learning_rate": 7.0299727520435975e-06, "loss": 1.0453, "mean_token_accuracy": 0.6689874827861786, "step": 258 }, { "epoch": 0.17637044603336738, "grad_norm": 2.4434561729431152, "learning_rate": 7.057220708446867e-06, "loss": 1.0102, "mean_token_accuracy": 0.6811538338661194, "step": 259 }, { "epoch": 0.17705141300646918, "grad_norm": 2.323099374771118, "learning_rate": 7.084468664850137e-06, "loss": 0.8767, "mean_token_accuracy": 0.7273410856723785, "step": 260 }, { "epoch": 0.177732379979571, "grad_norm": 2.391322135925293, "learning_rate": 7.111716621253406e-06, "loss": 0.9588, "mean_token_accuracy": 0.7081874310970306, "step": 261 }, { "epoch": 0.1784133469526728, "grad_norm": 2.4588427543640137, "learning_rate": 7.138964577656676e-06, "loss": 1.1244, "mean_token_accuracy": 0.6721414625644684, "step": 262 }, { "epoch": 0.1790943139257746, "grad_norm": 2.247615337371826, "learning_rate": 7.166212534059946e-06, "loss": 0.9917, "mean_token_accuracy": 0.6989217102527618, "step": 263 }, { "epoch": 0.1797752808988764, "grad_norm": 2.3428924083709717, "learning_rate": 7.1934604904632155e-06, "loss": 1.068, "mean_token_accuracy": 0.6718483865261078, "step": 264 }, { "epoch": 0.1804562478719782, "grad_norm": 2.6452176570892334, "learning_rate": 7.220708446866486e-06, "loss": 0.8249, "mean_token_accuracy": 0.7430061995983124, "step": 265 }, { "epoch": 0.18113721484508002, "grad_norm": 2.172834634780884, "learning_rate": 7.247956403269756e-06, "loss": 0.9925, "mean_token_accuracy": 0.6928633749485016, "step": 266 }, { "epoch": 0.18181818181818182, "grad_norm": 2.367096185684204, "learning_rate": 7.275204359673025e-06, "loss": 1.054, "mean_token_accuracy": 0.6881313025951385, "step": 267 }, { "epoch": 0.18249914879128362, "grad_norm": 2.3216845989227295, "learning_rate": 7.302452316076295e-06, "loss": 0.9478, "mean_token_accuracy": 0.7130939662456512, "step": 268 }, { "epoch": 0.18318011576438542, "grad_norm": 2.4975552558898926, "learning_rate": 7.329700272479565e-06, "loss": 1.0033, "mean_token_accuracy": 0.6932645440101624, "step": 269 }, { "epoch": 0.18386108273748722, "grad_norm": 2.485316514968872, "learning_rate": 7.356948228882834e-06, "loss": 0.9401, "mean_token_accuracy": 0.711159348487854, "step": 270 }, { "epoch": 0.18454204971058905, "grad_norm": 2.4259192943573, "learning_rate": 7.3841961852861044e-06, "loss": 0.9775, "mean_token_accuracy": 0.6948962509632111, "step": 271 }, { "epoch": 0.18522301668369084, "grad_norm": 2.3818612098693848, "learning_rate": 7.411444141689374e-06, "loss": 0.958, "mean_token_accuracy": 0.7085221409797668, "step": 272 }, { "epoch": 0.18590398365679264, "grad_norm": 2.6644647121429443, "learning_rate": 7.438692098092644e-06, "loss": 0.9456, "mean_token_accuracy": 0.7025474607944489, "step": 273 }, { "epoch": 0.18658495062989444, "grad_norm": 2.5717861652374268, "learning_rate": 7.465940054495914e-06, "loss": 1.0075, "mean_token_accuracy": 0.6934919059276581, "step": 274 }, { "epoch": 0.18726591760299627, "grad_norm": 2.402524471282959, "learning_rate": 7.493188010899183e-06, "loss": 1.0108, "mean_token_accuracy": 0.6931455433368683, "step": 275 }, { "epoch": 0.18794688457609807, "grad_norm": 2.4184415340423584, "learning_rate": 7.520435967302453e-06, "loss": 0.9809, "mean_token_accuracy": 0.6840399503707886, "step": 276 }, { "epoch": 0.18862785154919987, "grad_norm": 2.7560830116271973, "learning_rate": 7.547683923705723e-06, "loss": 0.959, "mean_token_accuracy": 0.6913843154907227, "step": 277 }, { "epoch": 0.18930881852230166, "grad_norm": 2.579984664916992, "learning_rate": 7.5749318801089925e-06, "loss": 0.8703, "mean_token_accuracy": 0.721640408039093, "step": 278 }, { "epoch": 0.18998978549540346, "grad_norm": 2.2343556880950928, "learning_rate": 7.602179836512263e-06, "loss": 1.0668, "mean_token_accuracy": 0.6854850053787231, "step": 279 }, { "epoch": 0.1906707524685053, "grad_norm": 2.5698769092559814, "learning_rate": 7.629427792915532e-06, "loss": 0.9535, "mean_token_accuracy": 0.7032313048839569, "step": 280 }, { "epoch": 0.1913517194416071, "grad_norm": 2.5981273651123047, "learning_rate": 7.656675749318802e-06, "loss": 0.9924, "mean_token_accuracy": 0.6908433437347412, "step": 281 }, { "epoch": 0.1920326864147089, "grad_norm": 2.462510108947754, "learning_rate": 7.683923705722072e-06, "loss": 1.0571, "mean_token_accuracy": 0.6791988611221313, "step": 282 }, { "epoch": 0.19271365338781068, "grad_norm": 2.387613534927368, "learning_rate": 7.71117166212534e-06, "loss": 0.939, "mean_token_accuracy": 0.7133424878120422, "step": 283 }, { "epoch": 0.19339462036091248, "grad_norm": 2.877833843231201, "learning_rate": 7.73841961852861e-06, "loss": 1.0557, "mean_token_accuracy": 0.6960656940937042, "step": 284 }, { "epoch": 0.1940755873340143, "grad_norm": 2.4619460105895996, "learning_rate": 7.76566757493188e-06, "loss": 0.9272, "mean_token_accuracy": 0.7261895537376404, "step": 285 }, { "epoch": 0.1947565543071161, "grad_norm": 2.7265989780426025, "learning_rate": 7.79291553133515e-06, "loss": 0.9358, "mean_token_accuracy": 0.7168986797332764, "step": 286 }, { "epoch": 0.1954375212802179, "grad_norm": 2.5815954208374023, "learning_rate": 7.82016348773842e-06, "loss": 0.995, "mean_token_accuracy": 0.7055497765541077, "step": 287 }, { "epoch": 0.1961184882533197, "grad_norm": 2.3219621181488037, "learning_rate": 7.847411444141691e-06, "loss": 0.9848, "mean_token_accuracy": 0.6835529208183289, "step": 288 }, { "epoch": 0.1967994552264215, "grad_norm": 2.4272947311401367, "learning_rate": 7.87465940054496e-06, "loss": 0.9764, "mean_token_accuracy": 0.7100883722305298, "step": 289 }, { "epoch": 0.19748042219952333, "grad_norm": 2.350102424621582, "learning_rate": 7.90190735694823e-06, "loss": 0.9725, "mean_token_accuracy": 0.7077441811561584, "step": 290 }, { "epoch": 0.19816138917262513, "grad_norm": 2.3876049518585205, "learning_rate": 7.9291553133515e-06, "loss": 1.0205, "mean_token_accuracy": 0.6861305832862854, "step": 291 }, { "epoch": 0.19884235614572693, "grad_norm": 2.520486831665039, "learning_rate": 7.95640326975477e-06, "loss": 1.0407, "mean_token_accuracy": 0.6845992803573608, "step": 292 }, { "epoch": 0.19952332311882873, "grad_norm": 2.416527509689331, "learning_rate": 7.98365122615804e-06, "loss": 0.9914, "mean_token_accuracy": 0.7020111083984375, "step": 293 }, { "epoch": 0.20020429009193055, "grad_norm": 2.2531235218048096, "learning_rate": 8.010899182561308e-06, "loss": 0.9473, "mean_token_accuracy": 0.7028734683990479, "step": 294 }, { "epoch": 0.20088525706503235, "grad_norm": 2.466238498687744, "learning_rate": 8.038147138964578e-06, "loss": 0.9347, "mean_token_accuracy": 0.7202176153659821, "step": 295 }, { "epoch": 0.20156622403813415, "grad_norm": 2.2435078620910645, "learning_rate": 8.065395095367848e-06, "loss": 0.9595, "mean_token_accuracy": 0.7071370184421539, "step": 296 }, { "epoch": 0.20224719101123595, "grad_norm": 2.4662270545959473, "learning_rate": 8.092643051771117e-06, "loss": 0.9398, "mean_token_accuracy": 0.7014140784740448, "step": 297 }, { "epoch": 0.20292815798433775, "grad_norm": 2.5057976245880127, "learning_rate": 8.119891008174388e-06, "loss": 0.996, "mean_token_accuracy": 0.7048616111278534, "step": 298 }, { "epoch": 0.20360912495743957, "grad_norm": 2.6503593921661377, "learning_rate": 8.147138964577658e-06, "loss": 0.9977, "mean_token_accuracy": 0.6928986310958862, "step": 299 }, { "epoch": 0.20429009193054137, "grad_norm": 2.4223880767822266, "learning_rate": 8.174386920980927e-06, "loss": 1.0133, "mean_token_accuracy": 0.6957108378410339, "step": 300 }, { "epoch": 0.20497105890364317, "grad_norm": 2.4027483463287354, "learning_rate": 8.201634877384197e-06, "loss": 1.0812, "mean_token_accuracy": 0.6634381413459778, "step": 301 }, { "epoch": 0.20565202587674497, "grad_norm": 2.9333536624908447, "learning_rate": 8.228882833787467e-06, "loss": 0.977, "mean_token_accuracy": 0.7075672149658203, "step": 302 }, { "epoch": 0.20633299284984677, "grad_norm": 2.7351715564727783, "learning_rate": 8.256130790190735e-06, "loss": 0.9169, "mean_token_accuracy": 0.7256814539432526, "step": 303 }, { "epoch": 0.2070139598229486, "grad_norm": 2.495316982269287, "learning_rate": 8.283378746594006e-06, "loss": 0.9583, "mean_token_accuracy": 0.701157420873642, "step": 304 }, { "epoch": 0.2076949267960504, "grad_norm": 2.4682910442352295, "learning_rate": 8.310626702997276e-06, "loss": 0.9527, "mean_token_accuracy": 0.7134932577610016, "step": 305 }, { "epoch": 0.2083758937691522, "grad_norm": 2.6380937099456787, "learning_rate": 8.337874659400546e-06, "loss": 1.0083, "mean_token_accuracy": 0.6828243732452393, "step": 306 }, { "epoch": 0.209056860742254, "grad_norm": 3.055964946746826, "learning_rate": 8.365122615803816e-06, "loss": 0.9526, "mean_token_accuracy": 0.7107780575752258, "step": 307 }, { "epoch": 0.20973782771535582, "grad_norm": 2.55999493598938, "learning_rate": 8.392370572207084e-06, "loss": 1.0403, "mean_token_accuracy": 0.6866686344146729, "step": 308 }, { "epoch": 0.21041879468845762, "grad_norm": 2.549211263656616, "learning_rate": 8.419618528610354e-06, "loss": 0.966, "mean_token_accuracy": 0.7059124708175659, "step": 309 }, { "epoch": 0.21109976166155942, "grad_norm": 2.3441457748413086, "learning_rate": 8.446866485013624e-06, "loss": 0.9401, "mean_token_accuracy": 0.6920433938503265, "step": 310 }, { "epoch": 0.21178072863466121, "grad_norm": 2.298445463180542, "learning_rate": 8.474114441416894e-06, "loss": 1.0637, "mean_token_accuracy": 0.6682690382003784, "step": 311 }, { "epoch": 0.212461695607763, "grad_norm": 2.202338695526123, "learning_rate": 8.501362397820165e-06, "loss": 1.1619, "mean_token_accuracy": 0.6640399992465973, "step": 312 }, { "epoch": 0.21314266258086484, "grad_norm": 2.3283450603485107, "learning_rate": 8.528610354223435e-06, "loss": 0.9897, "mean_token_accuracy": 0.7044062912464142, "step": 313 }, { "epoch": 0.21382362955396664, "grad_norm": 2.195401906967163, "learning_rate": 8.555858310626703e-06, "loss": 0.9777, "mean_token_accuracy": 0.7087279260158539, "step": 314 }, { "epoch": 0.21450459652706844, "grad_norm": 2.6659677028656006, "learning_rate": 8.583106267029973e-06, "loss": 0.98, "mean_token_accuracy": 0.6985203623771667, "step": 315 }, { "epoch": 0.21518556350017023, "grad_norm": 2.5308899879455566, "learning_rate": 8.610354223433243e-06, "loss": 0.9558, "mean_token_accuracy": 0.7016755938529968, "step": 316 }, { "epoch": 0.21586653047327203, "grad_norm": 2.7001047134399414, "learning_rate": 8.637602179836513e-06, "loss": 0.9174, "mean_token_accuracy": 0.7183741629123688, "step": 317 }, { "epoch": 0.21654749744637386, "grad_norm": 2.492892026901245, "learning_rate": 8.664850136239783e-06, "loss": 0.9722, "mean_token_accuracy": 0.7036256194114685, "step": 318 }, { "epoch": 0.21722846441947566, "grad_norm": 2.5683815479278564, "learning_rate": 8.692098092643052e-06, "loss": 0.9999, "mean_token_accuracy": 0.6882671415805817, "step": 319 }, { "epoch": 0.21790943139257746, "grad_norm": 2.2723357677459717, "learning_rate": 8.719346049046322e-06, "loss": 0.8257, "mean_token_accuracy": 0.7404394149780273, "step": 320 }, { "epoch": 0.21859039836567926, "grad_norm": 2.3367652893066406, "learning_rate": 8.746594005449592e-06, "loss": 0.9718, "mean_token_accuracy": 0.7087131440639496, "step": 321 }, { "epoch": 0.21927136533878108, "grad_norm": 2.419600248336792, "learning_rate": 8.773841961852862e-06, "loss": 0.9725, "mean_token_accuracy": 0.7068630754947662, "step": 322 }, { "epoch": 0.21995233231188288, "grad_norm": 2.4433059692382812, "learning_rate": 8.801089918256132e-06, "loss": 0.8592, "mean_token_accuracy": 0.7334822416305542, "step": 323 }, { "epoch": 0.22063329928498468, "grad_norm": 2.5063095092773438, "learning_rate": 8.828337874659402e-06, "loss": 0.94, "mean_token_accuracy": 0.7205169200897217, "step": 324 }, { "epoch": 0.22131426625808648, "grad_norm": 2.4766459465026855, "learning_rate": 8.85558583106267e-06, "loss": 0.9161, "mean_token_accuracy": 0.7160871922969818, "step": 325 }, { "epoch": 0.22199523323118828, "grad_norm": 2.384495735168457, "learning_rate": 8.88283378746594e-06, "loss": 0.9065, "mean_token_accuracy": 0.7170606553554535, "step": 326 }, { "epoch": 0.2226762002042901, "grad_norm": 2.6508545875549316, "learning_rate": 8.91008174386921e-06, "loss": 0.9459, "mean_token_accuracy": 0.7112259864807129, "step": 327 }, { "epoch": 0.2233571671773919, "grad_norm": 2.3608648777008057, "learning_rate": 8.937329700272481e-06, "loss": 1.1193, "mean_token_accuracy": 0.6953580677509308, "step": 328 }, { "epoch": 0.2240381341504937, "grad_norm": 2.0368385314941406, "learning_rate": 8.964577656675751e-06, "loss": 0.9833, "mean_token_accuracy": 0.6988213956356049, "step": 329 }, { "epoch": 0.2247191011235955, "grad_norm": 2.294339179992676, "learning_rate": 8.99182561307902e-06, "loss": 0.984, "mean_token_accuracy": 0.68337282538414, "step": 330 }, { "epoch": 0.2254000680966973, "grad_norm": 2.459087371826172, "learning_rate": 9.01907356948229e-06, "loss": 0.9652, "mean_token_accuracy": 0.7128430902957916, "step": 331 }, { "epoch": 0.22608103506979912, "grad_norm": 2.511704206466675, "learning_rate": 9.04632152588556e-06, "loss": 0.9963, "mean_token_accuracy": 0.7006899416446686, "step": 332 }, { "epoch": 0.22676200204290092, "grad_norm": 2.388558864593506, "learning_rate": 9.073569482288828e-06, "loss": 0.9417, "mean_token_accuracy": 0.724176287651062, "step": 333 }, { "epoch": 0.22744296901600272, "grad_norm": 2.467257499694824, "learning_rate": 9.1008174386921e-06, "loss": 0.9312, "mean_token_accuracy": 0.7206425666809082, "step": 334 }, { "epoch": 0.22812393598910452, "grad_norm": 2.3229639530181885, "learning_rate": 9.12806539509537e-06, "loss": 0.9523, "mean_token_accuracy": 0.7173760533332825, "step": 335 }, { "epoch": 0.22880490296220635, "grad_norm": 2.275330066680908, "learning_rate": 9.155313351498638e-06, "loss": 1.0007, "mean_token_accuracy": 0.6973693668842316, "step": 336 }, { "epoch": 0.22948586993530815, "grad_norm": 2.328838348388672, "learning_rate": 9.182561307901908e-06, "loss": 1.0192, "mean_token_accuracy": 0.6913915872573853, "step": 337 }, { "epoch": 0.23016683690840994, "grad_norm": 2.316513776779175, "learning_rate": 9.209809264305178e-06, "loss": 1.011, "mean_token_accuracy": 0.6970493495464325, "step": 338 }, { "epoch": 0.23084780388151174, "grad_norm": 2.4107186794281006, "learning_rate": 9.237057220708447e-06, "loss": 0.922, "mean_token_accuracy": 0.7231188714504242, "step": 339 }, { "epoch": 0.23152877085461354, "grad_norm": 2.7293355464935303, "learning_rate": 9.264305177111717e-06, "loss": 1.0344, "mean_token_accuracy": 0.6808110475540161, "step": 340 }, { "epoch": 0.23220973782771537, "grad_norm": 2.186462640762329, "learning_rate": 9.291553133514987e-06, "loss": 0.9567, "mean_token_accuracy": 0.7124492526054382, "step": 341 }, { "epoch": 0.23289070480081717, "grad_norm": 2.1887826919555664, "learning_rate": 9.318801089918257e-06, "loss": 0.9745, "mean_token_accuracy": 0.7040637731552124, "step": 342 }, { "epoch": 0.23357167177391897, "grad_norm": 2.1994571685791016, "learning_rate": 9.346049046321527e-06, "loss": 1.0573, "mean_token_accuracy": 0.6907682716846466, "step": 343 }, { "epoch": 0.23425263874702076, "grad_norm": 2.253206968307495, "learning_rate": 9.373297002724796e-06, "loss": 1.0407, "mean_token_accuracy": 0.6822247505187988, "step": 344 }, { "epoch": 0.23493360572012256, "grad_norm": 2.1321213245391846, "learning_rate": 9.400544959128066e-06, "loss": 1.0727, "mean_token_accuracy": 0.6847171187400818, "step": 345 }, { "epoch": 0.2356145726932244, "grad_norm": 2.367784023284912, "learning_rate": 9.427792915531336e-06, "loss": 1.0256, "mean_token_accuracy": 0.6882806718349457, "step": 346 }, { "epoch": 0.2362955396663262, "grad_norm": 2.4051103591918945, "learning_rate": 9.455040871934606e-06, "loss": 0.9729, "mean_token_accuracy": 0.7015352249145508, "step": 347 }, { "epoch": 0.236976506639428, "grad_norm": 2.1547183990478516, "learning_rate": 9.482288828337876e-06, "loss": 1.0023, "mean_token_accuracy": 0.7006651759147644, "step": 348 }, { "epoch": 0.23765747361252978, "grad_norm": 2.21321439743042, "learning_rate": 9.509536784741146e-06, "loss": 0.9917, "mean_token_accuracy": 0.6978866159915924, "step": 349 }, { "epoch": 0.23833844058563158, "grad_norm": 2.161283493041992, "learning_rate": 9.536784741144414e-06, "loss": 0.9426, "mean_token_accuracy": 0.7070726454257965, "step": 350 }, { "epoch": 0.2390194075587334, "grad_norm": 2.279925584793091, "learning_rate": 9.564032697547684e-06, "loss": 0.9655, "mean_token_accuracy": 0.7124730348587036, "step": 351 }, { "epoch": 0.2397003745318352, "grad_norm": 2.3233535289764404, "learning_rate": 9.591280653950955e-06, "loss": 0.9763, "mean_token_accuracy": 0.6976611614227295, "step": 352 }, { "epoch": 0.240381341504937, "grad_norm": 2.2919647693634033, "learning_rate": 9.618528610354225e-06, "loss": 0.9922, "mean_token_accuracy": 0.6854059994220734, "step": 353 }, { "epoch": 0.2410623084780388, "grad_norm": 2.397564172744751, "learning_rate": 9.645776566757495e-06, "loss": 0.9863, "mean_token_accuracy": 0.7029752731323242, "step": 354 }, { "epoch": 0.24174327545114063, "grad_norm": 2.644530773162842, "learning_rate": 9.673024523160763e-06, "loss": 0.9979, "mean_token_accuracy": 0.6844356954097748, "step": 355 }, { "epoch": 0.24242424242424243, "grad_norm": 2.487659215927124, "learning_rate": 9.700272479564033e-06, "loss": 1.003, "mean_token_accuracy": 0.6811676621437073, "step": 356 }, { "epoch": 0.24310520939734423, "grad_norm": 2.4270448684692383, "learning_rate": 9.727520435967303e-06, "loss": 0.9357, "mean_token_accuracy": 0.7066078186035156, "step": 357 }, { "epoch": 0.24378617637044603, "grad_norm": 2.3406567573547363, "learning_rate": 9.754768392370573e-06, "loss": 1.0191, "mean_token_accuracy": 0.684261828660965, "step": 358 }, { "epoch": 0.24446714334354783, "grad_norm": 2.3558475971221924, "learning_rate": 9.782016348773843e-06, "loss": 0.9388, "mean_token_accuracy": 0.705297976732254, "step": 359 }, { "epoch": 0.24514811031664965, "grad_norm": 2.3530051708221436, "learning_rate": 9.809264305177114e-06, "loss": 1.0846, "mean_token_accuracy": 0.6640603542327881, "step": 360 }, { "epoch": 0.24582907728975145, "grad_norm": 2.5148766040802, "learning_rate": 9.836512261580382e-06, "loss": 0.9652, "mean_token_accuracy": 0.7036284506320953, "step": 361 }, { "epoch": 0.24651004426285325, "grad_norm": 2.160193681716919, "learning_rate": 9.863760217983652e-06, "loss": 1.0482, "mean_token_accuracy": 0.6774969398975372, "step": 362 }, { "epoch": 0.24719101123595505, "grad_norm": 2.145566701889038, "learning_rate": 9.89100817438692e-06, "loss": 0.9555, "mean_token_accuracy": 0.7080468833446503, "step": 363 }, { "epoch": 0.24787197820905685, "grad_norm": 2.586097478866577, "learning_rate": 9.918256130790192e-06, "loss": 0.9914, "mean_token_accuracy": 0.7039324939250946, "step": 364 }, { "epoch": 0.24855294518215867, "grad_norm": 2.1454222202301025, "learning_rate": 9.945504087193462e-06, "loss": 1.0387, "mean_token_accuracy": 0.6773696839809418, "step": 365 }, { "epoch": 0.24923391215526047, "grad_norm": 2.5065720081329346, "learning_rate": 9.97275204359673e-06, "loss": 0.8749, "mean_token_accuracy": 0.7354214191436768, "step": 366 }, { "epoch": 0.24991487912836227, "grad_norm": 2.5121684074401855, "learning_rate": 1e-05, "loss": 0.9886, "mean_token_accuracy": 0.6992575228214264, "step": 367 }, { "epoch": 0.25059584610146407, "grad_norm": 2.3112854957580566, "learning_rate": 9.999999492541635e-06, "loss": 0.9718, "mean_token_accuracy": 0.7064096927642822, "step": 368 }, { "epoch": 0.2512768130745659, "grad_norm": 2.301638126373291, "learning_rate": 9.999997970166641e-06, "loss": 1.0474, "mean_token_accuracy": 0.671990841627121, "step": 369 }, { "epoch": 0.25195778004766767, "grad_norm": 2.18278169631958, "learning_rate": 9.99999543287533e-06, "loss": 1.0648, "mean_token_accuracy": 0.6842436194419861, "step": 370 }, { "epoch": 0.2526387470207695, "grad_norm": 2.495713949203491, "learning_rate": 9.999991880668213e-06, "loss": 0.9089, "mean_token_accuracy": 0.7111038863658905, "step": 371 }, { "epoch": 0.2533197139938713, "grad_norm": 2.2637078762054443, "learning_rate": 9.999987313546014e-06, "loss": 0.963, "mean_token_accuracy": 0.7125892639160156, "step": 372 }, { "epoch": 0.2540006809669731, "grad_norm": 2.4778506755828857, "learning_rate": 9.999981731509658e-06, "loss": 0.9855, "mean_token_accuracy": 0.7086583971977234, "step": 373 }, { "epoch": 0.2546816479400749, "grad_norm": 2.415813446044922, "learning_rate": 9.99997513456028e-06, "loss": 1.0185, "mean_token_accuracy": 0.692923903465271, "step": 374 }, { "epoch": 0.2553626149131767, "grad_norm": 2.618952989578247, "learning_rate": 9.999967522699218e-06, "loss": 0.8848, "mean_token_accuracy": 0.7289597690105438, "step": 375 }, { "epoch": 0.2560435818862785, "grad_norm": 2.1689367294311523, "learning_rate": 9.999958895928017e-06, "loss": 0.9416, "mean_token_accuracy": 0.7054961323738098, "step": 376 }, { "epoch": 0.25672454885938034, "grad_norm": 2.352041006088257, "learning_rate": 9.99994925424843e-06, "loss": 0.936, "mean_token_accuracy": 0.720831036567688, "step": 377 }, { "epoch": 0.2574055158324821, "grad_norm": 2.3896071910858154, "learning_rate": 9.99993859766241e-06, "loss": 0.9807, "mean_token_accuracy": 0.6900866627693176, "step": 378 }, { "epoch": 0.25808648280558394, "grad_norm": 2.2575409412384033, "learning_rate": 9.999926926172123e-06, "loss": 0.9962, "mean_token_accuracy": 0.6807062923908234, "step": 379 }, { "epoch": 0.2587674497786857, "grad_norm": 2.5600075721740723, "learning_rate": 9.99991423977994e-06, "loss": 0.9661, "mean_token_accuracy": 0.709142804145813, "step": 380 }, { "epoch": 0.25944841675178754, "grad_norm": 2.301405906677246, "learning_rate": 9.999900538488432e-06, "loss": 0.9109, "mean_token_accuracy": 0.7192485630512238, "step": 381 }, { "epoch": 0.26012938372488936, "grad_norm": 2.1667094230651855, "learning_rate": 9.999885822300381e-06, "loss": 1.0335, "mean_token_accuracy": 0.6861271262168884, "step": 382 }, { "epoch": 0.26081035069799113, "grad_norm": 2.2174015045166016, "learning_rate": 9.999870091218777e-06, "loss": 1.0364, "mean_token_accuracy": 0.6887018978595734, "step": 383 }, { "epoch": 0.26149131767109296, "grad_norm": 2.4415950775146484, "learning_rate": 9.999853345246811e-06, "loss": 0.9581, "mean_token_accuracy": 0.7032292485237122, "step": 384 }, { "epoch": 0.26217228464419473, "grad_norm": 2.193195104598999, "learning_rate": 9.999835584387883e-06, "loss": 0.9404, "mean_token_accuracy": 0.7065910696983337, "step": 385 }, { "epoch": 0.26285325161729656, "grad_norm": 2.4571070671081543, "learning_rate": 9.999816808645598e-06, "loss": 0.897, "mean_token_accuracy": 0.7248788475990295, "step": 386 }, { "epoch": 0.2635342185903984, "grad_norm": 2.296298027038574, "learning_rate": 9.999797018023764e-06, "loss": 0.9434, "mean_token_accuracy": 0.7135510146617889, "step": 387 }, { "epoch": 0.26421518556350015, "grad_norm": 2.4014034271240234, "learning_rate": 9.999776212526402e-06, "loss": 0.953, "mean_token_accuracy": 0.7156078517436981, "step": 388 }, { "epoch": 0.264896152536602, "grad_norm": 2.1783459186553955, "learning_rate": 9.999754392157736e-06, "loss": 0.9404, "mean_token_accuracy": 0.7128735780715942, "step": 389 }, { "epoch": 0.26557711950970375, "grad_norm": 2.330338478088379, "learning_rate": 9.999731556922193e-06, "loss": 0.995, "mean_token_accuracy": 0.680774450302124, "step": 390 }, { "epoch": 0.2662580864828056, "grad_norm": 2.079843282699585, "learning_rate": 9.999707706824406e-06, "loss": 1.0203, "mean_token_accuracy": 0.6842207610607147, "step": 391 }, { "epoch": 0.2669390534559074, "grad_norm": 2.2989661693573, "learning_rate": 9.99968284186922e-06, "loss": 1.0058, "mean_token_accuracy": 0.6958125531673431, "step": 392 }, { "epoch": 0.2676200204290092, "grad_norm": 2.1404666900634766, "learning_rate": 9.999656962061681e-06, "loss": 0.9758, "mean_token_accuracy": 0.7088800370693207, "step": 393 }, { "epoch": 0.268300987402111, "grad_norm": 2.3489840030670166, "learning_rate": 9.999630067407042e-06, "loss": 0.9357, "mean_token_accuracy": 0.7240843176841736, "step": 394 }, { "epoch": 0.26898195437521283, "grad_norm": 2.5099830627441406, "learning_rate": 9.999602157910762e-06, "loss": 0.922, "mean_token_accuracy": 0.7219201922416687, "step": 395 }, { "epoch": 0.2696629213483146, "grad_norm": 2.2246532440185547, "learning_rate": 9.999573233578506e-06, "loss": 1.0191, "mean_token_accuracy": 0.6845537424087524, "step": 396 }, { "epoch": 0.2703438883214164, "grad_norm": 2.494414806365967, "learning_rate": 9.999543294416146e-06, "loss": 0.898, "mean_token_accuracy": 0.7185659110546112, "step": 397 }, { "epoch": 0.2710248552945182, "grad_norm": 2.1388471126556396, "learning_rate": 9.999512340429757e-06, "loss": 0.9064, "mean_token_accuracy": 0.7293615341186523, "step": 398 }, { "epoch": 0.27170582226762, "grad_norm": 2.5193071365356445, "learning_rate": 9.999480371625624e-06, "loss": 0.8977, "mean_token_accuracy": 0.7205256223678589, "step": 399 }, { "epoch": 0.27238678924072185, "grad_norm": 2.391765832901001, "learning_rate": 9.999447388010238e-06, "loss": 0.873, "mean_token_accuracy": 0.7368955612182617, "step": 400 }, { "epoch": 0.2730677562138236, "grad_norm": 2.3082003593444824, "learning_rate": 9.999413389590292e-06, "loss": 0.9472, "mean_token_accuracy": 0.7172841131687164, "step": 401 }, { "epoch": 0.27374872318692545, "grad_norm": 2.313432216644287, "learning_rate": 9.999378376372684e-06, "loss": 0.9829, "mean_token_accuracy": 0.6981299519538879, "step": 402 }, { "epoch": 0.2744296901600272, "grad_norm": 2.3099231719970703, "learning_rate": 9.999342348364527e-06, "loss": 0.9992, "mean_token_accuracy": 0.7011659741401672, "step": 403 }, { "epoch": 0.27511065713312904, "grad_norm": 2.2572107315063477, "learning_rate": 9.99930530557313e-06, "loss": 0.9776, "mean_token_accuracy": 0.6955945193767548, "step": 404 }, { "epoch": 0.27579162410623087, "grad_norm": 2.396789073944092, "learning_rate": 9.999267248006013e-06, "loss": 0.9004, "mean_token_accuracy": 0.7098581492900848, "step": 405 }, { "epoch": 0.27647259107933264, "grad_norm": 2.311366558074951, "learning_rate": 9.999228175670904e-06, "loss": 1.0551, "mean_token_accuracy": 0.6658292412757874, "step": 406 }, { "epoch": 0.27715355805243447, "grad_norm": 2.6561806201934814, "learning_rate": 9.999188088575729e-06, "loss": 0.9699, "mean_token_accuracy": 0.690764993429184, "step": 407 }, { "epoch": 0.27783452502553624, "grad_norm": 2.292785882949829, "learning_rate": 9.999146986728628e-06, "loss": 0.9438, "mean_token_accuracy": 0.7159900963306427, "step": 408 }, { "epoch": 0.27851549199863807, "grad_norm": 2.4209635257720947, "learning_rate": 9.999104870137944e-06, "loss": 0.9311, "mean_token_accuracy": 0.70830437541008, "step": 409 }, { "epoch": 0.2791964589717399, "grad_norm": 2.251513957977295, "learning_rate": 9.999061738812224e-06, "loss": 0.9874, "mean_token_accuracy": 0.6831661462783813, "step": 410 }, { "epoch": 0.27987742594484166, "grad_norm": 2.220999002456665, "learning_rate": 9.999017592760225e-06, "loss": 0.8926, "mean_token_accuracy": 0.7355753779411316, "step": 411 }, { "epoch": 0.2805583929179435, "grad_norm": 2.2663512229919434, "learning_rate": 9.998972431990908e-06, "loss": 0.9616, "mean_token_accuracy": 0.6977421939373016, "step": 412 }, { "epoch": 0.28123935989104526, "grad_norm": 2.118224620819092, "learning_rate": 9.99892625651344e-06, "loss": 1.0099, "mean_token_accuracy": 0.7042803168296814, "step": 413 }, { "epoch": 0.2819203268641471, "grad_norm": 2.4323995113372803, "learning_rate": 9.99887906633719e-06, "loss": 0.9246, "mean_token_accuracy": 0.7234799861907959, "step": 414 }, { "epoch": 0.2826012938372489, "grad_norm": 2.3241310119628906, "learning_rate": 9.998830861471742e-06, "loss": 1.0192, "mean_token_accuracy": 0.6849901378154755, "step": 415 }, { "epoch": 0.2832822608103507, "grad_norm": 2.248262405395508, "learning_rate": 9.998781641926878e-06, "loss": 0.8546, "mean_token_accuracy": 0.7386434674263, "step": 416 }, { "epoch": 0.2839632277834525, "grad_norm": 2.4624736309051514, "learning_rate": 9.99873140771259e-06, "loss": 0.8895, "mean_token_accuracy": 0.7341000437736511, "step": 417 }, { "epoch": 0.2846441947565543, "grad_norm": 2.5308661460876465, "learning_rate": 9.998680158839074e-06, "loss": 0.9607, "mean_token_accuracy": 0.7149818539619446, "step": 418 }, { "epoch": 0.2853251617296561, "grad_norm": 2.2556023597717285, "learning_rate": 9.998627895316732e-06, "loss": 0.9395, "mean_token_accuracy": 0.7168782353401184, "step": 419 }, { "epoch": 0.28600612870275793, "grad_norm": 2.21291184425354, "learning_rate": 9.998574617156174e-06, "loss": 0.9675, "mean_token_accuracy": 0.695939302444458, "step": 420 }, { "epoch": 0.2866870956758597, "grad_norm": 2.2812483310699463, "learning_rate": 9.998520324368213e-06, "loss": 0.9476, "mean_token_accuracy": 0.7099915146827698, "step": 421 }, { "epoch": 0.28736806264896153, "grad_norm": 2.1854047775268555, "learning_rate": 9.998465016963871e-06, "loss": 0.907, "mean_token_accuracy": 0.7135538756847382, "step": 422 }, { "epoch": 0.28804902962206336, "grad_norm": 2.392322301864624, "learning_rate": 9.998408694954374e-06, "loss": 0.9677, "mean_token_accuracy": 0.7063857316970825, "step": 423 }, { "epoch": 0.28872999659516513, "grad_norm": 2.2250330448150635, "learning_rate": 9.998351358351154e-06, "loss": 0.9555, "mean_token_accuracy": 0.7150605916976929, "step": 424 }, { "epoch": 0.28941096356826695, "grad_norm": 2.1784615516662598, "learning_rate": 9.998293007165851e-06, "loss": 1.004, "mean_token_accuracy": 0.6984578371047974, "step": 425 }, { "epoch": 0.2900919305413687, "grad_norm": 2.0191972255706787, "learning_rate": 9.998233641410306e-06, "loss": 0.9937, "mean_token_accuracy": 0.7006504535675049, "step": 426 }, { "epoch": 0.29077289751447055, "grad_norm": 2.389235734939575, "learning_rate": 9.998173261096572e-06, "loss": 1.034, "mean_token_accuracy": 0.6822924613952637, "step": 427 }, { "epoch": 0.2914538644875724, "grad_norm": 2.383467197418213, "learning_rate": 9.998111866236907e-06, "loss": 0.9927, "mean_token_accuracy": 0.7044452130794525, "step": 428 }, { "epoch": 0.29213483146067415, "grad_norm": 2.514758586883545, "learning_rate": 9.99804945684377e-06, "loss": 0.9226, "mean_token_accuracy": 0.7227389216423035, "step": 429 }, { "epoch": 0.292815798433776, "grad_norm": 2.6307339668273926, "learning_rate": 9.997986032929827e-06, "loss": 0.9717, "mean_token_accuracy": 0.7060000598430634, "step": 430 }, { "epoch": 0.29349676540687775, "grad_norm": 2.841378927230835, "learning_rate": 9.997921594507957e-06, "loss": 0.8762, "mean_token_accuracy": 0.7270539700984955, "step": 431 }, { "epoch": 0.2941777323799796, "grad_norm": 2.506187677383423, "learning_rate": 9.997856141591238e-06, "loss": 0.9453, "mean_token_accuracy": 0.7163543403148651, "step": 432 }, { "epoch": 0.2948586993530814, "grad_norm": 2.1927988529205322, "learning_rate": 9.997789674192953e-06, "loss": 1.0324, "mean_token_accuracy": 0.6783124208450317, "step": 433 }, { "epoch": 0.29553966632618317, "grad_norm": 2.3827314376831055, "learning_rate": 9.997722192326597e-06, "loss": 1.0696, "mean_token_accuracy": 0.6848655641078949, "step": 434 }, { "epoch": 0.296220633299285, "grad_norm": 2.1834776401519775, "learning_rate": 9.997653696005869e-06, "loss": 0.9743, "mean_token_accuracy": 0.7039608955383301, "step": 435 }, { "epoch": 0.29690160027238677, "grad_norm": 2.314269781112671, "learning_rate": 9.99758418524467e-06, "loss": 0.9919, "mean_token_accuracy": 0.6982151567935944, "step": 436 }, { "epoch": 0.2975825672454886, "grad_norm": 2.2779734134674072, "learning_rate": 9.99751366005711e-06, "loss": 0.9201, "mean_token_accuracy": 0.7111532688140869, "step": 437 }, { "epoch": 0.2982635342185904, "grad_norm": 2.3451437950134277, "learning_rate": 9.997442120457504e-06, "loss": 0.9657, "mean_token_accuracy": 0.7050490975379944, "step": 438 }, { "epoch": 0.2989445011916922, "grad_norm": 2.2787466049194336, "learning_rate": 9.997369566460374e-06, "loss": 0.9628, "mean_token_accuracy": 0.7027779221534729, "step": 439 }, { "epoch": 0.299625468164794, "grad_norm": 2.3270509243011475, "learning_rate": 9.997295998080448e-06, "loss": 0.9046, "mean_token_accuracy": 0.7255857586860657, "step": 440 }, { "epoch": 0.3003064351378958, "grad_norm": 2.145206928253174, "learning_rate": 9.997221415332657e-06, "loss": 0.9419, "mean_token_accuracy": 0.7090073823928833, "step": 441 }, { "epoch": 0.3009874021109976, "grad_norm": 2.1246674060821533, "learning_rate": 9.99714581823214e-06, "loss": 1.0545, "mean_token_accuracy": 0.6652484238147736, "step": 442 }, { "epoch": 0.30166836908409944, "grad_norm": 2.597639560699463, "learning_rate": 9.997069206794246e-06, "loss": 0.9664, "mean_token_accuracy": 0.7039353549480438, "step": 443 }, { "epoch": 0.3023493360572012, "grad_norm": 2.178722858428955, "learning_rate": 9.996991581034523e-06, "loss": 1.0516, "mean_token_accuracy": 0.6813865900039673, "step": 444 }, { "epoch": 0.30303030303030304, "grad_norm": 2.178940534591675, "learning_rate": 9.996912940968727e-06, "loss": 0.9114, "mean_token_accuracy": 0.7116894721984863, "step": 445 }, { "epoch": 0.3037112700034048, "grad_norm": 2.1776633262634277, "learning_rate": 9.996833286612821e-06, "loss": 0.9535, "mean_token_accuracy": 0.707748532295227, "step": 446 }, { "epoch": 0.30439223697650664, "grad_norm": 2.371168375015259, "learning_rate": 9.996752617982976e-06, "loss": 0.9438, "mean_token_accuracy": 0.7154693007469177, "step": 447 }, { "epoch": 0.30507320394960846, "grad_norm": 2.382378578186035, "learning_rate": 9.996670935095563e-06, "loss": 0.9669, "mean_token_accuracy": 0.6985623836517334, "step": 448 }, { "epoch": 0.30575417092271023, "grad_norm": 2.4315671920776367, "learning_rate": 9.996588237967165e-06, "loss": 0.9813, "mean_token_accuracy": 0.7010200619697571, "step": 449 }, { "epoch": 0.30643513789581206, "grad_norm": 2.119302988052368, "learning_rate": 9.996504526614565e-06, "loss": 0.9737, "mean_token_accuracy": 0.7059412896633148, "step": 450 }, { "epoch": 0.30711610486891383, "grad_norm": 2.069575548171997, "learning_rate": 9.996419801054757e-06, "loss": 0.9462, "mean_token_accuracy": 0.7143499553203583, "step": 451 }, { "epoch": 0.30779707184201566, "grad_norm": 2.2435014247894287, "learning_rate": 9.99633406130494e-06, "loss": 0.9389, "mean_token_accuracy": 0.7252131700515747, "step": 452 }, { "epoch": 0.3084780388151175, "grad_norm": 2.324235677719116, "learning_rate": 9.996247307382517e-06, "loss": 0.9772, "mean_token_accuracy": 0.7116954028606415, "step": 453 }, { "epoch": 0.30915900578821925, "grad_norm": 2.054978370666504, "learning_rate": 9.996159539305095e-06, "loss": 0.896, "mean_token_accuracy": 0.7191433012485504, "step": 454 }, { "epoch": 0.3098399727613211, "grad_norm": 2.382418632507324, "learning_rate": 9.996070757090493e-06, "loss": 0.9285, "mean_token_accuracy": 0.7156614065170288, "step": 455 }, { "epoch": 0.3105209397344229, "grad_norm": 2.231088638305664, "learning_rate": 9.99598096075673e-06, "loss": 0.9523, "mean_token_accuracy": 0.6906564831733704, "step": 456 }, { "epoch": 0.3112019067075247, "grad_norm": 2.3891990184783936, "learning_rate": 9.995890150322034e-06, "loss": 0.9728, "mean_token_accuracy": 0.6983745396137238, "step": 457 }, { "epoch": 0.3118828736806265, "grad_norm": 2.3807315826416016, "learning_rate": 9.995798325804838e-06, "loss": 0.9907, "mean_token_accuracy": 0.7059471011161804, "step": 458 }, { "epoch": 0.3125638406537283, "grad_norm": 2.4477615356445312, "learning_rate": 9.995705487223782e-06, "loss": 0.8999, "mean_token_accuracy": 0.7105642855167389, "step": 459 }, { "epoch": 0.3132448076268301, "grad_norm": 2.3295702934265137, "learning_rate": 9.995611634597709e-06, "loss": 1.0631, "mean_token_accuracy": 0.6763808131217957, "step": 460 }, { "epoch": 0.31392577459993193, "grad_norm": 2.128000020980835, "learning_rate": 9.99551676794567e-06, "loss": 1.003, "mean_token_accuracy": 0.6838787794113159, "step": 461 }, { "epoch": 0.3146067415730337, "grad_norm": 2.1613800525665283, "learning_rate": 9.995420887286922e-06, "loss": 1.0006, "mean_token_accuracy": 0.7058010697364807, "step": 462 }, { "epoch": 0.3152877085461355, "grad_norm": 2.018577814102173, "learning_rate": 9.995323992640923e-06, "loss": 1.0473, "mean_token_accuracy": 0.6857033967971802, "step": 463 }, { "epoch": 0.3159686755192373, "grad_norm": 2.3593859672546387, "learning_rate": 9.995226084027349e-06, "loss": 1.0465, "mean_token_accuracy": 0.6873573660850525, "step": 464 }, { "epoch": 0.3166496424923391, "grad_norm": 2.4677512645721436, "learning_rate": 9.995127161466068e-06, "loss": 0.9288, "mean_token_accuracy": 0.7148290276527405, "step": 465 }, { "epoch": 0.31733060946544095, "grad_norm": 2.428712844848633, "learning_rate": 9.99502722497716e-06, "loss": 0.9661, "mean_token_accuracy": 0.7114481627941132, "step": 466 }, { "epoch": 0.3180115764385427, "grad_norm": 2.1164419651031494, "learning_rate": 9.994926274580913e-06, "loss": 0.882, "mean_token_accuracy": 0.7335004508495331, "step": 467 }, { "epoch": 0.31869254341164455, "grad_norm": 2.323544502258301, "learning_rate": 9.994824310297815e-06, "loss": 0.9172, "mean_token_accuracy": 0.7188694775104523, "step": 468 }, { "epoch": 0.3193735103847463, "grad_norm": 2.09665846824646, "learning_rate": 9.994721332148566e-06, "loss": 0.9132, "mean_token_accuracy": 0.7153278291225433, "step": 469 }, { "epoch": 0.32005447735784814, "grad_norm": 2.375788927078247, "learning_rate": 9.994617340154068e-06, "loss": 0.8831, "mean_token_accuracy": 0.7218459248542786, "step": 470 }, { "epoch": 0.32073544433094997, "grad_norm": 2.0667102336883545, "learning_rate": 9.994512334335429e-06, "loss": 1.0085, "mean_token_accuracy": 0.6940750479698181, "step": 471 }, { "epoch": 0.32141641130405174, "grad_norm": 2.199164628982544, "learning_rate": 9.994406314713962e-06, "loss": 0.8958, "mean_token_accuracy": 0.7275827825069427, "step": 472 }, { "epoch": 0.32209737827715357, "grad_norm": 2.4325034618377686, "learning_rate": 9.994299281311191e-06, "loss": 0.8482, "mean_token_accuracy": 0.7408199906349182, "step": 473 }, { "epoch": 0.32277834525025534, "grad_norm": 2.1115922927856445, "learning_rate": 9.99419123414884e-06, "loss": 0.9211, "mean_token_accuracy": 0.717443197965622, "step": 474 }, { "epoch": 0.32345931222335716, "grad_norm": 2.0619957447052, "learning_rate": 9.99408217324884e-06, "loss": 1.0409, "mean_token_accuracy": 0.6862545609474182, "step": 475 }, { "epoch": 0.324140279196459, "grad_norm": 2.276205062866211, "learning_rate": 9.993972098633329e-06, "loss": 1.0093, "mean_token_accuracy": 0.6781291663646698, "step": 476 }, { "epoch": 0.32482124616956076, "grad_norm": 2.063065767288208, "learning_rate": 9.99386101032465e-06, "loss": 1.0383, "mean_token_accuracy": 0.6636005640029907, "step": 477 }, { "epoch": 0.3255022131426626, "grad_norm": 2.2338294982910156, "learning_rate": 9.993748908345353e-06, "loss": 1.007, "mean_token_accuracy": 0.6982744634151459, "step": 478 }, { "epoch": 0.32618318011576436, "grad_norm": 2.2464866638183594, "learning_rate": 9.993635792718194e-06, "loss": 0.9397, "mean_token_accuracy": 0.7108787298202515, "step": 479 }, { "epoch": 0.3268641470888662, "grad_norm": 2.2800803184509277, "learning_rate": 9.99352166346613e-06, "loss": 0.9427, "mean_token_accuracy": 0.7227087318897247, "step": 480 }, { "epoch": 0.327545114061968, "grad_norm": 1.8925576210021973, "learning_rate": 9.993406520612331e-06, "loss": 0.9593, "mean_token_accuracy": 0.7115826308727264, "step": 481 }, { "epoch": 0.3282260810350698, "grad_norm": 2.1063425540924072, "learning_rate": 9.993290364180166e-06, "loss": 0.9692, "mean_token_accuracy": 0.708788126707077, "step": 482 }, { "epoch": 0.3289070480081716, "grad_norm": 2.1262528896331787, "learning_rate": 9.993173194193215e-06, "loss": 0.9978, "mean_token_accuracy": 0.6914385855197906, "step": 483 }, { "epoch": 0.3295880149812734, "grad_norm": 2.1494603157043457, "learning_rate": 9.99305501067526e-06, "loss": 0.8965, "mean_token_accuracy": 0.7120167315006256, "step": 484 }, { "epoch": 0.3302689819543752, "grad_norm": 2.108220100402832, "learning_rate": 9.992935813650292e-06, "loss": 0.9194, "mean_token_accuracy": 0.7132964134216309, "step": 485 }, { "epoch": 0.33094994892747703, "grad_norm": 2.143585443496704, "learning_rate": 9.992815603142507e-06, "loss": 0.9647, "mean_token_accuracy": 0.7101857662200928, "step": 486 }, { "epoch": 0.3316309159005788, "grad_norm": 2.1708741188049316, "learning_rate": 9.992694379176302e-06, "loss": 0.965, "mean_token_accuracy": 0.6973857581615448, "step": 487 }, { "epoch": 0.33231188287368063, "grad_norm": 2.1467974185943604, "learning_rate": 9.992572141776285e-06, "loss": 0.9948, "mean_token_accuracy": 0.6805733144283295, "step": 488 }, { "epoch": 0.33299284984678246, "grad_norm": 2.1149961948394775, "learning_rate": 9.99244889096727e-06, "loss": 1.0826, "mean_token_accuracy": 0.6869708001613617, "step": 489 }, { "epoch": 0.33367381681988423, "grad_norm": 2.0201480388641357, "learning_rate": 9.99232462677427e-06, "loss": 0.9558, "mean_token_accuracy": 0.708832710981369, "step": 490 }, { "epoch": 0.33435478379298605, "grad_norm": 2.320496082305908, "learning_rate": 9.992199349222515e-06, "loss": 0.901, "mean_token_accuracy": 0.7193293273448944, "step": 491 }, { "epoch": 0.3350357507660878, "grad_norm": 2.266780376434326, "learning_rate": 9.99207305833743e-06, "loss": 0.9223, "mean_token_accuracy": 0.7176731526851654, "step": 492 }, { "epoch": 0.33571671773918965, "grad_norm": 2.0865797996520996, "learning_rate": 9.991945754144653e-06, "loss": 0.9775, "mean_token_accuracy": 0.7102729678153992, "step": 493 }, { "epoch": 0.3363976847122915, "grad_norm": 2.4216251373291016, "learning_rate": 9.99181743667002e-06, "loss": 0.8676, "mean_token_accuracy": 0.7354525625705719, "step": 494 }, { "epoch": 0.33707865168539325, "grad_norm": 2.5146193504333496, "learning_rate": 9.991688105939581e-06, "loss": 1.0171, "mean_token_accuracy": 0.6961574256420135, "step": 495 }, { "epoch": 0.3377596186584951, "grad_norm": 1.9460076093673706, "learning_rate": 9.991557761979587e-06, "loss": 1.0063, "mean_token_accuracy": 0.6894212067127228, "step": 496 }, { "epoch": 0.33844058563159685, "grad_norm": 2.065295696258545, "learning_rate": 9.991426404816497e-06, "loss": 1.0218, "mean_token_accuracy": 0.6860966682434082, "step": 497 }, { "epoch": 0.3391215526046987, "grad_norm": 2.0441572666168213, "learning_rate": 9.99129403447697e-06, "loss": 0.997, "mean_token_accuracy": 0.698438972234726, "step": 498 }, { "epoch": 0.3398025195778005, "grad_norm": 2.0648396015167236, "learning_rate": 9.99116065098788e-06, "loss": 1.0582, "mean_token_accuracy": 0.6840884983539581, "step": 499 }, { "epoch": 0.34048348655090227, "grad_norm": 2.0490524768829346, "learning_rate": 9.991026254376302e-06, "loss": 0.8992, "mean_token_accuracy": 0.7182393372058868, "step": 500 }, { "epoch": 0.3411644535240041, "grad_norm": 2.056997537612915, "learning_rate": 9.990890844669509e-06, "loss": 0.9099, "mean_token_accuracy": 0.7252838313579559, "step": 501 }, { "epoch": 0.34184542049710587, "grad_norm": 2.015801191329956, "learning_rate": 9.990754421894995e-06, "loss": 1.0158, "mean_token_accuracy": 0.6997462213039398, "step": 502 }, { "epoch": 0.3425263874702077, "grad_norm": 1.818291187286377, "learning_rate": 9.990616986080449e-06, "loss": 1.0129, "mean_token_accuracy": 0.695663720369339, "step": 503 }, { "epoch": 0.3432073544433095, "grad_norm": 2.0664823055267334, "learning_rate": 9.990478537253764e-06, "loss": 0.9668, "mean_token_accuracy": 0.7126889824867249, "step": 504 }, { "epoch": 0.3438883214164113, "grad_norm": 2.1332082748413086, "learning_rate": 9.99033907544305e-06, "loss": 0.9763, "mean_token_accuracy": 0.7068667411804199, "step": 505 }, { "epoch": 0.3445692883895131, "grad_norm": 2.3636858463287354, "learning_rate": 9.990198600676609e-06, "loss": 0.8593, "mean_token_accuracy": 0.7319839894771576, "step": 506 }, { "epoch": 0.3452502553626149, "grad_norm": 2.414386749267578, "learning_rate": 9.990057112982959e-06, "loss": 1.0025, "mean_token_accuracy": 0.69935142993927, "step": 507 }, { "epoch": 0.3459312223357167, "grad_norm": 2.180803060531616, "learning_rate": 9.989914612390818e-06, "loss": 0.9627, "mean_token_accuracy": 0.7047113478183746, "step": 508 }, { "epoch": 0.34661218930881854, "grad_norm": 2.111180067062378, "learning_rate": 9.989771098929113e-06, "loss": 0.9286, "mean_token_accuracy": 0.716584175825119, "step": 509 }, { "epoch": 0.3472931562819203, "grad_norm": 2.351958751678467, "learning_rate": 9.98962657262697e-06, "loss": 0.8639, "mean_token_accuracy": 0.7418828308582306, "step": 510 }, { "epoch": 0.34797412325502214, "grad_norm": 2.285104990005493, "learning_rate": 9.989481033513734e-06, "loss": 0.97, "mean_token_accuracy": 0.6935476362705231, "step": 511 }, { "epoch": 0.3486550902281239, "grad_norm": 2.2480032444000244, "learning_rate": 9.98933448161894e-06, "loss": 0.9335, "mean_token_accuracy": 0.7071749567985535, "step": 512 }, { "epoch": 0.34933605720122574, "grad_norm": 2.4739041328430176, "learning_rate": 9.989186916972337e-06, "loss": 0.9012, "mean_token_accuracy": 0.7204695343971252, "step": 513 }, { "epoch": 0.35001702417432756, "grad_norm": 2.243191957473755, "learning_rate": 9.989038339603878e-06, "loss": 0.9352, "mean_token_accuracy": 0.7080223560333252, "step": 514 }, { "epoch": 0.35069799114742933, "grad_norm": 2.2663280963897705, "learning_rate": 9.988888749543723e-06, "loss": 0.957, "mean_token_accuracy": 0.7068644464015961, "step": 515 }, { "epoch": 0.35137895812053116, "grad_norm": 2.1864943504333496, "learning_rate": 9.988738146822237e-06, "loss": 0.9628, "mean_token_accuracy": 0.7084029018878937, "step": 516 }, { "epoch": 0.352059925093633, "grad_norm": 2.2888705730438232, "learning_rate": 9.988586531469989e-06, "loss": 0.9475, "mean_token_accuracy": 0.7208441495895386, "step": 517 }, { "epoch": 0.35274089206673476, "grad_norm": 2.1136088371276855, "learning_rate": 9.988433903517752e-06, "loss": 0.9249, "mean_token_accuracy": 0.7197283506393433, "step": 518 }, { "epoch": 0.3534218590398366, "grad_norm": 2.151231527328491, "learning_rate": 9.988280262996507e-06, "loss": 0.8235, "mean_token_accuracy": 0.7396666407585144, "step": 519 }, { "epoch": 0.35410282601293835, "grad_norm": 2.0631818771362305, "learning_rate": 9.988125609937446e-06, "loss": 0.9998, "mean_token_accuracy": 0.7004601061344147, "step": 520 }, { "epoch": 0.3547837929860402, "grad_norm": 1.990782380104065, "learning_rate": 9.987969944371956e-06, "loss": 0.9629, "mean_token_accuracy": 0.7039539515972137, "step": 521 }, { "epoch": 0.355464759959142, "grad_norm": 1.908792495727539, "learning_rate": 9.987813266331638e-06, "loss": 0.9413, "mean_token_accuracy": 0.7171131670475006, "step": 522 }, { "epoch": 0.3561457269322438, "grad_norm": 2.00396728515625, "learning_rate": 9.987655575848291e-06, "loss": 1.0265, "mean_token_accuracy": 0.6882841885089874, "step": 523 }, { "epoch": 0.3568266939053456, "grad_norm": 2.116198778152466, "learning_rate": 9.987496872953925e-06, "loss": 1.0331, "mean_token_accuracy": 0.6940361261367798, "step": 524 }, { "epoch": 0.3575076608784474, "grad_norm": 2.1327991485595703, "learning_rate": 9.987337157680756e-06, "loss": 0.929, "mean_token_accuracy": 0.7191362082958221, "step": 525 }, { "epoch": 0.3581886278515492, "grad_norm": 1.9717823266983032, "learning_rate": 9.9871764300612e-06, "loss": 1.0399, "mean_token_accuracy": 0.6922063231468201, "step": 526 }, { "epoch": 0.35886959482465103, "grad_norm": 2.1069695949554443, "learning_rate": 9.987014690127886e-06, "loss": 1.0084, "mean_token_accuracy": 0.6877654194831848, "step": 527 }, { "epoch": 0.3595505617977528, "grad_norm": 2.1959080696105957, "learning_rate": 9.986851937913642e-06, "loss": 0.8952, "mean_token_accuracy": 0.7311255037784576, "step": 528 }, { "epoch": 0.3602315287708546, "grad_norm": 1.9723738431930542, "learning_rate": 9.986688173451505e-06, "loss": 1.0728, "mean_token_accuracy": 0.6637049317359924, "step": 529 }, { "epoch": 0.3609124957439564, "grad_norm": 2.0564956665039062, "learning_rate": 9.986523396774716e-06, "loss": 0.9337, "mean_token_accuracy": 0.6985387206077576, "step": 530 }, { "epoch": 0.3615934627170582, "grad_norm": 2.1851539611816406, "learning_rate": 9.98635760791672e-06, "loss": 0.9285, "mean_token_accuracy": 0.711342066526413, "step": 531 }, { "epoch": 0.36227442969016005, "grad_norm": 2.1020615100860596, "learning_rate": 9.986190806911174e-06, "loss": 0.9178, "mean_token_accuracy": 0.7237899005413055, "step": 532 }, { "epoch": 0.3629553966632618, "grad_norm": 1.9078176021575928, "learning_rate": 9.986022993791933e-06, "loss": 0.9987, "mean_token_accuracy": 0.6991820335388184, "step": 533 }, { "epoch": 0.36363636363636365, "grad_norm": 2.1014676094055176, "learning_rate": 9.98585416859306e-06, "loss": 0.9507, "mean_token_accuracy": 0.705043613910675, "step": 534 }, { "epoch": 0.3643173306094654, "grad_norm": 2.0235495567321777, "learning_rate": 9.985684331348824e-06, "loss": 0.9394, "mean_token_accuracy": 0.7120358049869537, "step": 535 }, { "epoch": 0.36499829758256724, "grad_norm": 1.9142918586730957, "learning_rate": 9.985513482093698e-06, "loss": 1.053, "mean_token_accuracy": 0.6946780383586884, "step": 536 }, { "epoch": 0.36567926455566907, "grad_norm": 2.115934133529663, "learning_rate": 9.985341620862365e-06, "loss": 0.8987, "mean_token_accuracy": 0.7217186391353607, "step": 537 }, { "epoch": 0.36636023152877084, "grad_norm": 2.407348155975342, "learning_rate": 9.985168747689706e-06, "loss": 0.9224, "mean_token_accuracy": 0.7153034210205078, "step": 538 }, { "epoch": 0.36704119850187267, "grad_norm": 2.053475856781006, "learning_rate": 9.984994862610815e-06, "loss": 0.8859, "mean_token_accuracy": 0.7238022089004517, "step": 539 }, { "epoch": 0.36772216547497444, "grad_norm": 2.3382248878479004, "learning_rate": 9.984819965660984e-06, "loss": 0.8163, "mean_token_accuracy": 0.7474624514579773, "step": 540 }, { "epoch": 0.36840313244807626, "grad_norm": 2.061796188354492, "learning_rate": 9.984644056875718e-06, "loss": 0.9944, "mean_token_accuracy": 0.7009611129760742, "step": 541 }, { "epoch": 0.3690840994211781, "grad_norm": 2.0382585525512695, "learning_rate": 9.98446713629072e-06, "loss": 0.9375, "mean_token_accuracy": 0.7107587158679962, "step": 542 }, { "epoch": 0.36976506639427986, "grad_norm": 2.282292366027832, "learning_rate": 9.984289203941905e-06, "loss": 0.9413, "mean_token_accuracy": 0.7087579369544983, "step": 543 }, { "epoch": 0.3704460333673817, "grad_norm": 2.4343342781066895, "learning_rate": 9.984110259865387e-06, "loss": 0.9673, "mean_token_accuracy": 0.686899870634079, "step": 544 }, { "epoch": 0.37112700034048346, "grad_norm": 2.2311418056488037, "learning_rate": 9.983930304097493e-06, "loss": 0.8543, "mean_token_accuracy": 0.7299860417842865, "step": 545 }, { "epoch": 0.3718079673135853, "grad_norm": 1.8539135456085205, "learning_rate": 9.983749336674747e-06, "loss": 1.1161, "mean_token_accuracy": 0.6837010383605957, "step": 546 }, { "epoch": 0.3724889342866871, "grad_norm": 2.045074224472046, "learning_rate": 9.983567357633885e-06, "loss": 0.9563, "mean_token_accuracy": 0.7182944416999817, "step": 547 }, { "epoch": 0.3731699012597889, "grad_norm": 2.2967560291290283, "learning_rate": 9.983384367011844e-06, "loss": 0.9959, "mean_token_accuracy": 0.7010380625724792, "step": 548 }, { "epoch": 0.3738508682328907, "grad_norm": 2.2485198974609375, "learning_rate": 9.983200364845769e-06, "loss": 0.8443, "mean_token_accuracy": 0.7442286312580109, "step": 549 }, { "epoch": 0.37453183520599254, "grad_norm": 2.1017518043518066, "learning_rate": 9.98301535117301e-06, "loss": 0.8794, "mean_token_accuracy": 0.7363519668579102, "step": 550 }, { "epoch": 0.3752128021790943, "grad_norm": 2.0562021732330322, "learning_rate": 9.982829326031118e-06, "loss": 1.0708, "mean_token_accuracy": 0.6778472661972046, "step": 551 }, { "epoch": 0.37589376915219613, "grad_norm": 1.8620537519454956, "learning_rate": 9.982642289457858e-06, "loss": 1.0228, "mean_token_accuracy": 0.6777461767196655, "step": 552 }, { "epoch": 0.3765747361252979, "grad_norm": 2.0999233722686768, "learning_rate": 9.982454241491193e-06, "loss": 0.9462, "mean_token_accuracy": 0.7129599452018738, "step": 553 }, { "epoch": 0.37725570309839973, "grad_norm": 2.1758105754852295, "learning_rate": 9.982265182169294e-06, "loss": 0.8948, "mean_token_accuracy": 0.7343894243240356, "step": 554 }, { "epoch": 0.37793667007150156, "grad_norm": 2.139699697494507, "learning_rate": 9.982075111530536e-06, "loss": 0.8539, "mean_token_accuracy": 0.7425749003887177, "step": 555 }, { "epoch": 0.37861763704460333, "grad_norm": 1.94044029712677, "learning_rate": 9.981884029613501e-06, "loss": 1.0351, "mean_token_accuracy": 0.6785255074501038, "step": 556 }, { "epoch": 0.37929860401770515, "grad_norm": 2.1411056518554688, "learning_rate": 9.981691936456975e-06, "loss": 0.8504, "mean_token_accuracy": 0.7406320869922638, "step": 557 }, { "epoch": 0.3799795709908069, "grad_norm": 2.130803346633911, "learning_rate": 9.98149883209995e-06, "loss": 0.9871, "mean_token_accuracy": 0.7010414600372314, "step": 558 }, { "epoch": 0.38066053796390875, "grad_norm": 1.9505959749221802, "learning_rate": 9.981304716581623e-06, "loss": 1.0783, "mean_token_accuracy": 0.6860813796520233, "step": 559 }, { "epoch": 0.3813415049370106, "grad_norm": 2.0795400142669678, "learning_rate": 9.981109589941398e-06, "loss": 0.862, "mean_token_accuracy": 0.7366137504577637, "step": 560 }, { "epoch": 0.38202247191011235, "grad_norm": 2.003538131713867, "learning_rate": 9.980913452218878e-06, "loss": 0.9154, "mean_token_accuracy": 0.7125462293624878, "step": 561 }, { "epoch": 0.3827034388832142, "grad_norm": 2.2340824604034424, "learning_rate": 9.98071630345388e-06, "loss": 1.0039, "mean_token_accuracy": 0.6887751817703247, "step": 562 }, { "epoch": 0.38338440585631595, "grad_norm": 1.9921391010284424, "learning_rate": 9.98051814368642e-06, "loss": 0.9984, "mean_token_accuracy": 0.6903782486915588, "step": 563 }, { "epoch": 0.3840653728294178, "grad_norm": 1.9752492904663086, "learning_rate": 9.98031897295672e-06, "loss": 0.9443, "mean_token_accuracy": 0.7117453217506409, "step": 564 }, { "epoch": 0.3847463398025196, "grad_norm": 2.038235902786255, "learning_rate": 9.98011879130521e-06, "loss": 0.9314, "mean_token_accuracy": 0.7081552147865295, "step": 565 }, { "epoch": 0.38542730677562137, "grad_norm": 2.1939196586608887, "learning_rate": 9.979917598772524e-06, "loss": 0.9961, "mean_token_accuracy": 0.6995570957660675, "step": 566 }, { "epoch": 0.3861082737487232, "grad_norm": 2.0766024589538574, "learning_rate": 9.9797153953995e-06, "loss": 0.8876, "mean_token_accuracy": 0.7295851111412048, "step": 567 }, { "epoch": 0.38678924072182497, "grad_norm": 2.005589485168457, "learning_rate": 9.979512181227181e-06, "loss": 0.9316, "mean_token_accuracy": 0.726781040430069, "step": 568 }, { "epoch": 0.3874702076949268, "grad_norm": 2.106043815612793, "learning_rate": 9.979307956296818e-06, "loss": 0.9891, "mean_token_accuracy": 0.6879585683345795, "step": 569 }, { "epoch": 0.3881511746680286, "grad_norm": 2.2960500717163086, "learning_rate": 9.979102720649863e-06, "loss": 0.8709, "mean_token_accuracy": 0.7292031347751617, "step": 570 }, { "epoch": 0.3888321416411304, "grad_norm": 1.9846009016036987, "learning_rate": 9.978896474327977e-06, "loss": 0.9993, "mean_token_accuracy": 0.6998837888240814, "step": 571 }, { "epoch": 0.3895131086142322, "grad_norm": 2.114276170730591, "learning_rate": 9.978689217373025e-06, "loss": 0.949, "mean_token_accuracy": 0.7219383418560028, "step": 572 }, { "epoch": 0.390194075587334, "grad_norm": 2.3693578243255615, "learning_rate": 9.978480949827077e-06, "loss": 0.8701, "mean_token_accuracy": 0.7302348911762238, "step": 573 }, { "epoch": 0.3908750425604358, "grad_norm": 1.9856659173965454, "learning_rate": 9.978271671732404e-06, "loss": 0.9761, "mean_token_accuracy": 0.7000784575939178, "step": 574 }, { "epoch": 0.39155600953353764, "grad_norm": 2.2501022815704346, "learning_rate": 9.97806138313149e-06, "loss": 1.0297, "mean_token_accuracy": 0.6900023519992828, "step": 575 }, { "epoch": 0.3922369765066394, "grad_norm": 2.256434440612793, "learning_rate": 9.97785008406702e-06, "loss": 0.8851, "mean_token_accuracy": 0.7297556102275848, "step": 576 }, { "epoch": 0.39291794347974124, "grad_norm": 2.0885965824127197, "learning_rate": 9.977637774581881e-06, "loss": 1.1122, "mean_token_accuracy": 0.6756476163864136, "step": 577 }, { "epoch": 0.393598910452843, "grad_norm": 2.2850258350372314, "learning_rate": 9.977424454719171e-06, "loss": 0.9108, "mean_token_accuracy": 0.7197953164577484, "step": 578 }, { "epoch": 0.39427987742594484, "grad_norm": 2.092280626296997, "learning_rate": 9.97721012452219e-06, "loss": 0.8519, "mean_token_accuracy": 0.7353374660015106, "step": 579 }, { "epoch": 0.39496084439904666, "grad_norm": 2.2645888328552246, "learning_rate": 9.976994784034445e-06, "loss": 0.9305, "mean_token_accuracy": 0.7200124561786652, "step": 580 }, { "epoch": 0.39564181137214843, "grad_norm": 2.1197614669799805, "learning_rate": 9.97677843329964e-06, "loss": 0.9297, "mean_token_accuracy": 0.708725243806839, "step": 581 }, { "epoch": 0.39632277834525026, "grad_norm": 2.176368236541748, "learning_rate": 9.9765610723617e-06, "loss": 1.0022, "mean_token_accuracy": 0.6968470215797424, "step": 582 }, { "epoch": 0.3970037453183521, "grad_norm": 2.344726085662842, "learning_rate": 9.976342701264738e-06, "loss": 1.0358, "mean_token_accuracy": 0.6861270368099213, "step": 583 }, { "epoch": 0.39768471229145386, "grad_norm": 2.0996220111846924, "learning_rate": 9.976123320053084e-06, "loss": 0.914, "mean_token_accuracy": 0.7158807814121246, "step": 584 }, { "epoch": 0.3983656792645557, "grad_norm": 2.066058874130249, "learning_rate": 9.975902928771267e-06, "loss": 0.9951, "mean_token_accuracy": 0.6941415667533875, "step": 585 }, { "epoch": 0.39904664623765745, "grad_norm": 2.064539670944214, "learning_rate": 9.975681527464023e-06, "loss": 0.9899, "mean_token_accuracy": 0.6973035335540771, "step": 586 }, { "epoch": 0.3997276132107593, "grad_norm": 2.0973117351531982, "learning_rate": 9.975459116176294e-06, "loss": 1.0227, "mean_token_accuracy": 0.689253956079483, "step": 587 }, { "epoch": 0.4004085801838611, "grad_norm": 1.9377282857894897, "learning_rate": 9.975235694953223e-06, "loss": 0.9606, "mean_token_accuracy": 0.7118803560733795, "step": 588 }, { "epoch": 0.4010895471569629, "grad_norm": 2.045855760574341, "learning_rate": 9.975011263840162e-06, "loss": 0.893, "mean_token_accuracy": 0.7228454649448395, "step": 589 }, { "epoch": 0.4017705141300647, "grad_norm": 1.93856680393219, "learning_rate": 9.97478582288267e-06, "loss": 1.0329, "mean_token_accuracy": 0.6868622303009033, "step": 590 }, { "epoch": 0.4024514811031665, "grad_norm": 2.3114349842071533, "learning_rate": 9.974559372126502e-06, "loss": 0.9521, "mean_token_accuracy": 0.6957688629627228, "step": 591 }, { "epoch": 0.4031324480762683, "grad_norm": 2.094921112060547, "learning_rate": 9.974331911617629e-06, "loss": 0.8842, "mean_token_accuracy": 0.7191742062568665, "step": 592 }, { "epoch": 0.40381341504937013, "grad_norm": 2.3438172340393066, "learning_rate": 9.974103441402218e-06, "loss": 0.9708, "mean_token_accuracy": 0.7044512927532196, "step": 593 }, { "epoch": 0.4044943820224719, "grad_norm": 1.8983951807022095, "learning_rate": 9.973873961526647e-06, "loss": 0.984, "mean_token_accuracy": 0.6940240263938904, "step": 594 }, { "epoch": 0.4051753489955737, "grad_norm": 1.8956478834152222, "learning_rate": 9.973643472037495e-06, "loss": 0.9669, "mean_token_accuracy": 0.7052456140518188, "step": 595 }, { "epoch": 0.4058563159686755, "grad_norm": 2.1105144023895264, "learning_rate": 9.973411972981548e-06, "loss": 0.9682, "mean_token_accuracy": 0.7080371677875519, "step": 596 }, { "epoch": 0.4065372829417773, "grad_norm": 2.0482707023620605, "learning_rate": 9.973179464405797e-06, "loss": 0.9861, "mean_token_accuracy": 0.7008861899375916, "step": 597 }, { "epoch": 0.40721824991487915, "grad_norm": 1.9573239088058472, "learning_rate": 9.972945946357437e-06, "loss": 0.9981, "mean_token_accuracy": 0.6969294548034668, "step": 598 }, { "epoch": 0.4078992168879809, "grad_norm": 2.0037004947662354, "learning_rate": 9.97271141888387e-06, "loss": 1.0507, "mean_token_accuracy": 0.6924587488174438, "step": 599 }, { "epoch": 0.40858018386108275, "grad_norm": 2.231537103652954, "learning_rate": 9.972475882032697e-06, "loss": 1.0018, "mean_token_accuracy": 0.6974407434463501, "step": 600 }, { "epoch": 0.4092611508341845, "grad_norm": 2.170747995376587, "learning_rate": 9.972239335851732e-06, "loss": 0.9194, "mean_token_accuracy": 0.7152959406375885, "step": 601 }, { "epoch": 0.40994211780728634, "grad_norm": 2.278907537460327, "learning_rate": 9.972001780388988e-06, "loss": 0.8124, "mean_token_accuracy": 0.747006356716156, "step": 602 }, { "epoch": 0.41062308478038817, "grad_norm": 1.9696708917617798, "learning_rate": 9.971763215692685e-06, "loss": 1.0646, "mean_token_accuracy": 0.684382438659668, "step": 603 }, { "epoch": 0.41130405175348994, "grad_norm": 1.7912229299545288, "learning_rate": 9.971523641811249e-06, "loss": 1.0783, "mean_token_accuracy": 0.6794540584087372, "step": 604 }, { "epoch": 0.41198501872659177, "grad_norm": 1.9268488883972168, "learning_rate": 9.971283058793306e-06, "loss": 0.9101, "mean_token_accuracy": 0.7199771404266357, "step": 605 }, { "epoch": 0.41266598569969354, "grad_norm": 2.0333728790283203, "learning_rate": 9.971041466687695e-06, "loss": 1.0271, "mean_token_accuracy": 0.6742640435695648, "step": 606 }, { "epoch": 0.41334695267279536, "grad_norm": 2.056119441986084, "learning_rate": 9.970798865543454e-06, "loss": 0.9948, "mean_token_accuracy": 0.7020864486694336, "step": 607 }, { "epoch": 0.4140279196458972, "grad_norm": 1.8139028549194336, "learning_rate": 9.970555255409823e-06, "loss": 0.9793, "mean_token_accuracy": 0.6967433393001556, "step": 608 }, { "epoch": 0.41470888661899896, "grad_norm": 2.233309507369995, "learning_rate": 9.970310636336255e-06, "loss": 0.9941, "mean_token_accuracy": 0.6957831084728241, "step": 609 }, { "epoch": 0.4153898535921008, "grad_norm": 1.9058560132980347, "learning_rate": 9.970065008372403e-06, "loss": 1.0711, "mean_token_accuracy": 0.6831269264221191, "step": 610 }, { "epoch": 0.4160708205652026, "grad_norm": 2.2319300174713135, "learning_rate": 9.969818371568122e-06, "loss": 0.9164, "mean_token_accuracy": 0.714230477809906, "step": 611 }, { "epoch": 0.4167517875383044, "grad_norm": 2.2464499473571777, "learning_rate": 9.96957072597348e-06, "loss": 0.8946, "mean_token_accuracy": 0.7272132635116577, "step": 612 }, { "epoch": 0.4174327545114062, "grad_norm": 2.2274551391601562, "learning_rate": 9.969322071638745e-06, "loss": 0.9402, "mean_token_accuracy": 0.7163926064968109, "step": 613 }, { "epoch": 0.418113721484508, "grad_norm": 2.062544822692871, "learning_rate": 9.969072408614385e-06, "loss": 0.9345, "mean_token_accuracy": 0.7095199227333069, "step": 614 }, { "epoch": 0.4187946884576098, "grad_norm": 2.050663471221924, "learning_rate": 9.96882173695108e-06, "loss": 1.1, "mean_token_accuracy": 0.6727085113525391, "step": 615 }, { "epoch": 0.41947565543071164, "grad_norm": 2.174123525619507, "learning_rate": 9.968570056699712e-06, "loss": 0.8519, "mean_token_accuracy": 0.7204224467277527, "step": 616 }, { "epoch": 0.4201566224038134, "grad_norm": 2.028103828430176, "learning_rate": 9.968317367911368e-06, "loss": 0.8488, "mean_token_accuracy": 0.727837473154068, "step": 617 }, { "epoch": 0.42083758937691523, "grad_norm": 2.0239365100860596, "learning_rate": 9.96806367063734e-06, "loss": 0.9974, "mean_token_accuracy": 0.7039609849452972, "step": 618 }, { "epoch": 0.421518556350017, "grad_norm": 1.9974486827850342, "learning_rate": 9.967808964929125e-06, "loss": 0.9317, "mean_token_accuracy": 0.7100875377655029, "step": 619 }, { "epoch": 0.42219952332311883, "grad_norm": 2.095792770385742, "learning_rate": 9.967553250838422e-06, "loss": 0.9143, "mean_token_accuracy": 0.7185491919517517, "step": 620 }, { "epoch": 0.42288049029622066, "grad_norm": 2.2308130264282227, "learning_rate": 9.967296528417138e-06, "loss": 0.8891, "mean_token_accuracy": 0.7242876291275024, "step": 621 }, { "epoch": 0.42356145726932243, "grad_norm": 2.371182441711426, "learning_rate": 9.967038797717383e-06, "loss": 0.873, "mean_token_accuracy": 0.7181953489780426, "step": 622 }, { "epoch": 0.42424242424242425, "grad_norm": 2.256418228149414, "learning_rate": 9.966780058791472e-06, "loss": 0.9664, "mean_token_accuracy": 0.7076472043991089, "step": 623 }, { "epoch": 0.424923391215526, "grad_norm": 2.089085340499878, "learning_rate": 9.966520311691926e-06, "loss": 1.0667, "mean_token_accuracy": 0.691466361284256, "step": 624 }, { "epoch": 0.42560435818862785, "grad_norm": 1.9843323230743408, "learning_rate": 9.966259556471465e-06, "loss": 1.077, "mean_token_accuracy": 0.6732761263847351, "step": 625 }, { "epoch": 0.4262853251617297, "grad_norm": 2.4986374378204346, "learning_rate": 9.965997793183023e-06, "loss": 1.0576, "mean_token_accuracy": 0.6843494772911072, "step": 626 }, { "epoch": 0.42696629213483145, "grad_norm": 1.9334033727645874, "learning_rate": 9.965735021879733e-06, "loss": 1.0103, "mean_token_accuracy": 0.6912826299667358, "step": 627 }, { "epoch": 0.4276472591079333, "grad_norm": 1.9408955574035645, "learning_rate": 9.96547124261493e-06, "loss": 0.9516, "mean_token_accuracy": 0.703145444393158, "step": 628 }, { "epoch": 0.42832822608103505, "grad_norm": 1.8618842363357544, "learning_rate": 9.965206455442157e-06, "loss": 0.9547, "mean_token_accuracy": 0.7176213264465332, "step": 629 }, { "epoch": 0.4290091930541369, "grad_norm": 1.9169127941131592, "learning_rate": 9.964940660415166e-06, "loss": 0.9252, "mean_token_accuracy": 0.7036819756031036, "step": 630 }, { "epoch": 0.4296901600272387, "grad_norm": 2.073636054992676, "learning_rate": 9.964673857587905e-06, "loss": 1.048, "mean_token_accuracy": 0.6817346811294556, "step": 631 }, { "epoch": 0.43037112700034047, "grad_norm": 2.0416038036346436, "learning_rate": 9.964406047014532e-06, "loss": 0.9051, "mean_token_accuracy": 0.7249629199504852, "step": 632 }, { "epoch": 0.4310520939734423, "grad_norm": 2.100457191467285, "learning_rate": 9.964137228749409e-06, "loss": 0.896, "mean_token_accuracy": 0.727032482624054, "step": 633 }, { "epoch": 0.43173306094654407, "grad_norm": 2.085439443588257, "learning_rate": 9.963867402847099e-06, "loss": 0.942, "mean_token_accuracy": 0.7149258553981781, "step": 634 }, { "epoch": 0.4324140279196459, "grad_norm": 1.9769618511199951, "learning_rate": 9.963596569362372e-06, "loss": 1.0228, "mean_token_accuracy": 0.6839258670806885, "step": 635 }, { "epoch": 0.4330949948927477, "grad_norm": 1.9976650476455688, "learning_rate": 9.963324728350205e-06, "loss": 0.9158, "mean_token_accuracy": 0.7227845788002014, "step": 636 }, { "epoch": 0.4337759618658495, "grad_norm": 2.3210809230804443, "learning_rate": 9.963051879865778e-06, "loss": 0.9808, "mean_token_accuracy": 0.7033112943172455, "step": 637 }, { "epoch": 0.4344569288389513, "grad_norm": 1.940053939819336, "learning_rate": 9.96277802396447e-06, "loss": 0.9457, "mean_token_accuracy": 0.7219929099082947, "step": 638 }, { "epoch": 0.4351378958120531, "grad_norm": 2.034972906112671, "learning_rate": 9.962503160701876e-06, "loss": 0.8927, "mean_token_accuracy": 0.7266741394996643, "step": 639 }, { "epoch": 0.4358188627851549, "grad_norm": 2.1074862480163574, "learning_rate": 9.962227290133782e-06, "loss": 0.9061, "mean_token_accuracy": 0.72125244140625, "step": 640 }, { "epoch": 0.43649982975825674, "grad_norm": 2.1324400901794434, "learning_rate": 9.961950412316188e-06, "loss": 0.9307, "mean_token_accuracy": 0.6968331634998322, "step": 641 }, { "epoch": 0.4371807967313585, "grad_norm": 1.9276288747787476, "learning_rate": 9.961672527305296e-06, "loss": 1.0535, "mean_token_accuracy": 0.682519406080246, "step": 642 }, { "epoch": 0.43786176370446034, "grad_norm": 1.9719661474227905, "learning_rate": 9.961393635157512e-06, "loss": 0.8967, "mean_token_accuracy": 0.7089483141899109, "step": 643 }, { "epoch": 0.43854273067756216, "grad_norm": 1.9920334815979004, "learning_rate": 9.961113735929446e-06, "loss": 0.9768, "mean_token_accuracy": 0.700070858001709, "step": 644 }, { "epoch": 0.43922369765066394, "grad_norm": 1.912139654159546, "learning_rate": 9.960832829677913e-06, "loss": 0.8511, "mean_token_accuracy": 0.7152249217033386, "step": 645 }, { "epoch": 0.43990466462376576, "grad_norm": 2.149502754211426, "learning_rate": 9.960550916459932e-06, "loss": 0.9399, "mean_token_accuracy": 0.7013199627399445, "step": 646 }, { "epoch": 0.44058563159686753, "grad_norm": 2.1375722885131836, "learning_rate": 9.96026799633273e-06, "loss": 0.9181, "mean_token_accuracy": 0.7151254415512085, "step": 647 }, { "epoch": 0.44126659856996936, "grad_norm": 2.105410575866699, "learning_rate": 9.959984069353727e-06, "loss": 0.9363, "mean_token_accuracy": 0.7172991931438446, "step": 648 }, { "epoch": 0.4419475655430712, "grad_norm": 1.9850544929504395, "learning_rate": 9.959699135580562e-06, "loss": 0.8946, "mean_token_accuracy": 0.7271723747253418, "step": 649 }, { "epoch": 0.44262853251617296, "grad_norm": 1.9183616638183594, "learning_rate": 9.959413195071073e-06, "loss": 1.0816, "mean_token_accuracy": 0.681822270154953, "step": 650 }, { "epoch": 0.4433094994892748, "grad_norm": 1.9954922199249268, "learning_rate": 9.959126247883295e-06, "loss": 0.9353, "mean_token_accuracy": 0.7001739144325256, "step": 651 }, { "epoch": 0.44399046646237655, "grad_norm": 1.9702192544937134, "learning_rate": 9.95883829407548e-06, "loss": 0.9032, "mean_token_accuracy": 0.72303706407547, "step": 652 }, { "epoch": 0.4446714334354784, "grad_norm": 2.046172618865967, "learning_rate": 9.958549333706072e-06, "loss": 1.0443, "mean_token_accuracy": 0.6873063147068024, "step": 653 }, { "epoch": 0.4453524004085802, "grad_norm": 1.936923861503601, "learning_rate": 9.958259366833729e-06, "loss": 0.9713, "mean_token_accuracy": 0.7027016580104828, "step": 654 }, { "epoch": 0.446033367381682, "grad_norm": 2.1309261322021484, "learning_rate": 9.957968393517307e-06, "loss": 1.0572, "mean_token_accuracy": 0.6934065818786621, "step": 655 }, { "epoch": 0.4467143343547838, "grad_norm": 1.980984091758728, "learning_rate": 9.957676413815873e-06, "loss": 0.9646, "mean_token_accuracy": 0.6901469230651855, "step": 656 }, { "epoch": 0.4473953013278856, "grad_norm": 2.145024299621582, "learning_rate": 9.95738342778869e-06, "loss": 0.9322, "mean_token_accuracy": 0.7145181596279144, "step": 657 }, { "epoch": 0.4480762683009874, "grad_norm": 1.9246867895126343, "learning_rate": 9.95708943549523e-06, "loss": 0.941, "mean_token_accuracy": 0.7177197635173798, "step": 658 }, { "epoch": 0.44875723527408923, "grad_norm": 1.9214816093444824, "learning_rate": 9.956794436995168e-06, "loss": 0.9778, "mean_token_accuracy": 0.6933042109012604, "step": 659 }, { "epoch": 0.449438202247191, "grad_norm": 2.157296895980835, "learning_rate": 9.956498432348386e-06, "loss": 0.9476, "mean_token_accuracy": 0.7143227458000183, "step": 660 }, { "epoch": 0.4501191692202928, "grad_norm": 2.1313626766204834, "learning_rate": 9.956201421614967e-06, "loss": 0.9222, "mean_token_accuracy": 0.7178921103477478, "step": 661 }, { "epoch": 0.4508001361933946, "grad_norm": 1.993490219116211, "learning_rate": 9.955903404855198e-06, "loss": 0.9561, "mean_token_accuracy": 0.7122507691383362, "step": 662 }, { "epoch": 0.4514811031664964, "grad_norm": 2.037540912628174, "learning_rate": 9.955604382129574e-06, "loss": 0.9649, "mean_token_accuracy": 0.7101705372333527, "step": 663 }, { "epoch": 0.45216207013959825, "grad_norm": 1.9457197189331055, "learning_rate": 9.955304353498787e-06, "loss": 0.9979, "mean_token_accuracy": 0.6999376118183136, "step": 664 }, { "epoch": 0.4528430371127, "grad_norm": 1.8266825675964355, "learning_rate": 9.955003319023743e-06, "loss": 0.9998, "mean_token_accuracy": 0.6861158609390259, "step": 665 }, { "epoch": 0.45352400408580185, "grad_norm": 1.893547773361206, "learning_rate": 9.954701278765544e-06, "loss": 0.8709, "mean_token_accuracy": 0.7341854870319366, "step": 666 }, { "epoch": 0.4542049710589036, "grad_norm": 1.9796000719070435, "learning_rate": 9.9543982327855e-06, "loss": 0.9526, "mean_token_accuracy": 0.6990080773830414, "step": 667 }, { "epoch": 0.45488593803200544, "grad_norm": 2.0276331901550293, "learning_rate": 9.954094181145126e-06, "loss": 1.0021, "mean_token_accuracy": 0.7051751017570496, "step": 668 }, { "epoch": 0.45556690500510727, "grad_norm": 1.7915668487548828, "learning_rate": 9.953789123906137e-06, "loss": 1.083, "mean_token_accuracy": 0.6819418668746948, "step": 669 }, { "epoch": 0.45624787197820904, "grad_norm": 1.9359142780303955, "learning_rate": 9.953483061130453e-06, "loss": 0.9675, "mean_token_accuracy": 0.7063851058483124, "step": 670 }, { "epoch": 0.45692883895131087, "grad_norm": 1.9997704029083252, "learning_rate": 9.953175992880204e-06, "loss": 0.9347, "mean_token_accuracy": 0.7216438353061676, "step": 671 }, { "epoch": 0.4576098059244127, "grad_norm": 1.8078317642211914, "learning_rate": 9.952867919217717e-06, "loss": 1.0136, "mean_token_accuracy": 0.6848367154598236, "step": 672 }, { "epoch": 0.45829077289751446, "grad_norm": 1.867315649986267, "learning_rate": 9.952558840205527e-06, "loss": 0.9574, "mean_token_accuracy": 0.7040181159973145, "step": 673 }, { "epoch": 0.4589717398706163, "grad_norm": 2.143455982208252, "learning_rate": 9.95224875590637e-06, "loss": 0.9996, "mean_token_accuracy": 0.6851459443569183, "step": 674 }, { "epoch": 0.45965270684371806, "grad_norm": 2.032160997390747, "learning_rate": 9.951937666383191e-06, "loss": 0.862, "mean_token_accuracy": 0.7173111736774445, "step": 675 }, { "epoch": 0.4603336738168199, "grad_norm": 1.7756341695785522, "learning_rate": 9.951625571699134e-06, "loss": 1.0391, "mean_token_accuracy": 0.6880825459957123, "step": 676 }, { "epoch": 0.4610146407899217, "grad_norm": 2.0563032627105713, "learning_rate": 9.95131247191755e-06, "loss": 0.9283, "mean_token_accuracy": 0.7128584086894989, "step": 677 }, { "epoch": 0.4616956077630235, "grad_norm": 2.1915323734283447, "learning_rate": 9.95099836710199e-06, "loss": 0.9135, "mean_token_accuracy": 0.7055534422397614, "step": 678 }, { "epoch": 0.4623765747361253, "grad_norm": 1.9432607889175415, "learning_rate": 9.950683257316215e-06, "loss": 0.9615, "mean_token_accuracy": 0.7036134004592896, "step": 679 }, { "epoch": 0.4630575417092271, "grad_norm": 1.954797625541687, "learning_rate": 9.950367142624187e-06, "loss": 0.9686, "mean_token_accuracy": 0.696353942155838, "step": 680 }, { "epoch": 0.4637385086823289, "grad_norm": 2.219552993774414, "learning_rate": 9.95005002309007e-06, "loss": 0.8896, "mean_token_accuracy": 0.737926572561264, "step": 681 }, { "epoch": 0.46441947565543074, "grad_norm": 2.047438144683838, "learning_rate": 9.949731898778238e-06, "loss": 0.9715, "mean_token_accuracy": 0.7020373344421387, "step": 682 }, { "epoch": 0.4651004426285325, "grad_norm": 1.949642300605774, "learning_rate": 9.949412769753262e-06, "loss": 0.9552, "mean_token_accuracy": 0.7100413143634796, "step": 683 }, { "epoch": 0.46578140960163433, "grad_norm": 2.051969528198242, "learning_rate": 9.949092636079918e-06, "loss": 1.0489, "mean_token_accuracy": 0.693653017282486, "step": 684 }, { "epoch": 0.4664623765747361, "grad_norm": 1.6370418071746826, "learning_rate": 9.94877149782319e-06, "loss": 1.0914, "mean_token_accuracy": 0.6859793066978455, "step": 685 }, { "epoch": 0.46714334354783793, "grad_norm": 1.7886003255844116, "learning_rate": 9.948449355048266e-06, "loss": 1.0186, "mean_token_accuracy": 0.6862069964408875, "step": 686 }, { "epoch": 0.46782431052093976, "grad_norm": 1.909091591835022, "learning_rate": 9.948126207820533e-06, "loss": 0.9658, "mean_token_accuracy": 0.6900152266025543, "step": 687 }, { "epoch": 0.4685052774940415, "grad_norm": 2.163109302520752, "learning_rate": 9.947802056205585e-06, "loss": 0.8034, "mean_token_accuracy": 0.7558697760105133, "step": 688 }, { "epoch": 0.46918624446714335, "grad_norm": 2.0253734588623047, "learning_rate": 9.947476900269219e-06, "loss": 0.9236, "mean_token_accuracy": 0.714254766702652, "step": 689 }, { "epoch": 0.4698672114402451, "grad_norm": 2.0825536251068115, "learning_rate": 9.947150740077436e-06, "loss": 0.9221, "mean_token_accuracy": 0.7179655134677887, "step": 690 }, { "epoch": 0.47054817841334695, "grad_norm": 2.0201995372772217, "learning_rate": 9.94682357569644e-06, "loss": 0.823, "mean_token_accuracy": 0.7478184700012207, "step": 691 }, { "epoch": 0.4712291453864488, "grad_norm": 1.884381890296936, "learning_rate": 9.946495407192644e-06, "loss": 0.9129, "mean_token_accuracy": 0.7053926885128021, "step": 692 }, { "epoch": 0.47191011235955055, "grad_norm": 2.088909149169922, "learning_rate": 9.946166234632656e-06, "loss": 0.8942, "mean_token_accuracy": 0.730154812335968, "step": 693 }, { "epoch": 0.4725910793326524, "grad_norm": 2.017477512359619, "learning_rate": 9.945836058083298e-06, "loss": 0.9474, "mean_token_accuracy": 0.6949655711650848, "step": 694 }, { "epoch": 0.47327204630575415, "grad_norm": 1.7892303466796875, "learning_rate": 9.945504877611585e-06, "loss": 0.9275, "mean_token_accuracy": 0.7247048020362854, "step": 695 }, { "epoch": 0.473953013278856, "grad_norm": 1.977307915687561, "learning_rate": 9.945172693284744e-06, "loss": 0.988, "mean_token_accuracy": 0.6966742873191833, "step": 696 }, { "epoch": 0.4746339802519578, "grad_norm": 2.183154344558716, "learning_rate": 9.944839505170202e-06, "loss": 0.9805, "mean_token_accuracy": 0.6983623504638672, "step": 697 }, { "epoch": 0.47531494722505957, "grad_norm": 1.9344450235366821, "learning_rate": 9.944505313335591e-06, "loss": 0.9364, "mean_token_accuracy": 0.7127936482429504, "step": 698 }, { "epoch": 0.4759959141981614, "grad_norm": 1.7469167709350586, "learning_rate": 9.944170117848746e-06, "loss": 0.9621, "mean_token_accuracy": 0.7121807634830475, "step": 699 }, { "epoch": 0.47667688117126317, "grad_norm": 1.7608751058578491, "learning_rate": 9.943833918777705e-06, "loss": 1.0246, "mean_token_accuracy": 0.6872872114181519, "step": 700 }, { "epoch": 0.477357848144365, "grad_norm": 1.9907276630401611, "learning_rate": 9.943496716190714e-06, "loss": 0.9482, "mean_token_accuracy": 0.7189425230026245, "step": 701 }, { "epoch": 0.4780388151174668, "grad_norm": 1.9066959619522095, "learning_rate": 9.943158510156216e-06, "loss": 0.9774, "mean_token_accuracy": 0.7021515071392059, "step": 702 }, { "epoch": 0.4787197820905686, "grad_norm": 2.0131351947784424, "learning_rate": 9.942819300742862e-06, "loss": 0.9781, "mean_token_accuracy": 0.7095919251441956, "step": 703 }, { "epoch": 0.4794007490636704, "grad_norm": 1.907072901725769, "learning_rate": 9.94247908801951e-06, "loss": 0.8782, "mean_token_accuracy": 0.7329100966453552, "step": 704 }, { "epoch": 0.48008171603677224, "grad_norm": 1.9332116842269897, "learning_rate": 9.942137872055212e-06, "loss": 0.947, "mean_token_accuracy": 0.7120083272457123, "step": 705 }, { "epoch": 0.480762683009874, "grad_norm": 2.118163824081421, "learning_rate": 9.941795652919232e-06, "loss": 1.0654, "mean_token_accuracy": 0.669248104095459, "step": 706 }, { "epoch": 0.48144364998297584, "grad_norm": 1.9311203956604004, "learning_rate": 9.941452430681034e-06, "loss": 0.9242, "mean_token_accuracy": 0.7218726575374603, "step": 707 }, { "epoch": 0.4821246169560776, "grad_norm": 2.0250048637390137, "learning_rate": 9.941108205410286e-06, "loss": 1.0102, "mean_token_accuracy": 0.6891524493694305, "step": 708 }, { "epoch": 0.48280558392917944, "grad_norm": 2.1405622959136963, "learning_rate": 9.94076297717686e-06, "loss": 0.9021, "mean_token_accuracy": 0.7277652025222778, "step": 709 }, { "epoch": 0.48348655090228126, "grad_norm": 1.9137346744537354, "learning_rate": 9.940416746050832e-06, "loss": 0.9342, "mean_token_accuracy": 0.7110306024551392, "step": 710 }, { "epoch": 0.48416751787538304, "grad_norm": 2.0861525535583496, "learning_rate": 9.940069512102483e-06, "loss": 1.0194, "mean_token_accuracy": 0.6923277080059052, "step": 711 }, { "epoch": 0.48484848484848486, "grad_norm": 2.2088606357574463, "learning_rate": 9.939721275402292e-06, "loss": 1.0265, "mean_token_accuracy": 0.6872153282165527, "step": 712 }, { "epoch": 0.48552945182158663, "grad_norm": 1.9329112768173218, "learning_rate": 9.939372036020949e-06, "loss": 0.999, "mean_token_accuracy": 0.6956731677055359, "step": 713 }, { "epoch": 0.48621041879468846, "grad_norm": 1.9481732845306396, "learning_rate": 9.93902179402934e-06, "loss": 0.9851, "mean_token_accuracy": 0.6963219046592712, "step": 714 }, { "epoch": 0.4868913857677903, "grad_norm": 1.808553695678711, "learning_rate": 9.938670549498562e-06, "loss": 1.1293, "mean_token_accuracy": 0.6719396114349365, "step": 715 }, { "epoch": 0.48757235274089206, "grad_norm": 2.1073861122131348, "learning_rate": 9.93831830249991e-06, "loss": 0.9626, "mean_token_accuracy": 0.7032277286052704, "step": 716 }, { "epoch": 0.4882533197139939, "grad_norm": 1.8933018445968628, "learning_rate": 9.937965053104883e-06, "loss": 1.0614, "mean_token_accuracy": 0.6832945644855499, "step": 717 }, { "epoch": 0.48893428668709565, "grad_norm": 1.7293285131454468, "learning_rate": 9.937610801385187e-06, "loss": 0.9983, "mean_token_accuracy": 0.6896238029003143, "step": 718 }, { "epoch": 0.4896152536601975, "grad_norm": 1.9454329013824463, "learning_rate": 9.937255547412727e-06, "loss": 1.0527, "mean_token_accuracy": 0.6708906292915344, "step": 719 }, { "epoch": 0.4902962206332993, "grad_norm": 2.06050443649292, "learning_rate": 9.936899291259616e-06, "loss": 0.9865, "mean_token_accuracy": 0.702019989490509, "step": 720 }, { "epoch": 0.4909771876064011, "grad_norm": 1.9606307744979858, "learning_rate": 9.936542032998168e-06, "loss": 0.9768, "mean_token_accuracy": 0.694132000207901, "step": 721 }, { "epoch": 0.4916581545795029, "grad_norm": 2.1822237968444824, "learning_rate": 9.936183772700898e-06, "loss": 0.9223, "mean_token_accuracy": 0.7197193205356598, "step": 722 }, { "epoch": 0.4923391215526047, "grad_norm": 2.14054799079895, "learning_rate": 9.935824510440527e-06, "loss": 0.8536, "mean_token_accuracy": 0.7351159453392029, "step": 723 }, { "epoch": 0.4930200885257065, "grad_norm": 2.074287176132202, "learning_rate": 9.935464246289983e-06, "loss": 0.9353, "mean_token_accuracy": 0.7193396687507629, "step": 724 }, { "epoch": 0.4937010554988083, "grad_norm": 1.9477976560592651, "learning_rate": 9.93510298032239e-06, "loss": 0.905, "mean_token_accuracy": 0.725794792175293, "step": 725 }, { "epoch": 0.4943820224719101, "grad_norm": 1.931867003440857, "learning_rate": 9.934740712611081e-06, "loss": 1.0519, "mean_token_accuracy": 0.6798489987850189, "step": 726 }, { "epoch": 0.4950629894450119, "grad_norm": 2.1636078357696533, "learning_rate": 9.93437744322959e-06, "loss": 0.8857, "mean_token_accuracy": 0.7310309112071991, "step": 727 }, { "epoch": 0.4957439564181137, "grad_norm": 1.8788206577301025, "learning_rate": 9.934013172251654e-06, "loss": 0.9381, "mean_token_accuracy": 0.7181128561496735, "step": 728 }, { "epoch": 0.4964249233912155, "grad_norm": 1.7820783853530884, "learning_rate": 9.933647899751213e-06, "loss": 0.9476, "mean_token_accuracy": 0.7086891233921051, "step": 729 }, { "epoch": 0.49710589036431735, "grad_norm": 2.0012993812561035, "learning_rate": 9.933281625802412e-06, "loss": 0.9349, "mean_token_accuracy": 0.7168751955032349, "step": 730 }, { "epoch": 0.4977868573374191, "grad_norm": 2.0575790405273438, "learning_rate": 9.932914350479598e-06, "loss": 0.9044, "mean_token_accuracy": 0.72686567902565, "step": 731 }, { "epoch": 0.49846782431052095, "grad_norm": 1.917966604232788, "learning_rate": 9.932546073857325e-06, "loss": 0.9705, "mean_token_accuracy": 0.6963739097118378, "step": 732 }, { "epoch": 0.4991487912836227, "grad_norm": 2.056650400161743, "learning_rate": 9.932176796010344e-06, "loss": 0.9526, "mean_token_accuracy": 0.7080669403076172, "step": 733 }, { "epoch": 0.49982975825672454, "grad_norm": 2.1169345378875732, "learning_rate": 9.931806517013612e-06, "loss": 0.8817, "mean_token_accuracy": 0.7178094685077667, "step": 734 }, { "epoch": 0.5005107252298263, "grad_norm": 1.9673900604248047, "learning_rate": 9.931435236942292e-06, "loss": 0.8904, "mean_token_accuracy": 0.7252759039402008, "step": 735 }, { "epoch": 0.5011916922029281, "grad_norm": 2.1647818088531494, "learning_rate": 9.931062955871744e-06, "loss": 0.9301, "mean_token_accuracy": 0.711367279291153, "step": 736 }, { "epoch": 0.50187265917603, "grad_norm": 2.033891439437866, "learning_rate": 9.930689673877539e-06, "loss": 0.8339, "mean_token_accuracy": 0.7378532290458679, "step": 737 }, { "epoch": 0.5025536261491318, "grad_norm": 2.0522334575653076, "learning_rate": 9.930315391035443e-06, "loss": 0.9032, "mean_token_accuracy": 0.7252960503101349, "step": 738 }, { "epoch": 0.5032345931222336, "grad_norm": 2.0311665534973145, "learning_rate": 9.929940107421432e-06, "loss": 0.8478, "mean_token_accuracy": 0.7436319291591644, "step": 739 }, { "epoch": 0.5039155600953353, "grad_norm": 1.8481192588806152, "learning_rate": 9.92956382311168e-06, "loss": 1.0108, "mean_token_accuracy": 0.6925513744354248, "step": 740 }, { "epoch": 0.5045965270684372, "grad_norm": 1.9213234186172485, "learning_rate": 9.92918653818257e-06, "loss": 0.9582, "mean_token_accuracy": 0.6983604729175568, "step": 741 }, { "epoch": 0.505277494041539, "grad_norm": 1.7660003900527954, "learning_rate": 9.92880825271068e-06, "loss": 1.0043, "mean_token_accuracy": 0.6950367391109467, "step": 742 }, { "epoch": 0.5059584610146408, "grad_norm": 1.9072715044021606, "learning_rate": 9.928428966772799e-06, "loss": 0.9501, "mean_token_accuracy": 0.7009510099887848, "step": 743 }, { "epoch": 0.5066394279877426, "grad_norm": 2.281757116317749, "learning_rate": 9.928048680445917e-06, "loss": 0.9497, "mean_token_accuracy": 0.720739096403122, "step": 744 }, { "epoch": 0.5073203949608444, "grad_norm": 1.842929720878601, "learning_rate": 9.927667393807221e-06, "loss": 0.9497, "mean_token_accuracy": 0.7054935991764069, "step": 745 }, { "epoch": 0.5080013619339462, "grad_norm": 1.811411738395691, "learning_rate": 9.927285106934108e-06, "loss": 0.9975, "mean_token_accuracy": 0.6893165707588196, "step": 746 }, { "epoch": 0.508682328907048, "grad_norm": 1.6135938167572021, "learning_rate": 9.926901819904179e-06, "loss": 1.0025, "mean_token_accuracy": 0.6801508665084839, "step": 747 }, { "epoch": 0.5093632958801498, "grad_norm": 1.8061493635177612, "learning_rate": 9.92651753279523e-06, "loss": 0.9372, "mean_token_accuracy": 0.7060791254043579, "step": 748 }, { "epoch": 0.5100442628532517, "grad_norm": 1.8768036365509033, "learning_rate": 9.926132245685266e-06, "loss": 1.0404, "mean_token_accuracy": 0.6986611485481262, "step": 749 }, { "epoch": 0.5107252298263534, "grad_norm": 2.167813301086426, "learning_rate": 9.925745958652498e-06, "loss": 0.9688, "mean_token_accuracy": 0.7065723836421967, "step": 750 }, { "epoch": 0.5114061967994552, "grad_norm": 1.7664332389831543, "learning_rate": 9.92535867177533e-06, "loss": 1.0481, "mean_token_accuracy": 0.6913266777992249, "step": 751 }, { "epoch": 0.512087163772557, "grad_norm": 1.9805361032485962, "learning_rate": 9.92497038513238e-06, "loss": 0.9364, "mean_token_accuracy": 0.7013558149337769, "step": 752 }, { "epoch": 0.5127681307456589, "grad_norm": 2.082899570465088, "learning_rate": 9.92458109880246e-06, "loss": 0.8761, "mean_token_accuracy": 0.7334899306297302, "step": 753 }, { "epoch": 0.5134490977187607, "grad_norm": 1.9784599542617798, "learning_rate": 9.924190812864588e-06, "loss": 0.8845, "mean_token_accuracy": 0.7322003245353699, "step": 754 }, { "epoch": 0.5141300646918624, "grad_norm": 2.133218765258789, "learning_rate": 9.92379952739799e-06, "loss": 0.872, "mean_token_accuracy": 0.7286185026168823, "step": 755 }, { "epoch": 0.5148110316649642, "grad_norm": 2.0035479068756104, "learning_rate": 9.923407242482088e-06, "loss": 0.9719, "mean_token_accuracy": 0.6978616118431091, "step": 756 }, { "epoch": 0.515491998638066, "grad_norm": 2.1847212314605713, "learning_rate": 9.923013958196508e-06, "loss": 0.9192, "mean_token_accuracy": 0.7162974178791046, "step": 757 }, { "epoch": 0.5161729656111679, "grad_norm": 2.1887242794036865, "learning_rate": 9.92261967462108e-06, "loss": 0.9257, "mean_token_accuracy": 0.7089274227619171, "step": 758 }, { "epoch": 0.5168539325842697, "grad_norm": 1.8136985301971436, "learning_rate": 9.922224391835842e-06, "loss": 0.9309, "mean_token_accuracy": 0.7014078497886658, "step": 759 }, { "epoch": 0.5175348995573714, "grad_norm": 1.9970958232879639, "learning_rate": 9.921828109921024e-06, "loss": 0.9735, "mean_token_accuracy": 0.7021349966526031, "step": 760 }, { "epoch": 0.5182158665304732, "grad_norm": 1.8212262392044067, "learning_rate": 9.921430828957068e-06, "loss": 0.9692, "mean_token_accuracy": 0.7001575529575348, "step": 761 }, { "epoch": 0.5188968335035751, "grad_norm": 2.089075803756714, "learning_rate": 9.921032549024611e-06, "loss": 0.931, "mean_token_accuracy": 0.7089270651340485, "step": 762 }, { "epoch": 0.5195778004766769, "grad_norm": 2.045870065689087, "learning_rate": 9.920633270204504e-06, "loss": 0.8568, "mean_token_accuracy": 0.7345936894416809, "step": 763 }, { "epoch": 0.5202587674497787, "grad_norm": 1.923740029335022, "learning_rate": 9.92023299257779e-06, "loss": 0.9144, "mean_token_accuracy": 0.7150928378105164, "step": 764 }, { "epoch": 0.5209397344228804, "grad_norm": 2.0945661067962646, "learning_rate": 9.919831716225718e-06, "loss": 1.0706, "mean_token_accuracy": 0.6830966472625732, "step": 765 }, { "epoch": 0.5216207013959823, "grad_norm": 1.9198462963104248, "learning_rate": 9.919429441229741e-06, "loss": 0.9896, "mean_token_accuracy": 0.7048326432704926, "step": 766 }, { "epoch": 0.5223016683690841, "grad_norm": 2.045456886291504, "learning_rate": 9.919026167671514e-06, "loss": 0.9873, "mean_token_accuracy": 0.6927386522293091, "step": 767 }, { "epoch": 0.5229826353421859, "grad_norm": 2.149042844772339, "learning_rate": 9.918621895632897e-06, "loss": 0.8868, "mean_token_accuracy": 0.7264110147953033, "step": 768 }, { "epoch": 0.5236636023152877, "grad_norm": 1.892850399017334, "learning_rate": 9.918216625195948e-06, "loss": 1.0319, "mean_token_accuracy": 0.6901819109916687, "step": 769 }, { "epoch": 0.5243445692883895, "grad_norm": 1.8740614652633667, "learning_rate": 9.917810356442932e-06, "loss": 0.9853, "mean_token_accuracy": 0.6793515682220459, "step": 770 }, { "epoch": 0.5250255362614913, "grad_norm": 2.115051031112671, "learning_rate": 9.917403089456313e-06, "loss": 0.8959, "mean_token_accuracy": 0.7053484618663788, "step": 771 }, { "epoch": 0.5257065032345931, "grad_norm": 2.0919225215911865, "learning_rate": 9.916994824318761e-06, "loss": 0.8681, "mean_token_accuracy": 0.7349603772163391, "step": 772 }, { "epoch": 0.5263874702076949, "grad_norm": 1.7744545936584473, "learning_rate": 9.916585561113145e-06, "loss": 0.921, "mean_token_accuracy": 0.7042178213596344, "step": 773 }, { "epoch": 0.5270684371807968, "grad_norm": 2.095860004425049, "learning_rate": 9.916175299922542e-06, "loss": 0.9886, "mean_token_accuracy": 0.6986028850078583, "step": 774 }, { "epoch": 0.5277494041538985, "grad_norm": 1.8552647829055786, "learning_rate": 9.915764040830224e-06, "loss": 1.0431, "mean_token_accuracy": 0.6864786446094513, "step": 775 }, { "epoch": 0.5284303711270003, "grad_norm": 2.2220215797424316, "learning_rate": 9.915351783919673e-06, "loss": 0.9325, "mean_token_accuracy": 0.7196587324142456, "step": 776 }, { "epoch": 0.5291113381001021, "grad_norm": 1.9055273532867432, "learning_rate": 9.91493852927457e-06, "loss": 1.0159, "mean_token_accuracy": 0.6811759769916534, "step": 777 }, { "epoch": 0.529792305073204, "grad_norm": 2.001730442047119, "learning_rate": 9.914524276978797e-06, "loss": 0.8828, "mean_token_accuracy": 0.7229856550693512, "step": 778 }, { "epoch": 0.5304732720463058, "grad_norm": 2.197272539138794, "learning_rate": 9.91410902711644e-06, "loss": 1.0578, "mean_token_accuracy": 0.6792219281196594, "step": 779 }, { "epoch": 0.5311542390194075, "grad_norm": 2.0417118072509766, "learning_rate": 9.91369277977179e-06, "loss": 0.9816, "mean_token_accuracy": 0.7057517170906067, "step": 780 }, { "epoch": 0.5318352059925093, "grad_norm": 1.882527232170105, "learning_rate": 9.913275535029336e-06, "loss": 0.9952, "mean_token_accuracy": 0.6868479251861572, "step": 781 }, { "epoch": 0.5325161729656112, "grad_norm": 1.8522886037826538, "learning_rate": 9.912857292973774e-06, "loss": 0.9627, "mean_token_accuracy": 0.7096176445484161, "step": 782 }, { "epoch": 0.533197139938713, "grad_norm": 1.9088270664215088, "learning_rate": 9.91243805369e-06, "loss": 0.9632, "mean_token_accuracy": 0.7074339389801025, "step": 783 }, { "epoch": 0.5338781069118148, "grad_norm": 1.992061734199524, "learning_rate": 9.912017817263113e-06, "loss": 0.9842, "mean_token_accuracy": 0.7010660767555237, "step": 784 }, { "epoch": 0.5345590738849166, "grad_norm": 1.9243313074111938, "learning_rate": 9.91159658377841e-06, "loss": 0.8512, "mean_token_accuracy": 0.7420373558998108, "step": 785 }, { "epoch": 0.5352400408580184, "grad_norm": 2.137824535369873, "learning_rate": 9.9111743533214e-06, "loss": 0.8481, "mean_token_accuracy": 0.7436374723911285, "step": 786 }, { "epoch": 0.5359210078311202, "grad_norm": 1.7732694149017334, "learning_rate": 9.910751125977783e-06, "loss": 0.9713, "mean_token_accuracy": 0.6986277997493744, "step": 787 }, { "epoch": 0.536601974804222, "grad_norm": 1.9888416528701782, "learning_rate": 9.910326901833472e-06, "loss": 0.9128, "mean_token_accuracy": 0.7159376442432404, "step": 788 }, { "epoch": 0.5372829417773238, "grad_norm": 2.1222245693206787, "learning_rate": 9.909901680974575e-06, "loss": 0.8106, "mean_token_accuracy": 0.7553743720054626, "step": 789 }, { "epoch": 0.5379639087504257, "grad_norm": 1.9746335744857788, "learning_rate": 9.909475463487405e-06, "loss": 0.8578, "mean_token_accuracy": 0.7168098092079163, "step": 790 }, { "epoch": 0.5386448757235274, "grad_norm": 1.9712796211242676, "learning_rate": 9.909048249458476e-06, "loss": 0.9249, "mean_token_accuracy": 0.7169761955738068, "step": 791 }, { "epoch": 0.5393258426966292, "grad_norm": 1.82701575756073, "learning_rate": 9.90862003897451e-06, "loss": 0.9791, "mean_token_accuracy": 0.7067528665065765, "step": 792 }, { "epoch": 0.540006809669731, "grad_norm": 1.7773213386535645, "learning_rate": 9.90819083212242e-06, "loss": 1.0407, "mean_token_accuracy": 0.6809594929218292, "step": 793 }, { "epoch": 0.5406877766428329, "grad_norm": 2.032273769378662, "learning_rate": 9.907760628989334e-06, "loss": 0.9385, "mean_token_accuracy": 0.6906205117702484, "step": 794 }, { "epoch": 0.5413687436159347, "grad_norm": 1.9091987609863281, "learning_rate": 9.90732942966257e-06, "loss": 1.0392, "mean_token_accuracy": 0.6866508424282074, "step": 795 }, { "epoch": 0.5420497105890364, "grad_norm": 1.9338501691818237, "learning_rate": 9.90689723422966e-06, "loss": 1.0262, "mean_token_accuracy": 0.6817237138748169, "step": 796 }, { "epoch": 0.5427306775621382, "grad_norm": 1.9582016468048096, "learning_rate": 9.906464042778329e-06, "loss": 0.8942, "mean_token_accuracy": 0.7205660343170166, "step": 797 }, { "epoch": 0.54341164453524, "grad_norm": 1.850061297416687, "learning_rate": 9.906029855396509e-06, "loss": 0.9503, "mean_token_accuracy": 0.6914559006690979, "step": 798 }, { "epoch": 0.5440926115083419, "grad_norm": 2.011323928833008, "learning_rate": 9.905594672172332e-06, "loss": 0.8693, "mean_token_accuracy": 0.7327053844928741, "step": 799 }, { "epoch": 0.5447735784814437, "grad_norm": 2.0349912643432617, "learning_rate": 9.905158493194132e-06, "loss": 0.9637, "mean_token_accuracy": 0.7099257409572601, "step": 800 }, { "epoch": 0.5454545454545454, "grad_norm": 2.0178065299987793, "learning_rate": 9.90472131855045e-06, "loss": 1.0216, "mean_token_accuracy": 0.6805939674377441, "step": 801 }, { "epoch": 0.5461355124276472, "grad_norm": 2.128081798553467, "learning_rate": 9.904283148330019e-06, "loss": 0.9049, "mean_token_accuracy": 0.7086631655693054, "step": 802 }, { "epoch": 0.5468164794007491, "grad_norm": 1.8685455322265625, "learning_rate": 9.903843982621786e-06, "loss": 0.9424, "mean_token_accuracy": 0.7021132707595825, "step": 803 }, { "epoch": 0.5474974463738509, "grad_norm": 1.8007049560546875, "learning_rate": 9.903403821514893e-06, "loss": 0.9548, "mean_token_accuracy": 0.7081343531608582, "step": 804 }, { "epoch": 0.5481784133469527, "grad_norm": 1.8023500442504883, "learning_rate": 9.902962665098684e-06, "loss": 0.9961, "mean_token_accuracy": 0.7067206799983978, "step": 805 }, { "epoch": 0.5488593803200544, "grad_norm": 1.9121768474578857, "learning_rate": 9.902520513462706e-06, "loss": 0.981, "mean_token_accuracy": 0.6992389261722565, "step": 806 }, { "epoch": 0.5495403472931563, "grad_norm": 2.1279258728027344, "learning_rate": 9.902077366696709e-06, "loss": 0.9124, "mean_token_accuracy": 0.7209068536758423, "step": 807 }, { "epoch": 0.5502213142662581, "grad_norm": 1.7308140993118286, "learning_rate": 9.901633224890646e-06, "loss": 1.0014, "mean_token_accuracy": 0.6894402205944061, "step": 808 }, { "epoch": 0.5509022812393599, "grad_norm": 1.9360507726669312, "learning_rate": 9.901188088134667e-06, "loss": 0.9401, "mean_token_accuracy": 0.7138496041297913, "step": 809 }, { "epoch": 0.5515832482124617, "grad_norm": 2.0580153465270996, "learning_rate": 9.900741956519132e-06, "loss": 0.9449, "mean_token_accuracy": 0.6892483830451965, "step": 810 }, { "epoch": 0.5522642151855635, "grad_norm": 1.9981132745742798, "learning_rate": 9.900294830134595e-06, "loss": 0.9468, "mean_token_accuracy": 0.7047838866710663, "step": 811 }, { "epoch": 0.5529451821586653, "grad_norm": 2.1194427013397217, "learning_rate": 9.899846709071816e-06, "loss": 0.8637, "mean_token_accuracy": 0.7254583239555359, "step": 812 }, { "epoch": 0.5536261491317671, "grad_norm": 1.9187334775924683, "learning_rate": 9.899397593421756e-06, "loss": 0.9071, "mean_token_accuracy": 0.7253083884716034, "step": 813 }, { "epoch": 0.5543071161048689, "grad_norm": 1.726324200630188, "learning_rate": 9.898947483275578e-06, "loss": 1.1446, "mean_token_accuracy": 0.6701413094997406, "step": 814 }, { "epoch": 0.5549880830779708, "grad_norm": 1.9030319452285767, "learning_rate": 9.898496378724647e-06, "loss": 0.9133, "mean_token_accuracy": 0.718495637178421, "step": 815 }, { "epoch": 0.5556690500510725, "grad_norm": 1.9948192834854126, "learning_rate": 9.898044279860529e-06, "loss": 0.9765, "mean_token_accuracy": 0.6955582499504089, "step": 816 }, { "epoch": 0.5563500170241743, "grad_norm": 1.8676776885986328, "learning_rate": 9.897591186774994e-06, "loss": 0.8935, "mean_token_accuracy": 0.7221784293651581, "step": 817 }, { "epoch": 0.5570309839972761, "grad_norm": 1.8231251239776611, "learning_rate": 9.897137099560012e-06, "loss": 0.9762, "mean_token_accuracy": 0.7090394496917725, "step": 818 }, { "epoch": 0.557711950970378, "grad_norm": 1.9323554039001465, "learning_rate": 9.896682018307754e-06, "loss": 0.9277, "mean_token_accuracy": 0.7177361249923706, "step": 819 }, { "epoch": 0.5583929179434798, "grad_norm": 1.7734348773956299, "learning_rate": 9.896225943110594e-06, "loss": 0.9751, "mean_token_accuracy": 0.7046966254711151, "step": 820 }, { "epoch": 0.5590738849165815, "grad_norm": 1.853725790977478, "learning_rate": 9.89576887406111e-06, "loss": 0.9302, "mean_token_accuracy": 0.7148774862289429, "step": 821 }, { "epoch": 0.5597548518896833, "grad_norm": 1.8497865200042725, "learning_rate": 9.895310811252076e-06, "loss": 1.0086, "mean_token_accuracy": 0.6901024580001831, "step": 822 }, { "epoch": 0.5604358188627852, "grad_norm": 2.083036184310913, "learning_rate": 9.894851754776473e-06, "loss": 0.9376, "mean_token_accuracy": 0.7160332202911377, "step": 823 }, { "epoch": 0.561116785835887, "grad_norm": 2.0109269618988037, "learning_rate": 9.894391704727482e-06, "loss": 0.8829, "mean_token_accuracy": 0.7295451760292053, "step": 824 }, { "epoch": 0.5617977528089888, "grad_norm": 2.285829782485962, "learning_rate": 9.893930661198485e-06, "loss": 0.9525, "mean_token_accuracy": 0.7055752575397491, "step": 825 }, { "epoch": 0.5624787197820905, "grad_norm": 1.9479528665542603, "learning_rate": 9.893468624283067e-06, "loss": 1.0044, "mean_token_accuracy": 0.6962834298610687, "step": 826 }, { "epoch": 0.5631596867551923, "grad_norm": 1.8100916147232056, "learning_rate": 9.893005594075013e-06, "loss": 1.0301, "mean_token_accuracy": 0.6985307335853577, "step": 827 }, { "epoch": 0.5638406537282942, "grad_norm": 1.9410083293914795, "learning_rate": 9.89254157066831e-06, "loss": 0.92, "mean_token_accuracy": 0.7107702791690826, "step": 828 }, { "epoch": 0.564521620701396, "grad_norm": 1.9399858713150024, "learning_rate": 9.892076554157147e-06, "loss": 0.978, "mean_token_accuracy": 0.7065522372722626, "step": 829 }, { "epoch": 0.5652025876744978, "grad_norm": 2.1459615230560303, "learning_rate": 9.891610544635916e-06, "loss": 0.9263, "mean_token_accuracy": 0.7055329084396362, "step": 830 }, { "epoch": 0.5658835546475995, "grad_norm": 2.1430253982543945, "learning_rate": 9.891143542199208e-06, "loss": 0.883, "mean_token_accuracy": 0.7316214740276337, "step": 831 }, { "epoch": 0.5665645216207014, "grad_norm": 1.978699803352356, "learning_rate": 9.890675546941817e-06, "loss": 0.9566, "mean_token_accuracy": 0.703665167093277, "step": 832 }, { "epoch": 0.5672454885938032, "grad_norm": 2.0415055751800537, "learning_rate": 9.89020655895874e-06, "loss": 0.9545, "mean_token_accuracy": 0.7041334509849548, "step": 833 }, { "epoch": 0.567926455566905, "grad_norm": 1.898684024810791, "learning_rate": 9.889736578345168e-06, "loss": 0.9496, "mean_token_accuracy": 0.714579850435257, "step": 834 }, { "epoch": 0.5686074225400068, "grad_norm": 2.314148426055908, "learning_rate": 9.889265605196506e-06, "loss": 0.9555, "mean_token_accuracy": 0.7065044939517975, "step": 835 }, { "epoch": 0.5692883895131086, "grad_norm": 1.809054970741272, "learning_rate": 9.888793639608351e-06, "loss": 0.9448, "mean_token_accuracy": 0.700287401676178, "step": 836 }, { "epoch": 0.5699693564862104, "grad_norm": 2.0278213024139404, "learning_rate": 9.888320681676505e-06, "loss": 0.872, "mean_token_accuracy": 0.7230366170406342, "step": 837 }, { "epoch": 0.5706503234593122, "grad_norm": 1.9482988119125366, "learning_rate": 9.887846731496967e-06, "loss": 0.9219, "mean_token_accuracy": 0.7244568765163422, "step": 838 }, { "epoch": 0.571331290432414, "grad_norm": 1.8518434762954712, "learning_rate": 9.887371789165945e-06, "loss": 0.9181, "mean_token_accuracy": 0.7187406122684479, "step": 839 }, { "epoch": 0.5720122574055159, "grad_norm": 1.904462218284607, "learning_rate": 9.886895854779843e-06, "loss": 0.9593, "mean_token_accuracy": 0.7089227139949799, "step": 840 }, { "epoch": 0.5726932243786176, "grad_norm": 1.8968645334243774, "learning_rate": 9.886418928435268e-06, "loss": 1.0298, "mean_token_accuracy": 0.6906135678291321, "step": 841 }, { "epoch": 0.5733741913517194, "grad_norm": 2.0753626823425293, "learning_rate": 9.885941010229028e-06, "loss": 0.9345, "mean_token_accuracy": 0.7176482975482941, "step": 842 }, { "epoch": 0.5740551583248212, "grad_norm": 2.020355463027954, "learning_rate": 9.885462100258131e-06, "loss": 0.9787, "mean_token_accuracy": 0.704463392496109, "step": 843 }, { "epoch": 0.5747361252979231, "grad_norm": 2.055138111114502, "learning_rate": 9.884982198619792e-06, "loss": 0.8969, "mean_token_accuracy": 0.7168489098548889, "step": 844 }, { "epoch": 0.5754170922710249, "grad_norm": 1.936385989189148, "learning_rate": 9.884501305411418e-06, "loss": 0.9564, "mean_token_accuracy": 0.7051523327827454, "step": 845 }, { "epoch": 0.5760980592441267, "grad_norm": 1.9429481029510498, "learning_rate": 9.884019420730626e-06, "loss": 1.0457, "mean_token_accuracy": 0.6839644014835358, "step": 846 }, { "epoch": 0.5767790262172284, "grad_norm": 1.8538686037063599, "learning_rate": 9.883536544675227e-06, "loss": 0.9226, "mean_token_accuracy": 0.7027281522750854, "step": 847 }, { "epoch": 0.5774599931903303, "grad_norm": 1.8618054389953613, "learning_rate": 9.883052677343241e-06, "loss": 0.9655, "mean_token_accuracy": 0.6988201439380646, "step": 848 }, { "epoch": 0.5781409601634321, "grad_norm": 1.8905508518218994, "learning_rate": 9.882567818832882e-06, "loss": 1.0053, "mean_token_accuracy": 0.6883890628814697, "step": 849 }, { "epoch": 0.5788219271365339, "grad_norm": 1.900333285331726, "learning_rate": 9.882081969242569e-06, "loss": 1.0191, "mean_token_accuracy": 0.6902303993701935, "step": 850 }, { "epoch": 0.5795028941096357, "grad_norm": 1.8938381671905518, "learning_rate": 9.88159512867092e-06, "loss": 0.9727, "mean_token_accuracy": 0.6978941857814789, "step": 851 }, { "epoch": 0.5801838610827375, "grad_norm": 1.8677841424942017, "learning_rate": 9.881107297216758e-06, "loss": 1.0675, "mean_token_accuracy": 0.6798535287380219, "step": 852 }, { "epoch": 0.5808648280558393, "grad_norm": 2.053844451904297, "learning_rate": 9.880618474979105e-06, "loss": 0.8987, "mean_token_accuracy": 0.7166960835456848, "step": 853 }, { "epoch": 0.5815457950289411, "grad_norm": 1.914541482925415, "learning_rate": 9.880128662057181e-06, "loss": 0.9132, "mean_token_accuracy": 0.7291778326034546, "step": 854 }, { "epoch": 0.5822267620020429, "grad_norm": 1.9616204500198364, "learning_rate": 9.879637858550412e-06, "loss": 0.9291, "mean_token_accuracy": 0.7156704068183899, "step": 855 }, { "epoch": 0.5829077289751448, "grad_norm": 1.8463778495788574, "learning_rate": 9.879146064558421e-06, "loss": 0.9614, "mean_token_accuracy": 0.7031980156898499, "step": 856 }, { "epoch": 0.5835886959482465, "grad_norm": 1.8258920907974243, "learning_rate": 9.878653280181037e-06, "loss": 0.9472, "mean_token_accuracy": 0.7147740125656128, "step": 857 }, { "epoch": 0.5842696629213483, "grad_norm": 1.8497052192687988, "learning_rate": 9.878159505518283e-06, "loss": 0.8908, "mean_token_accuracy": 0.7202629446983337, "step": 858 }, { "epoch": 0.5849506298944501, "grad_norm": 2.020206928253174, "learning_rate": 9.87766474067039e-06, "loss": 0.9366, "mean_token_accuracy": 0.7077451050281525, "step": 859 }, { "epoch": 0.585631596867552, "grad_norm": 1.7462220191955566, "learning_rate": 9.877168985737786e-06, "loss": 0.947, "mean_token_accuracy": 0.7084119319915771, "step": 860 }, { "epoch": 0.5863125638406538, "grad_norm": 1.8248937129974365, "learning_rate": 9.876672240821103e-06, "loss": 1.0002, "mean_token_accuracy": 0.6947183012962341, "step": 861 }, { "epoch": 0.5869935308137555, "grad_norm": 1.6950011253356934, "learning_rate": 9.876174506021168e-06, "loss": 1.0191, "mean_token_accuracy": 0.6951667368412018, "step": 862 }, { "epoch": 0.5876744977868573, "grad_norm": 1.822561264038086, "learning_rate": 9.875675781439017e-06, "loss": 0.9016, "mean_token_accuracy": 0.7242492139339447, "step": 863 }, { "epoch": 0.5883554647599591, "grad_norm": 1.8542460203170776, "learning_rate": 9.87517606717588e-06, "loss": 0.9544, "mean_token_accuracy": 0.7120011448860168, "step": 864 }, { "epoch": 0.589036431733061, "grad_norm": 1.9122037887573242, "learning_rate": 9.87467536333319e-06, "loss": 0.9307, "mean_token_accuracy": 0.717513233423233, "step": 865 }, { "epoch": 0.5897173987061628, "grad_norm": 1.8895268440246582, "learning_rate": 9.874173670012586e-06, "loss": 0.8918, "mean_token_accuracy": 0.7215910851955414, "step": 866 }, { "epoch": 0.5903983656792645, "grad_norm": 1.8233898878097534, "learning_rate": 9.873670987315899e-06, "loss": 0.928, "mean_token_accuracy": 0.7190805971622467, "step": 867 }, { "epoch": 0.5910793326523663, "grad_norm": 1.8839424848556519, "learning_rate": 9.873167315345168e-06, "loss": 0.8759, "mean_token_accuracy": 0.7182748913764954, "step": 868 }, { "epoch": 0.5917602996254682, "grad_norm": 1.8008066415786743, "learning_rate": 9.872662654202626e-06, "loss": 1.0058, "mean_token_accuracy": 0.7035773396492004, "step": 869 }, { "epoch": 0.59244126659857, "grad_norm": 1.942199468612671, "learning_rate": 9.872157003990715e-06, "loss": 0.9851, "mean_token_accuracy": 0.6997464597225189, "step": 870 }, { "epoch": 0.5931222335716718, "grad_norm": 1.9216194152832031, "learning_rate": 9.871650364812071e-06, "loss": 0.998, "mean_token_accuracy": 0.6939526796340942, "step": 871 }, { "epoch": 0.5938032005447735, "grad_norm": 1.907537817955017, "learning_rate": 9.871142736769536e-06, "loss": 0.9261, "mean_token_accuracy": 0.7220139801502228, "step": 872 }, { "epoch": 0.5944841675178754, "grad_norm": 1.9570351839065552, "learning_rate": 9.870634119966148e-06, "loss": 0.9219, "mean_token_accuracy": 0.7123790979385376, "step": 873 }, { "epoch": 0.5951651344909772, "grad_norm": 2.20703387260437, "learning_rate": 9.870124514505149e-06, "loss": 0.8768, "mean_token_accuracy": 0.7261242270469666, "step": 874 }, { "epoch": 0.595846101464079, "grad_norm": 2.113776922225952, "learning_rate": 9.869613920489977e-06, "loss": 0.9167, "mean_token_accuracy": 0.7156631946563721, "step": 875 }, { "epoch": 0.5965270684371808, "grad_norm": 1.9470868110656738, "learning_rate": 9.869102338024278e-06, "loss": 0.8452, "mean_token_accuracy": 0.737732470035553, "step": 876 }, { "epoch": 0.5972080354102826, "grad_norm": 1.8057085275650024, "learning_rate": 9.868589767211895e-06, "loss": 0.9638, "mean_token_accuracy": 0.6990690529346466, "step": 877 }, { "epoch": 0.5978890023833844, "grad_norm": 1.8039888143539429, "learning_rate": 9.868076208156868e-06, "loss": 0.9806, "mean_token_accuracy": 0.7001926898956299, "step": 878 }, { "epoch": 0.5985699693564862, "grad_norm": 2.135624885559082, "learning_rate": 9.867561660963443e-06, "loss": 0.9226, "mean_token_accuracy": 0.7242733240127563, "step": 879 }, { "epoch": 0.599250936329588, "grad_norm": 1.880703330039978, "learning_rate": 9.867046125736066e-06, "loss": 0.9797, "mean_token_accuracy": 0.6903262734413147, "step": 880 }, { "epoch": 0.5999319033026899, "grad_norm": 1.935417890548706, "learning_rate": 9.866529602579378e-06, "loss": 0.8815, "mean_token_accuracy": 0.7235603332519531, "step": 881 }, { "epoch": 0.6006128702757916, "grad_norm": 1.9271163940429688, "learning_rate": 9.866012091598227e-06, "loss": 1.0043, "mean_token_accuracy": 0.6926162838935852, "step": 882 }, { "epoch": 0.6012938372488934, "grad_norm": 1.7902082204818726, "learning_rate": 9.865493592897659e-06, "loss": 0.9241, "mean_token_accuracy": 0.7171959280967712, "step": 883 }, { "epoch": 0.6019748042219952, "grad_norm": 1.7653539180755615, "learning_rate": 9.864974106582922e-06, "loss": 0.9768, "mean_token_accuracy": 0.6866921186447144, "step": 884 }, { "epoch": 0.6026557711950971, "grad_norm": 1.6140875816345215, "learning_rate": 9.86445363275946e-06, "loss": 0.9917, "mean_token_accuracy": 0.6980612277984619, "step": 885 }, { "epoch": 0.6033367381681989, "grad_norm": 2.007901191711426, "learning_rate": 9.863932171532923e-06, "loss": 0.8848, "mean_token_accuracy": 0.7363362908363342, "step": 886 }, { "epoch": 0.6040177051413006, "grad_norm": 1.8063462972640991, "learning_rate": 9.863409723009157e-06, "loss": 0.9994, "mean_token_accuracy": 0.7059690058231354, "step": 887 }, { "epoch": 0.6046986721144024, "grad_norm": 1.9254032373428345, "learning_rate": 9.862886287294213e-06, "loss": 1.0032, "mean_token_accuracy": 0.6795991659164429, "step": 888 }, { "epoch": 0.6053796390875043, "grad_norm": 1.860926866531372, "learning_rate": 9.862361864494336e-06, "loss": 0.9384, "mean_token_accuracy": 0.7133743464946747, "step": 889 }, { "epoch": 0.6060606060606061, "grad_norm": 1.9405401945114136, "learning_rate": 9.861836454715981e-06, "loss": 0.968, "mean_token_accuracy": 0.7025461792945862, "step": 890 }, { "epoch": 0.6067415730337079, "grad_norm": 2.0291948318481445, "learning_rate": 9.861310058065791e-06, "loss": 0.9337, "mean_token_accuracy": 0.7162294089794159, "step": 891 }, { "epoch": 0.6074225400068096, "grad_norm": 1.7441670894622803, "learning_rate": 9.860782674650619e-06, "loss": 1.0409, "mean_token_accuracy": 0.6998602747917175, "step": 892 }, { "epoch": 0.6081035069799114, "grad_norm": 1.9660848379135132, "learning_rate": 9.860254304577515e-06, "loss": 0.9801, "mean_token_accuracy": 0.692310631275177, "step": 893 }, { "epoch": 0.6087844739530133, "grad_norm": 1.992448329925537, "learning_rate": 9.859724947953727e-06, "loss": 0.9343, "mean_token_accuracy": 0.6986241042613983, "step": 894 }, { "epoch": 0.6094654409261151, "grad_norm": 2.052823305130005, "learning_rate": 9.859194604886708e-06, "loss": 1.0574, "mean_token_accuracy": 0.6714652180671692, "step": 895 }, { "epoch": 0.6101464078992169, "grad_norm": 2.0275306701660156, "learning_rate": 9.85866327548411e-06, "loss": 0.8785, "mean_token_accuracy": 0.7299887537956238, "step": 896 }, { "epoch": 0.6108273748723186, "grad_norm": 1.8506271839141846, "learning_rate": 9.85813095985378e-06, "loss": 0.8769, "mean_token_accuracy": 0.731505960226059, "step": 897 }, { "epoch": 0.6115083418454205, "grad_norm": 1.8339322805404663, "learning_rate": 9.857597658103773e-06, "loss": 0.8944, "mean_token_accuracy": 0.7239902913570404, "step": 898 }, { "epoch": 0.6121893088185223, "grad_norm": 1.7666937112808228, "learning_rate": 9.857063370342338e-06, "loss": 1.0748, "mean_token_accuracy": 0.6811695098876953, "step": 899 }, { "epoch": 0.6128702757916241, "grad_norm": 1.871490716934204, "learning_rate": 9.85652809667793e-06, "loss": 0.9429, "mean_token_accuracy": 0.7037575244903564, "step": 900 }, { "epoch": 0.613551242764726, "grad_norm": 2.085883855819702, "learning_rate": 9.855991837219194e-06, "loss": 0.9672, "mean_token_accuracy": 0.7077447175979614, "step": 901 }, { "epoch": 0.6142322097378277, "grad_norm": 2.0611507892608643, "learning_rate": 9.855454592074988e-06, "loss": 0.9599, "mean_token_accuracy": 0.7037097215652466, "step": 902 }, { "epoch": 0.6149131767109295, "grad_norm": 1.9170644283294678, "learning_rate": 9.85491636135436e-06, "loss": 0.9008, "mean_token_accuracy": 0.7241914570331573, "step": 903 }, { "epoch": 0.6155941436840313, "grad_norm": 1.7753742933273315, "learning_rate": 9.854377145166565e-06, "loss": 0.9108, "mean_token_accuracy": 0.7117636203765869, "step": 904 }, { "epoch": 0.6162751106571331, "grad_norm": 1.8468668460845947, "learning_rate": 9.853836943621053e-06, "loss": 0.9536, "mean_token_accuracy": 0.7172932028770447, "step": 905 }, { "epoch": 0.616956077630235, "grad_norm": 2.0863428115844727, "learning_rate": 9.853295756827476e-06, "loss": 0.9854, "mean_token_accuracy": 0.6989759206771851, "step": 906 }, { "epoch": 0.6176370446033368, "grad_norm": 1.99667227268219, "learning_rate": 9.852753584895687e-06, "loss": 0.9254, "mean_token_accuracy": 0.7184476554393768, "step": 907 }, { "epoch": 0.6183180115764385, "grad_norm": 1.849758505821228, "learning_rate": 9.852210427935735e-06, "loss": 0.9053, "mean_token_accuracy": 0.723817378282547, "step": 908 }, { "epoch": 0.6189989785495403, "grad_norm": 1.793674349784851, "learning_rate": 9.851666286057876e-06, "loss": 0.9218, "mean_token_accuracy": 0.7213384509086609, "step": 909 }, { "epoch": 0.6196799455226422, "grad_norm": 1.810408115386963, "learning_rate": 9.851121159372558e-06, "loss": 1.021, "mean_token_accuracy": 0.7021710276603699, "step": 910 }, { "epoch": 0.620360912495744, "grad_norm": 1.9328376054763794, "learning_rate": 9.850575047990436e-06, "loss": 0.9316, "mean_token_accuracy": 0.7106950581073761, "step": 911 }, { "epoch": 0.6210418794688458, "grad_norm": 1.882814884185791, "learning_rate": 9.850027952022359e-06, "loss": 0.9046, "mean_token_accuracy": 0.7228325605392456, "step": 912 }, { "epoch": 0.6217228464419475, "grad_norm": 1.894395351409912, "learning_rate": 9.849479871579376e-06, "loss": 1.0138, "mean_token_accuracy": 0.6858963966369629, "step": 913 }, { "epoch": 0.6224038134150494, "grad_norm": 1.7752699851989746, "learning_rate": 9.848930806772745e-06, "loss": 0.9861, "mean_token_accuracy": 0.6946883201599121, "step": 914 }, { "epoch": 0.6230847803881512, "grad_norm": 1.8623077869415283, "learning_rate": 9.848380757713913e-06, "loss": 0.9422, "mean_token_accuracy": 0.7174883484840393, "step": 915 }, { "epoch": 0.623765747361253, "grad_norm": 2.044786214828491, "learning_rate": 9.84782972451453e-06, "loss": 0.9324, "mean_token_accuracy": 0.6872653961181641, "step": 916 }, { "epoch": 0.6244467143343548, "grad_norm": 1.866353154182434, "learning_rate": 9.847277707286448e-06, "loss": 0.9252, "mean_token_accuracy": 0.7130465805530548, "step": 917 }, { "epoch": 0.6251276813074566, "grad_norm": 1.8453325033187866, "learning_rate": 9.846724706141718e-06, "loss": 0.8959, "mean_token_accuracy": 0.7266267538070679, "step": 918 }, { "epoch": 0.6258086482805584, "grad_norm": 1.5999999046325684, "learning_rate": 9.846170721192588e-06, "loss": 1.1328, "mean_token_accuracy": 0.6656264364719391, "step": 919 }, { "epoch": 0.6264896152536602, "grad_norm": 1.7168526649475098, "learning_rate": 9.845615752551508e-06, "loss": 0.9458, "mean_token_accuracy": 0.7054413557052612, "step": 920 }, { "epoch": 0.627170582226762, "grad_norm": 2.0268442630767822, "learning_rate": 9.84505980033113e-06, "loss": 0.9058, "mean_token_accuracy": 0.7155932188034058, "step": 921 }, { "epoch": 0.6278515491998639, "grad_norm": 1.8150837421417236, "learning_rate": 9.844502864644298e-06, "loss": 0.8833, "mean_token_accuracy": 0.726096123456955, "step": 922 }, { "epoch": 0.6285325161729656, "grad_norm": 1.8461965322494507, "learning_rate": 9.843944945604066e-06, "loss": 1.0301, "mean_token_accuracy": 0.6860526502132416, "step": 923 }, { "epoch": 0.6292134831460674, "grad_norm": 1.822009801864624, "learning_rate": 9.843386043323682e-06, "loss": 0.9826, "mean_token_accuracy": 0.6948745548725128, "step": 924 }, { "epoch": 0.6298944501191692, "grad_norm": 1.9396921396255493, "learning_rate": 9.84282615791659e-06, "loss": 0.9168, "mean_token_accuracy": 0.7233757972717285, "step": 925 }, { "epoch": 0.630575417092271, "grad_norm": 1.5672427415847778, "learning_rate": 9.842265289496437e-06, "loss": 1.0445, "mean_token_accuracy": 0.6728612780570984, "step": 926 }, { "epoch": 0.6312563840653729, "grad_norm": 1.8047202825546265, "learning_rate": 9.841703438177075e-06, "loss": 0.9738, "mean_token_accuracy": 0.7091594636440277, "step": 927 }, { "epoch": 0.6319373510384746, "grad_norm": 1.747554063796997, "learning_rate": 9.841140604072548e-06, "loss": 1.018, "mean_token_accuracy": 0.6966336071491241, "step": 928 }, { "epoch": 0.6326183180115764, "grad_norm": 1.873108983039856, "learning_rate": 9.8405767872971e-06, "loss": 0.81, "mean_token_accuracy": 0.7380712032318115, "step": 929 }, { "epoch": 0.6332992849846782, "grad_norm": 1.862998604774475, "learning_rate": 9.84001198796518e-06, "loss": 0.9747, "mean_token_accuracy": 0.7056232988834381, "step": 930 }, { "epoch": 0.6339802519577801, "grad_norm": 1.8435055017471313, "learning_rate": 9.83944620619143e-06, "loss": 0.9976, "mean_token_accuracy": 0.6994684934616089, "step": 931 }, { "epoch": 0.6346612189308819, "grad_norm": 1.954116702079773, "learning_rate": 9.838879442090696e-06, "loss": 0.9445, "mean_token_accuracy": 0.7159655392169952, "step": 932 }, { "epoch": 0.6353421859039836, "grad_norm": 1.7557402849197388, "learning_rate": 9.83831169577802e-06, "loss": 0.9883, "mean_token_accuracy": 0.6892327666282654, "step": 933 }, { "epoch": 0.6360231528770854, "grad_norm": 1.8385967016220093, "learning_rate": 9.837742967368647e-06, "loss": 0.9777, "mean_token_accuracy": 0.6997518539428711, "step": 934 }, { "epoch": 0.6367041198501873, "grad_norm": 1.8716561794281006, "learning_rate": 9.837173256978018e-06, "loss": 0.9347, "mean_token_accuracy": 0.7122432887554169, "step": 935 }, { "epoch": 0.6373850868232891, "grad_norm": 1.8193774223327637, "learning_rate": 9.836602564721776e-06, "loss": 1.0499, "mean_token_accuracy": 0.6762175858020782, "step": 936 }, { "epoch": 0.6380660537963909, "grad_norm": 2.063124179840088, "learning_rate": 9.83603089071576e-06, "loss": 0.922, "mean_token_accuracy": 0.7049972712993622, "step": 937 }, { "epoch": 0.6387470207694926, "grad_norm": 2.0541720390319824, "learning_rate": 9.835458235076011e-06, "loss": 0.917, "mean_token_accuracy": 0.7152639925479889, "step": 938 }, { "epoch": 0.6394279877425945, "grad_norm": 1.6803330183029175, "learning_rate": 9.83488459791877e-06, "loss": 1.1048, "mean_token_accuracy": 0.6753070652484894, "step": 939 }, { "epoch": 0.6401089547156963, "grad_norm": 1.6886130571365356, "learning_rate": 9.834309979360474e-06, "loss": 1.094, "mean_token_accuracy": 0.668320506811142, "step": 940 }, { "epoch": 0.6407899216887981, "grad_norm": 1.7877442836761475, "learning_rate": 9.83373437951776e-06, "loss": 0.8659, "mean_token_accuracy": 0.7323583364486694, "step": 941 }, { "epoch": 0.6414708886618999, "grad_norm": 1.7721447944641113, "learning_rate": 9.833157798507472e-06, "loss": 0.881, "mean_token_accuracy": 0.7298937737941742, "step": 942 }, { "epoch": 0.6421518556350017, "grad_norm": 1.998369812965393, "learning_rate": 9.832580236446637e-06, "loss": 0.8928, "mean_token_accuracy": 0.7216371297836304, "step": 943 }, { "epoch": 0.6428328226081035, "grad_norm": 1.8769196271896362, "learning_rate": 9.832001693452497e-06, "loss": 0.9174, "mean_token_accuracy": 0.7152833342552185, "step": 944 }, { "epoch": 0.6435137895812053, "grad_norm": 1.9850821495056152, "learning_rate": 9.831422169642482e-06, "loss": 0.8345, "mean_token_accuracy": 0.742838591337204, "step": 945 }, { "epoch": 0.6441947565543071, "grad_norm": 1.8736904859542847, "learning_rate": 9.830841665134229e-06, "loss": 0.9411, "mean_token_accuracy": 0.7085019648075104, "step": 946 }, { "epoch": 0.644875723527409, "grad_norm": 1.9498058557510376, "learning_rate": 9.830260180045573e-06, "loss": 0.889, "mean_token_accuracy": 0.7321033477783203, "step": 947 }, { "epoch": 0.6455566905005107, "grad_norm": 2.0083162784576416, "learning_rate": 9.829677714494538e-06, "loss": 0.9227, "mean_token_accuracy": 0.7099824547767639, "step": 948 }, { "epoch": 0.6462376574736125, "grad_norm": 1.6253567934036255, "learning_rate": 9.829094268599363e-06, "loss": 1.0128, "mean_token_accuracy": 0.7007487118244171, "step": 949 }, { "epoch": 0.6469186244467143, "grad_norm": 1.8326671123504639, "learning_rate": 9.828509842478474e-06, "loss": 0.9493, "mean_token_accuracy": 0.7108609080314636, "step": 950 }, { "epoch": 0.6475995914198162, "grad_norm": 1.9572131633758545, "learning_rate": 9.827924436250498e-06, "loss": 0.8913, "mean_token_accuracy": 0.7222920358181, "step": 951 }, { "epoch": 0.648280558392918, "grad_norm": 1.7221862077713013, "learning_rate": 9.827338050034266e-06, "loss": 0.9439, "mean_token_accuracy": 0.69377800822258, "step": 952 }, { "epoch": 0.6489615253660197, "grad_norm": 1.6985759735107422, "learning_rate": 9.826750683948802e-06, "loss": 1.0424, "mean_token_accuracy": 0.6914986670017242, "step": 953 }, { "epoch": 0.6496424923391215, "grad_norm": 1.795064926147461, "learning_rate": 9.826162338113332e-06, "loss": 0.9604, "mean_token_accuracy": 0.7052907347679138, "step": 954 }, { "epoch": 0.6503234593122234, "grad_norm": 1.9628881216049194, "learning_rate": 9.825573012647283e-06, "loss": 0.9337, "mean_token_accuracy": 0.7126142084598541, "step": 955 }, { "epoch": 0.6510044262853252, "grad_norm": 1.6987920999526978, "learning_rate": 9.824982707670277e-06, "loss": 0.9427, "mean_token_accuracy": 0.7109542489051819, "step": 956 }, { "epoch": 0.651685393258427, "grad_norm": 1.923168659210205, "learning_rate": 9.824391423302136e-06, "loss": 0.9065, "mean_token_accuracy": 0.6991214156150818, "step": 957 }, { "epoch": 0.6523663602315287, "grad_norm": 1.8599284887313843, "learning_rate": 9.82379915966288e-06, "loss": 0.8401, "mean_token_accuracy": 0.7346092462539673, "step": 958 }, { "epoch": 0.6530473272046305, "grad_norm": 1.8244714736938477, "learning_rate": 9.823205916872726e-06, "loss": 0.9066, "mean_token_accuracy": 0.7289390861988068, "step": 959 }, { "epoch": 0.6537282941777324, "grad_norm": 1.8667688369750977, "learning_rate": 9.822611695052099e-06, "loss": 0.8444, "mean_token_accuracy": 0.7428025007247925, "step": 960 }, { "epoch": 0.6544092611508342, "grad_norm": 1.8670092821121216, "learning_rate": 9.822016494321612e-06, "loss": 1.047, "mean_token_accuracy": 0.6766827404499054, "step": 961 }, { "epoch": 0.655090228123936, "grad_norm": 1.825394868850708, "learning_rate": 9.821420314802082e-06, "loss": 0.9009, "mean_token_accuracy": 0.7164052128791809, "step": 962 }, { "epoch": 0.6557711950970377, "grad_norm": 1.865616798400879, "learning_rate": 9.820823156614522e-06, "loss": 0.9374, "mean_token_accuracy": 0.7074237763881683, "step": 963 }, { "epoch": 0.6564521620701396, "grad_norm": 1.7963694334030151, "learning_rate": 9.820225019880146e-06, "loss": 0.9105, "mean_token_accuracy": 0.7192515730857849, "step": 964 }, { "epoch": 0.6571331290432414, "grad_norm": 1.8546626567840576, "learning_rate": 9.819625904720367e-06, "loss": 1.0209, "mean_token_accuracy": 0.6768304109573364, "step": 965 }, { "epoch": 0.6578140960163432, "grad_norm": 1.9054224491119385, "learning_rate": 9.819025811256793e-06, "loss": 0.7854, "mean_token_accuracy": 0.7570179402828217, "step": 966 }, { "epoch": 0.658495062989445, "grad_norm": 1.867534875869751, "learning_rate": 9.818424739611236e-06, "loss": 0.9212, "mean_token_accuracy": 0.7175463736057281, "step": 967 }, { "epoch": 0.6591760299625468, "grad_norm": 1.6845383644104004, "learning_rate": 9.817822689905701e-06, "loss": 0.9553, "mean_token_accuracy": 0.7083200216293335, "step": 968 }, { "epoch": 0.6598569969356486, "grad_norm": 1.8121330738067627, "learning_rate": 9.817219662262397e-06, "loss": 1.0121, "mean_token_accuracy": 0.6910092234611511, "step": 969 }, { "epoch": 0.6605379639087504, "grad_norm": 1.9962207078933716, "learning_rate": 9.816615656803723e-06, "loss": 0.8151, "mean_token_accuracy": 0.7486975193023682, "step": 970 }, { "epoch": 0.6612189308818522, "grad_norm": 1.860579013824463, "learning_rate": 9.816010673652287e-06, "loss": 0.8613, "mean_token_accuracy": 0.7321327328681946, "step": 971 }, { "epoch": 0.6618998978549541, "grad_norm": 1.9781731367111206, "learning_rate": 9.81540471293089e-06, "loss": 0.9401, "mean_token_accuracy": 0.71474489569664, "step": 972 }, { "epoch": 0.6625808648280559, "grad_norm": 1.73610520362854, "learning_rate": 9.81479777476253e-06, "loss": 1.0553, "mean_token_accuracy": 0.6790037453174591, "step": 973 }, { "epoch": 0.6632618318011576, "grad_norm": 1.6973832845687866, "learning_rate": 9.814189859270407e-06, "loss": 0.9321, "mean_token_accuracy": 0.7083249092102051, "step": 974 }, { "epoch": 0.6639427987742594, "grad_norm": 1.8072295188903809, "learning_rate": 9.813580966577916e-06, "loss": 0.9711, "mean_token_accuracy": 0.7008381187915802, "step": 975 }, { "epoch": 0.6646237657473613, "grad_norm": 1.9321898221969604, "learning_rate": 9.812971096808654e-06, "loss": 0.9444, "mean_token_accuracy": 0.7111658751964569, "step": 976 }, { "epoch": 0.6653047327204631, "grad_norm": 1.8836314678192139, "learning_rate": 9.812360250086415e-06, "loss": 0.9023, "mean_token_accuracy": 0.7218302190303802, "step": 977 }, { "epoch": 0.6659856996935649, "grad_norm": 1.9790079593658447, "learning_rate": 9.811748426535188e-06, "loss": 0.8671, "mean_token_accuracy": 0.7348074615001678, "step": 978 }, { "epoch": 0.6666666666666666, "grad_norm": 1.9018293619155884, "learning_rate": 9.811135626279165e-06, "loss": 0.9756, "mean_token_accuracy": 0.6987021267414093, "step": 979 }, { "epoch": 0.6673476336397685, "grad_norm": 1.8818737268447876, "learning_rate": 9.810521849442732e-06, "loss": 0.8435, "mean_token_accuracy": 0.7330159842967987, "step": 980 }, { "epoch": 0.6680286006128703, "grad_norm": 1.8900965452194214, "learning_rate": 9.809907096150477e-06, "loss": 0.8688, "mean_token_accuracy": 0.7289751172065735, "step": 981 }, { "epoch": 0.6687095675859721, "grad_norm": 1.9061667919158936, "learning_rate": 9.809291366527185e-06, "loss": 0.9468, "mean_token_accuracy": 0.7083511352539062, "step": 982 }, { "epoch": 0.6693905345590739, "grad_norm": 1.9214661121368408, "learning_rate": 9.808674660697839e-06, "loss": 0.9902, "mean_token_accuracy": 0.6949127614498138, "step": 983 }, { "epoch": 0.6700715015321757, "grad_norm": 1.7409363985061646, "learning_rate": 9.80805697878762e-06, "loss": 0.93, "mean_token_accuracy": 0.7023863792419434, "step": 984 }, { "epoch": 0.6707524685052775, "grad_norm": 1.6719300746917725, "learning_rate": 9.807438320921908e-06, "loss": 0.9989, "mean_token_accuracy": 0.6914596259593964, "step": 985 }, { "epoch": 0.6714334354783793, "grad_norm": 1.7128578424453735, "learning_rate": 9.806818687226277e-06, "loss": 1.0072, "mean_token_accuracy": 0.6846819519996643, "step": 986 }, { "epoch": 0.6721144024514811, "grad_norm": 1.683223843574524, "learning_rate": 9.806198077826502e-06, "loss": 0.9065, "mean_token_accuracy": 0.712651401758194, "step": 987 }, { "epoch": 0.672795369424583, "grad_norm": 1.7066842317581177, "learning_rate": 9.805576492848562e-06, "loss": 0.9505, "mean_token_accuracy": 0.7113204598426819, "step": 988 }, { "epoch": 0.6734763363976847, "grad_norm": 1.8650773763656616, "learning_rate": 9.804953932418623e-06, "loss": 0.8507, "mean_token_accuracy": 0.7388908863067627, "step": 989 }, { "epoch": 0.6741573033707865, "grad_norm": 1.9448162317276, "learning_rate": 9.804330396663058e-06, "loss": 0.9345, "mean_token_accuracy": 0.7080562114715576, "step": 990 }, { "epoch": 0.6748382703438883, "grad_norm": 1.6968722343444824, "learning_rate": 9.803705885708432e-06, "loss": 0.9971, "mean_token_accuracy": 0.702313631772995, "step": 991 }, { "epoch": 0.6755192373169902, "grad_norm": 1.910893440246582, "learning_rate": 9.803080399681513e-06, "loss": 0.9035, "mean_token_accuracy": 0.7265417277812958, "step": 992 }, { "epoch": 0.676200204290092, "grad_norm": 1.7461293935775757, "learning_rate": 9.80245393870926e-06, "loss": 0.8931, "mean_token_accuracy": 0.7329574823379517, "step": 993 }, { "epoch": 0.6768811712631937, "grad_norm": 1.8224554061889648, "learning_rate": 9.801826502918836e-06, "loss": 0.9612, "mean_token_accuracy": 0.7055199146270752, "step": 994 }, { "epoch": 0.6775621382362955, "grad_norm": 1.804390788078308, "learning_rate": 9.801198092437603e-06, "loss": 0.9282, "mean_token_accuracy": 0.719453364610672, "step": 995 }, { "epoch": 0.6782431052093973, "grad_norm": 1.8096518516540527, "learning_rate": 9.800568707393115e-06, "loss": 0.9671, "mean_token_accuracy": 0.7065939903259277, "step": 996 }, { "epoch": 0.6789240721824992, "grad_norm": 1.7796059846878052, "learning_rate": 9.799938347913125e-06, "loss": 1.0199, "mean_token_accuracy": 0.6911817491054535, "step": 997 }, { "epoch": 0.679605039155601, "grad_norm": 1.7384451627731323, "learning_rate": 9.79930701412559e-06, "loss": 0.9593, "mean_token_accuracy": 0.7156590819358826, "step": 998 }, { "epoch": 0.6802860061287027, "grad_norm": 1.838953971862793, "learning_rate": 9.798674706158655e-06, "loss": 0.8531, "mean_token_accuracy": 0.7311291396617889, "step": 999 }, { "epoch": 0.6809669731018045, "grad_norm": 1.6522247791290283, "learning_rate": 9.798041424140673e-06, "loss": 1.069, "mean_token_accuracy": 0.6762993633747101, "step": 1000 }, { "epoch": 0.6816479400749064, "grad_norm": 1.7799429893493652, "learning_rate": 9.797407168200187e-06, "loss": 0.9414, "mean_token_accuracy": 0.7006965279579163, "step": 1001 }, { "epoch": 0.6823289070480082, "grad_norm": 1.705612063407898, "learning_rate": 9.79677193846594e-06, "loss": 1.0904, "mean_token_accuracy": 0.6769607961177826, "step": 1002 }, { "epoch": 0.68300987402111, "grad_norm": 2.045665979385376, "learning_rate": 9.796135735066876e-06, "loss": 0.8761, "mean_token_accuracy": 0.7322865426540375, "step": 1003 }, { "epoch": 0.6836908409942117, "grad_norm": 1.9545190334320068, "learning_rate": 9.795498558132131e-06, "loss": 0.785, "mean_token_accuracy": 0.7553550601005554, "step": 1004 }, { "epoch": 0.6843718079673136, "grad_norm": 1.9238789081573486, "learning_rate": 9.79486040779104e-06, "loss": 0.8279, "mean_token_accuracy": 0.7364307045936584, "step": 1005 }, { "epoch": 0.6850527749404154, "grad_norm": 2.0287892818450928, "learning_rate": 9.79422128417314e-06, "loss": 0.908, "mean_token_accuracy": 0.7221776843070984, "step": 1006 }, { "epoch": 0.6857337419135172, "grad_norm": 1.8974817991256714, "learning_rate": 9.79358118740816e-06, "loss": 0.9275, "mean_token_accuracy": 0.7069554626941681, "step": 1007 }, { "epoch": 0.686414708886619, "grad_norm": 1.9170082807540894, "learning_rate": 9.792940117626032e-06, "loss": 0.9093, "mean_token_accuracy": 0.7271568179130554, "step": 1008 }, { "epoch": 0.6870956758597208, "grad_norm": 1.910935401916504, "learning_rate": 9.79229807495688e-06, "loss": 1.008, "mean_token_accuracy": 0.6976250112056732, "step": 1009 }, { "epoch": 0.6877766428328226, "grad_norm": 1.933282732963562, "learning_rate": 9.79165505953103e-06, "loss": 1.0306, "mean_token_accuracy": 0.6912870109081268, "step": 1010 }, { "epoch": 0.6884576098059244, "grad_norm": 1.8784865140914917, "learning_rate": 9.791011071479001e-06, "loss": 0.9858, "mean_token_accuracy": 0.6962109506130219, "step": 1011 }, { "epoch": 0.6891385767790262, "grad_norm": 1.7956476211547852, "learning_rate": 9.790366110931515e-06, "loss": 0.8684, "mean_token_accuracy": 0.7152488231658936, "step": 1012 }, { "epoch": 0.6898195437521281, "grad_norm": 1.9915448427200317, "learning_rate": 9.789720178019483e-06, "loss": 0.8117, "mean_token_accuracy": 0.7556339502334595, "step": 1013 }, { "epoch": 0.6905005107252298, "grad_norm": 1.9660388231277466, "learning_rate": 9.789073272874024e-06, "loss": 0.8987, "mean_token_accuracy": 0.7232690155506134, "step": 1014 }, { "epoch": 0.6911814776983316, "grad_norm": 1.977700114250183, "learning_rate": 9.788425395626446e-06, "loss": 0.9095, "mean_token_accuracy": 0.7177245318889618, "step": 1015 }, { "epoch": 0.6918624446714334, "grad_norm": 1.8185795545578003, "learning_rate": 9.787776546408258e-06, "loss": 0.8667, "mean_token_accuracy": 0.7183667421340942, "step": 1016 }, { "epoch": 0.6925434116445353, "grad_norm": 1.9252287149429321, "learning_rate": 9.787126725351166e-06, "loss": 0.7748, "mean_token_accuracy": 0.757220596075058, "step": 1017 }, { "epoch": 0.6932243786176371, "grad_norm": 1.8535956144332886, "learning_rate": 9.786475932587073e-06, "loss": 0.9668, "mean_token_accuracy": 0.7132187187671661, "step": 1018 }, { "epoch": 0.6939053455907388, "grad_norm": 2.0288329124450684, "learning_rate": 9.78582416824808e-06, "loss": 0.8759, "mean_token_accuracy": 0.732960432767868, "step": 1019 }, { "epoch": 0.6945863125638406, "grad_norm": 1.6972752809524536, "learning_rate": 9.785171432466481e-06, "loss": 0.9379, "mean_token_accuracy": 0.709833562374115, "step": 1020 }, { "epoch": 0.6952672795369425, "grad_norm": 1.5482325553894043, "learning_rate": 9.784517725374771e-06, "loss": 1.0725, "mean_token_accuracy": 0.6925731301307678, "step": 1021 }, { "epoch": 0.6959482465100443, "grad_norm": 1.8664679527282715, "learning_rate": 9.783863047105647e-06, "loss": 0.921, "mean_token_accuracy": 0.7223953902721405, "step": 1022 }, { "epoch": 0.6966292134831461, "grad_norm": 1.669275164604187, "learning_rate": 9.78320739779199e-06, "loss": 0.8853, "mean_token_accuracy": 0.7228345572948456, "step": 1023 }, { "epoch": 0.6973101804562478, "grad_norm": 1.842947244644165, "learning_rate": 9.782550777566893e-06, "loss": 0.9139, "mean_token_accuracy": 0.7100386619567871, "step": 1024 }, { "epoch": 0.6979911474293496, "grad_norm": 2.040978193283081, "learning_rate": 9.781893186563634e-06, "loss": 1.0059, "mean_token_accuracy": 0.6984923183917999, "step": 1025 }, { "epoch": 0.6986721144024515, "grad_norm": 2.058868885040283, "learning_rate": 9.781234624915694e-06, "loss": 0.8887, "mean_token_accuracy": 0.7263525426387787, "step": 1026 }, { "epoch": 0.6993530813755533, "grad_norm": 1.840941309928894, "learning_rate": 9.780575092756753e-06, "loss": 0.9625, "mean_token_accuracy": 0.6976081728935242, "step": 1027 }, { "epoch": 0.7000340483486551, "grad_norm": 1.6349047422409058, "learning_rate": 9.77991459022068e-06, "loss": 0.9937, "mean_token_accuracy": 0.6966170966625214, "step": 1028 }, { "epoch": 0.7007150153217568, "grad_norm": 1.693505048751831, "learning_rate": 9.779253117441551e-06, "loss": 1.0563, "mean_token_accuracy": 0.6859462559223175, "step": 1029 }, { "epoch": 0.7013959822948587, "grad_norm": 1.9111328125, "learning_rate": 9.778590674553631e-06, "loss": 1.0046, "mean_token_accuracy": 0.695732057094574, "step": 1030 }, { "epoch": 0.7020769492679605, "grad_norm": 1.8061859607696533, "learning_rate": 9.777927261691389e-06, "loss": 0.9295, "mean_token_accuracy": 0.7059325575828552, "step": 1031 }, { "epoch": 0.7027579162410623, "grad_norm": 1.7435917854309082, "learning_rate": 9.777262878989479e-06, "loss": 0.8693, "mean_token_accuracy": 0.7319886088371277, "step": 1032 }, { "epoch": 0.7034388832141641, "grad_norm": 1.8100264072418213, "learning_rate": 9.776597526582767e-06, "loss": 0.9461, "mean_token_accuracy": 0.7101120054721832, "step": 1033 }, { "epoch": 0.704119850187266, "grad_norm": 1.8760912418365479, "learning_rate": 9.775931204606304e-06, "loss": 1.0025, "mean_token_accuracy": 0.7035281658172607, "step": 1034 }, { "epoch": 0.7048008171603677, "grad_norm": 2.0213491916656494, "learning_rate": 9.775263913195345e-06, "loss": 0.9869, "mean_token_accuracy": 0.7130241394042969, "step": 1035 }, { "epoch": 0.7054817841334695, "grad_norm": 1.945499300956726, "learning_rate": 9.774595652485337e-06, "loss": 0.9717, "mean_token_accuracy": 0.7187637388706207, "step": 1036 }, { "epoch": 0.7061627511065713, "grad_norm": 1.9241479635238647, "learning_rate": 9.773926422611926e-06, "loss": 0.8732, "mean_token_accuracy": 0.7310539782047272, "step": 1037 }, { "epoch": 0.7068437180796732, "grad_norm": 1.9067779779434204, "learning_rate": 9.773256223710956e-06, "loss": 0.9872, "mean_token_accuracy": 0.696234256029129, "step": 1038 }, { "epoch": 0.707524685052775, "grad_norm": 1.975399374961853, "learning_rate": 9.772585055918467e-06, "loss": 0.9548, "mean_token_accuracy": 0.7173250913619995, "step": 1039 }, { "epoch": 0.7082056520258767, "grad_norm": 1.8482844829559326, "learning_rate": 9.771912919370689e-06, "loss": 0.8898, "mean_token_accuracy": 0.7321145832538605, "step": 1040 }, { "epoch": 0.7088866189989785, "grad_norm": 1.8850311040878296, "learning_rate": 9.771239814204063e-06, "loss": 0.9587, "mean_token_accuracy": 0.705027163028717, "step": 1041 }, { "epoch": 0.7095675859720804, "grad_norm": 1.9392118453979492, "learning_rate": 9.770565740555212e-06, "loss": 0.8756, "mean_token_accuracy": 0.7255065143108368, "step": 1042 }, { "epoch": 0.7102485529451822, "grad_norm": 1.83980393409729, "learning_rate": 9.769890698560964e-06, "loss": 0.9642, "mean_token_accuracy": 0.6839810013771057, "step": 1043 }, { "epoch": 0.710929519918284, "grad_norm": 1.7243824005126953, "learning_rate": 9.769214688358341e-06, "loss": 0.863, "mean_token_accuracy": 0.7323320508003235, "step": 1044 }, { "epoch": 0.7116104868913857, "grad_norm": 1.7200353145599365, "learning_rate": 9.768537710084563e-06, "loss": 1.0137, "mean_token_accuracy": 0.7056117355823517, "step": 1045 }, { "epoch": 0.7122914538644876, "grad_norm": 1.8312945365905762, "learning_rate": 9.767859763877044e-06, "loss": 0.842, "mean_token_accuracy": 0.746867448091507, "step": 1046 }, { "epoch": 0.7129724208375894, "grad_norm": 1.7875964641571045, "learning_rate": 9.767180849873395e-06, "loss": 0.9837, "mean_token_accuracy": 0.693893700838089, "step": 1047 }, { "epoch": 0.7136533878106912, "grad_norm": 1.9748119115829468, "learning_rate": 9.766500968211423e-06, "loss": 0.9002, "mean_token_accuracy": 0.725608766078949, "step": 1048 }, { "epoch": 0.714334354783793, "grad_norm": 1.7999154329299927, "learning_rate": 9.765820119029137e-06, "loss": 1.0234, "mean_token_accuracy": 0.6793855726718903, "step": 1049 }, { "epoch": 0.7150153217568948, "grad_norm": 1.791735291481018, "learning_rate": 9.765138302464737e-06, "loss": 0.9414, "mean_token_accuracy": 0.7045800089836121, "step": 1050 }, { "epoch": 0.7156962887299966, "grad_norm": 1.7774659395217896, "learning_rate": 9.764455518656617e-06, "loss": 0.9252, "mean_token_accuracy": 0.7185877561569214, "step": 1051 }, { "epoch": 0.7163772557030984, "grad_norm": 1.816605567932129, "learning_rate": 9.763771767743372e-06, "loss": 0.8461, "mean_token_accuracy": 0.7428446114063263, "step": 1052 }, { "epoch": 0.7170582226762002, "grad_norm": 1.7624109983444214, "learning_rate": 9.763087049863797e-06, "loss": 0.9584, "mean_token_accuracy": 0.7126893997192383, "step": 1053 }, { "epoch": 0.7177391896493021, "grad_norm": 1.811812400817871, "learning_rate": 9.762401365156872e-06, "loss": 0.9047, "mean_token_accuracy": 0.719323456287384, "step": 1054 }, { "epoch": 0.7184201566224038, "grad_norm": 1.839532732963562, "learning_rate": 9.761714713761781e-06, "loss": 0.9299, "mean_token_accuracy": 0.7101182043552399, "step": 1055 }, { "epoch": 0.7191011235955056, "grad_norm": 1.8053125143051147, "learning_rate": 9.761027095817906e-06, "loss": 0.8915, "mean_token_accuracy": 0.7202680706977844, "step": 1056 }, { "epoch": 0.7197820905686074, "grad_norm": 1.7188596725463867, "learning_rate": 9.760338511464817e-06, "loss": 1.0398, "mean_token_accuracy": 0.6867567300796509, "step": 1057 }, { "epoch": 0.7204630575417093, "grad_norm": 1.7909321784973145, "learning_rate": 9.75964896084229e-06, "loss": 0.9076, "mean_token_accuracy": 0.7166793942451477, "step": 1058 }, { "epoch": 0.7211440245148111, "grad_norm": 1.904221773147583, "learning_rate": 9.75895844409029e-06, "loss": 0.879, "mean_token_accuracy": 0.7332671880722046, "step": 1059 }, { "epoch": 0.7218249914879128, "grad_norm": 1.9079139232635498, "learning_rate": 9.75826696134898e-06, "loss": 0.9109, "mean_token_accuracy": 0.7194808125495911, "step": 1060 }, { "epoch": 0.7225059584610146, "grad_norm": 1.9456851482391357, "learning_rate": 9.75757451275872e-06, "loss": 1.0165, "mean_token_accuracy": 0.6964311599731445, "step": 1061 }, { "epoch": 0.7231869254341164, "grad_norm": 1.8919411897659302, "learning_rate": 9.756881098460066e-06, "loss": 0.9527, "mean_token_accuracy": 0.711885392665863, "step": 1062 }, { "epoch": 0.7238678924072183, "grad_norm": 1.8632500171661377, "learning_rate": 9.756186718593769e-06, "loss": 0.9074, "mean_token_accuracy": 0.715603232383728, "step": 1063 }, { "epoch": 0.7245488593803201, "grad_norm": 1.7830756902694702, "learning_rate": 9.755491373300776e-06, "loss": 0.8975, "mean_token_accuracy": 0.721459686756134, "step": 1064 }, { "epoch": 0.7252298263534218, "grad_norm": 1.8241924047470093, "learning_rate": 9.754795062722232e-06, "loss": 0.8513, "mean_token_accuracy": 0.7322362065315247, "step": 1065 }, { "epoch": 0.7259107933265236, "grad_norm": 1.9120506048202515, "learning_rate": 9.754097786999474e-06, "loss": 0.8834, "mean_token_accuracy": 0.7314199805259705, "step": 1066 }, { "epoch": 0.7265917602996255, "grad_norm": 1.7920773029327393, "learning_rate": 9.753399546274041e-06, "loss": 0.9465, "mean_token_accuracy": 0.7089685797691345, "step": 1067 }, { "epoch": 0.7272727272727273, "grad_norm": 1.8224256038665771, "learning_rate": 9.752700340687662e-06, "loss": 0.9479, "mean_token_accuracy": 0.7101514637470245, "step": 1068 }, { "epoch": 0.7279536942458291, "grad_norm": 1.6656476259231567, "learning_rate": 9.752000170382264e-06, "loss": 0.9104, "mean_token_accuracy": 0.7155720293521881, "step": 1069 }, { "epoch": 0.7286346612189308, "grad_norm": 1.8008344173431396, "learning_rate": 9.75129903549997e-06, "loss": 0.937, "mean_token_accuracy": 0.7151639759540558, "step": 1070 }, { "epoch": 0.7293156281920327, "grad_norm": 1.3762189149856567, "learning_rate": 9.750596936183099e-06, "loss": 1.0187, "mean_token_accuracy": 0.7005558609962463, "step": 1071 }, { "epoch": 0.7299965951651345, "grad_norm": 1.9465909004211426, "learning_rate": 9.749893872574166e-06, "loss": 0.9554, "mean_token_accuracy": 0.7068721652030945, "step": 1072 }, { "epoch": 0.7306775621382363, "grad_norm": 1.8547120094299316, "learning_rate": 9.749189844815879e-06, "loss": 0.9163, "mean_token_accuracy": 0.7219839692115784, "step": 1073 }, { "epoch": 0.7313585291113381, "grad_norm": 1.7552103996276855, "learning_rate": 9.748484853051148e-06, "loss": 0.9186, "mean_token_accuracy": 0.7116154432296753, "step": 1074 }, { "epoch": 0.7320394960844399, "grad_norm": 1.8674991130828857, "learning_rate": 9.747778897423071e-06, "loss": 0.8726, "mean_token_accuracy": 0.7253651916980743, "step": 1075 }, { "epoch": 0.7327204630575417, "grad_norm": 1.824082374572754, "learning_rate": 9.747071978074944e-06, "loss": 0.8607, "mean_token_accuracy": 0.7258321046829224, "step": 1076 }, { "epoch": 0.7334014300306435, "grad_norm": 1.7812671661376953, "learning_rate": 9.746364095150264e-06, "loss": 0.9979, "mean_token_accuracy": 0.6979387700557709, "step": 1077 }, { "epoch": 0.7340823970037453, "grad_norm": 1.721884846687317, "learning_rate": 9.74565524879272e-06, "loss": 1.0251, "mean_token_accuracy": 0.7034668624401093, "step": 1078 }, { "epoch": 0.7347633639768472, "grad_norm": 1.7079840898513794, "learning_rate": 9.744945439146192e-06, "loss": 0.9509, "mean_token_accuracy": 0.688716322183609, "step": 1079 }, { "epoch": 0.7354443309499489, "grad_norm": 1.8590978384017944, "learning_rate": 9.744234666354759e-06, "loss": 1.0146, "mean_token_accuracy": 0.6957120597362518, "step": 1080 }, { "epoch": 0.7361252979230507, "grad_norm": 1.8816410303115845, "learning_rate": 9.743522930562701e-06, "loss": 0.9082, "mean_token_accuracy": 0.7181402146816254, "step": 1081 }, { "epoch": 0.7368062648961525, "grad_norm": 1.7908726930618286, "learning_rate": 9.742810231914485e-06, "loss": 1.0172, "mean_token_accuracy": 0.685890793800354, "step": 1082 }, { "epoch": 0.7374872318692544, "grad_norm": 1.664168119430542, "learning_rate": 9.742096570554778e-06, "loss": 0.9471, "mean_token_accuracy": 0.7044170498847961, "step": 1083 }, { "epoch": 0.7381681988423562, "grad_norm": 1.8263603448867798, "learning_rate": 9.741381946628439e-06, "loss": 0.9488, "mean_token_accuracy": 0.7103535532951355, "step": 1084 }, { "epoch": 0.7388491658154579, "grad_norm": 1.753709077835083, "learning_rate": 9.740666360280528e-06, "loss": 0.9909, "mean_token_accuracy": 0.6854015290737152, "step": 1085 }, { "epoch": 0.7395301327885597, "grad_norm": 1.734004259109497, "learning_rate": 9.739949811656298e-06, "loss": 1.0628, "mean_token_accuracy": 0.6783933341503143, "step": 1086 }, { "epoch": 0.7402110997616616, "grad_norm": 1.9611907005310059, "learning_rate": 9.739232300901191e-06, "loss": 0.8765, "mean_token_accuracy": 0.7132983803749084, "step": 1087 }, { "epoch": 0.7408920667347634, "grad_norm": 1.8074220418930054, "learning_rate": 9.738513828160856e-06, "loss": 0.9304, "mean_token_accuracy": 0.7207367122173309, "step": 1088 }, { "epoch": 0.7415730337078652, "grad_norm": 1.7227391004562378, "learning_rate": 9.737794393581125e-06, "loss": 1.0147, "mean_token_accuracy": 0.695129930973053, "step": 1089 }, { "epoch": 0.7422540006809669, "grad_norm": 1.8247524499893188, "learning_rate": 9.737073997308037e-06, "loss": 0.9604, "mean_token_accuracy": 0.7056140601634979, "step": 1090 }, { "epoch": 0.7429349676540687, "grad_norm": 1.778607964515686, "learning_rate": 9.736352639487818e-06, "loss": 0.8757, "mean_token_accuracy": 0.729184478521347, "step": 1091 }, { "epoch": 0.7436159346271706, "grad_norm": 1.8495299816131592, "learning_rate": 9.735630320266892e-06, "loss": 0.828, "mean_token_accuracy": 0.7413516640663147, "step": 1092 }, { "epoch": 0.7442969016002724, "grad_norm": 1.9959968328475952, "learning_rate": 9.734907039791875e-06, "loss": 0.8853, "mean_token_accuracy": 0.7247504591941833, "step": 1093 }, { "epoch": 0.7449778685733742, "grad_norm": 1.8539109230041504, "learning_rate": 9.734182798209583e-06, "loss": 0.8702, "mean_token_accuracy": 0.7273717820644379, "step": 1094 }, { "epoch": 0.745658835546476, "grad_norm": 1.9809513092041016, "learning_rate": 9.733457595667026e-06, "loss": 0.9551, "mean_token_accuracy": 0.7049571573734283, "step": 1095 }, { "epoch": 0.7463398025195778, "grad_norm": 1.8360562324523926, "learning_rate": 9.732731432311407e-06, "loss": 0.9081, "mean_token_accuracy": 0.7230578660964966, "step": 1096 }, { "epoch": 0.7470207694926796, "grad_norm": 1.8172856569290161, "learning_rate": 9.732004308290123e-06, "loss": 0.8709, "mean_token_accuracy": 0.7257253527641296, "step": 1097 }, { "epoch": 0.7477017364657814, "grad_norm": 1.9827275276184082, "learning_rate": 9.731276223750774e-06, "loss": 0.9323, "mean_token_accuracy": 0.706503838300705, "step": 1098 }, { "epoch": 0.7483827034388832, "grad_norm": 1.991946816444397, "learning_rate": 9.730547178841144e-06, "loss": 0.9556, "mean_token_accuracy": 0.7162654399871826, "step": 1099 }, { "epoch": 0.7490636704119851, "grad_norm": 1.9221510887145996, "learning_rate": 9.729817173709218e-06, "loss": 0.9132, "mean_token_accuracy": 0.7112231254577637, "step": 1100 }, { "epoch": 0.7497446373850868, "grad_norm": 1.8672914505004883, "learning_rate": 9.729086208503174e-06, "loss": 0.8965, "mean_token_accuracy": 0.7219488322734833, "step": 1101 }, { "epoch": 0.7504256043581886, "grad_norm": 1.8145822286605835, "learning_rate": 9.728354283371389e-06, "loss": 0.8822, "mean_token_accuracy": 0.7358022034168243, "step": 1102 }, { "epoch": 0.7511065713312904, "grad_norm": 1.7429068088531494, "learning_rate": 9.727621398462427e-06, "loss": 0.9145, "mean_token_accuracy": 0.7101839780807495, "step": 1103 }, { "epoch": 0.7517875383043923, "grad_norm": 1.7560508251190186, "learning_rate": 9.726887553925056e-06, "loss": 1.0039, "mean_token_accuracy": 0.6864404678344727, "step": 1104 }, { "epoch": 0.7524685052774941, "grad_norm": 1.8664149045944214, "learning_rate": 9.726152749908232e-06, "loss": 0.9435, "mean_token_accuracy": 0.70423823595047, "step": 1105 }, { "epoch": 0.7531494722505958, "grad_norm": 1.8927892446517944, "learning_rate": 9.725416986561107e-06, "loss": 0.9633, "mean_token_accuracy": 0.7085708677768707, "step": 1106 }, { "epoch": 0.7538304392236976, "grad_norm": 1.7299084663391113, "learning_rate": 9.72468026403303e-06, "loss": 1.0339, "mean_token_accuracy": 0.6730154454708099, "step": 1107 }, { "epoch": 0.7545114061967995, "grad_norm": 1.8794612884521484, "learning_rate": 9.723942582473545e-06, "loss": 0.9821, "mean_token_accuracy": 0.7069022357463837, "step": 1108 }, { "epoch": 0.7551923731699013, "grad_norm": 1.8601199388504028, "learning_rate": 9.723203942032385e-06, "loss": 0.9985, "mean_token_accuracy": 0.7064490616321564, "step": 1109 }, { "epoch": 0.7558733401430031, "grad_norm": 1.7448735237121582, "learning_rate": 9.722464342859486e-06, "loss": 0.9323, "mean_token_accuracy": 0.7096219658851624, "step": 1110 }, { "epoch": 0.7565543071161048, "grad_norm": 1.7506498098373413, "learning_rate": 9.721723785104971e-06, "loss": 0.8972, "mean_token_accuracy": 0.7270742356777191, "step": 1111 }, { "epoch": 0.7572352740892067, "grad_norm": 1.7947734594345093, "learning_rate": 9.72098226891916e-06, "loss": 0.9813, "mean_token_accuracy": 0.7063712179660797, "step": 1112 }, { "epoch": 0.7579162410623085, "grad_norm": 1.9209812879562378, "learning_rate": 9.720239794452575e-06, "loss": 0.9856, "mean_token_accuracy": 0.6992534697055817, "step": 1113 }, { "epoch": 0.7585972080354103, "grad_norm": 1.766957402229309, "learning_rate": 9.71949636185592e-06, "loss": 0.9863, "mean_token_accuracy": 0.7100191414356232, "step": 1114 }, { "epoch": 0.7592781750085121, "grad_norm": 2.023522138595581, "learning_rate": 9.7187519712801e-06, "loss": 0.9422, "mean_token_accuracy": 0.7061146199703217, "step": 1115 }, { "epoch": 0.7599591419816139, "grad_norm": 1.8304141759872437, "learning_rate": 9.718006622876217e-06, "loss": 0.755, "mean_token_accuracy": 0.7682033181190491, "step": 1116 }, { "epoch": 0.7606401089547157, "grad_norm": 1.7606562376022339, "learning_rate": 9.717260316795562e-06, "loss": 0.9275, "mean_token_accuracy": 0.7103767991065979, "step": 1117 }, { "epoch": 0.7613210759278175, "grad_norm": 1.7048914432525635, "learning_rate": 9.716513053189622e-06, "loss": 0.9063, "mean_token_accuracy": 0.7225262820720673, "step": 1118 }, { "epoch": 0.7620020429009193, "grad_norm": 1.8473676443099976, "learning_rate": 9.71576483221008e-06, "loss": 0.8218, "mean_token_accuracy": 0.7073612809181213, "step": 1119 }, { "epoch": 0.7626830098740212, "grad_norm": 1.845192551612854, "learning_rate": 9.715015654008813e-06, "loss": 0.9345, "mean_token_accuracy": 0.6935648024082184, "step": 1120 }, { "epoch": 0.7633639768471229, "grad_norm": 1.7663676738739014, "learning_rate": 9.71426551873789e-06, "loss": 0.9425, "mean_token_accuracy": 0.7055336236953735, "step": 1121 }, { "epoch": 0.7640449438202247, "grad_norm": 1.8473007678985596, "learning_rate": 9.713514426549578e-06, "loss": 0.8456, "mean_token_accuracy": 0.7376753687858582, "step": 1122 }, { "epoch": 0.7647259107933265, "grad_norm": 1.5776740312576294, "learning_rate": 9.712762377596335e-06, "loss": 1.0037, "mean_token_accuracy": 0.6803857088088989, "step": 1123 }, { "epoch": 0.7654068777664284, "grad_norm": 1.9117799997329712, "learning_rate": 9.712009372030815e-06, "loss": 0.9854, "mean_token_accuracy": 0.6966078579425812, "step": 1124 }, { "epoch": 0.7660878447395302, "grad_norm": 1.7439684867858887, "learning_rate": 9.711255410005864e-06, "loss": 0.9316, "mean_token_accuracy": 0.7017613053321838, "step": 1125 }, { "epoch": 0.7667688117126319, "grad_norm": 1.744305968284607, "learning_rate": 9.710500491674528e-06, "loss": 0.9083, "mean_token_accuracy": 0.7108344733715057, "step": 1126 }, { "epoch": 0.7674497786857337, "grad_norm": 1.8525079488754272, "learning_rate": 9.709744617190039e-06, "loss": 0.9086, "mean_token_accuracy": 0.7213256359100342, "step": 1127 }, { "epoch": 0.7681307456588355, "grad_norm": 1.8070871829986572, "learning_rate": 9.708987786705825e-06, "loss": 0.9939, "mean_token_accuracy": 0.6966802775859833, "step": 1128 }, { "epoch": 0.7688117126319374, "grad_norm": 1.7880346775054932, "learning_rate": 9.708230000375515e-06, "loss": 0.954, "mean_token_accuracy": 0.7000248432159424, "step": 1129 }, { "epoch": 0.7694926796050392, "grad_norm": 1.8552751541137695, "learning_rate": 9.707471258352925e-06, "loss": 0.9282, "mean_token_accuracy": 0.7048448324203491, "step": 1130 }, { "epoch": 0.7701736465781409, "grad_norm": 1.7227095365524292, "learning_rate": 9.706711560792066e-06, "loss": 0.8773, "mean_token_accuracy": 0.7252954244613647, "step": 1131 }, { "epoch": 0.7708546135512427, "grad_norm": 1.7387943267822266, "learning_rate": 9.705950907847145e-06, "loss": 0.9557, "mean_token_accuracy": 0.7002326250076294, "step": 1132 }, { "epoch": 0.7715355805243446, "grad_norm": 1.7145534753799438, "learning_rate": 9.705189299672562e-06, "loss": 0.9419, "mean_token_accuracy": 0.7082749903202057, "step": 1133 }, { "epoch": 0.7722165474974464, "grad_norm": 1.9098294973373413, "learning_rate": 9.70442673642291e-06, "loss": 0.848, "mean_token_accuracy": 0.7439369261264801, "step": 1134 }, { "epoch": 0.7728975144705482, "grad_norm": 1.8373069763183594, "learning_rate": 9.703663218252976e-06, "loss": 0.8638, "mean_token_accuracy": 0.7376098930835724, "step": 1135 }, { "epoch": 0.7735784814436499, "grad_norm": 1.8606536388397217, "learning_rate": 9.702898745317744e-06, "loss": 0.9054, "mean_token_accuracy": 0.7220079600811005, "step": 1136 }, { "epoch": 0.7742594484167518, "grad_norm": 1.7244553565979004, "learning_rate": 9.70213331777239e-06, "loss": 0.9372, "mean_token_accuracy": 0.699462890625, "step": 1137 }, { "epoch": 0.7749404153898536, "grad_norm": 1.8991163969039917, "learning_rate": 9.701366935772276e-06, "loss": 0.8962, "mean_token_accuracy": 0.7206734418869019, "step": 1138 }, { "epoch": 0.7756213823629554, "grad_norm": 1.7516697645187378, "learning_rate": 9.700599599472973e-06, "loss": 1.0058, "mean_token_accuracy": 0.6877177357673645, "step": 1139 }, { "epoch": 0.7763023493360572, "grad_norm": 1.974962592124939, "learning_rate": 9.699831309030231e-06, "loss": 0.9243, "mean_token_accuracy": 0.7080113589763641, "step": 1140 }, { "epoch": 0.776983316309159, "grad_norm": 1.7668766975402832, "learning_rate": 9.699062064600007e-06, "loss": 1.0233, "mean_token_accuracy": 0.6935610771179199, "step": 1141 }, { "epoch": 0.7776642832822608, "grad_norm": 1.6184380054473877, "learning_rate": 9.69829186633844e-06, "loss": 0.9287, "mean_token_accuracy": 0.7141878306865692, "step": 1142 }, { "epoch": 0.7783452502553626, "grad_norm": 1.9951810836791992, "learning_rate": 9.697520714401868e-06, "loss": 0.8478, "mean_token_accuracy": 0.7432135939598083, "step": 1143 }, { "epoch": 0.7790262172284644, "grad_norm": 1.8963110446929932, "learning_rate": 9.696748608946822e-06, "loss": 0.9464, "mean_token_accuracy": 0.6993700861930847, "step": 1144 }, { "epoch": 0.7797071842015663, "grad_norm": 1.7441699504852295, "learning_rate": 9.695975550130028e-06, "loss": 1.0117, "mean_token_accuracy": 0.6929945945739746, "step": 1145 }, { "epoch": 0.780388151174668, "grad_norm": 1.8030208349227905, "learning_rate": 9.695201538108403e-06, "loss": 1.0124, "mean_token_accuracy": 0.6856607496738434, "step": 1146 }, { "epoch": 0.7810691181477698, "grad_norm": 1.6666393280029297, "learning_rate": 9.694426573039057e-06, "loss": 0.9265, "mean_token_accuracy": 0.7085580229759216, "step": 1147 }, { "epoch": 0.7817500851208716, "grad_norm": 1.7628978490829468, "learning_rate": 9.693650655079298e-06, "loss": 0.9961, "mean_token_accuracy": 0.6867520213127136, "step": 1148 }, { "epoch": 0.7824310520939735, "grad_norm": 1.838558316230774, "learning_rate": 9.692873784386625e-06, "loss": 0.903, "mean_token_accuracy": 0.7143341302871704, "step": 1149 }, { "epoch": 0.7831120190670753, "grad_norm": 1.7126233577728271, "learning_rate": 9.692095961118725e-06, "loss": 1.0637, "mean_token_accuracy": 0.6747037768363953, "step": 1150 }, { "epoch": 0.783792986040177, "grad_norm": 1.8032370805740356, "learning_rate": 9.691317185433488e-06, "loss": 1.0236, "mean_token_accuracy": 0.7021102011203766, "step": 1151 }, { "epoch": 0.7844739530132788, "grad_norm": 1.9659737348556519, "learning_rate": 9.69053745748899e-06, "loss": 0.9053, "mean_token_accuracy": 0.7209859490394592, "step": 1152 }, { "epoch": 0.7851549199863807, "grad_norm": 2.060248851776123, "learning_rate": 9.689756777443502e-06, "loss": 0.907, "mean_token_accuracy": 0.7169306874275208, "step": 1153 }, { "epoch": 0.7858358869594825, "grad_norm": 1.694506287574768, "learning_rate": 9.688975145455493e-06, "loss": 1.0095, "mean_token_accuracy": 0.7004702687263489, "step": 1154 }, { "epoch": 0.7865168539325843, "grad_norm": 2.152897596359253, "learning_rate": 9.688192561683615e-06, "loss": 0.8507, "mean_token_accuracy": 0.7350761890411377, "step": 1155 }, { "epoch": 0.787197820905686, "grad_norm": 1.793628215789795, "learning_rate": 9.687409026286727e-06, "loss": 0.9304, "mean_token_accuracy": 0.7072612047195435, "step": 1156 }, { "epoch": 0.7878787878787878, "grad_norm": 1.84998619556427, "learning_rate": 9.686624539423868e-06, "loss": 0.9264, "mean_token_accuracy": 0.724335253238678, "step": 1157 }, { "epoch": 0.7885597548518897, "grad_norm": 1.7279521226882935, "learning_rate": 9.685839101254278e-06, "loss": 1.0593, "mean_token_accuracy": 0.6709716320037842, "step": 1158 }, { "epoch": 0.7892407218249915, "grad_norm": 1.7688796520233154, "learning_rate": 9.685052711937388e-06, "loss": 0.9694, "mean_token_accuracy": 0.6999001801013947, "step": 1159 }, { "epoch": 0.7899216887980933, "grad_norm": 1.9555976390838623, "learning_rate": 9.68426537163282e-06, "loss": 0.8838, "mean_token_accuracy": 0.7359626293182373, "step": 1160 }, { "epoch": 0.7906026557711952, "grad_norm": 1.9673482179641724, "learning_rate": 9.683477080500392e-06, "loss": 0.9801, "mean_token_accuracy": 0.7013642191886902, "step": 1161 }, { "epoch": 0.7912836227442969, "grad_norm": 1.7931822538375854, "learning_rate": 9.682687838700116e-06, "loss": 0.9338, "mean_token_accuracy": 0.7210496068000793, "step": 1162 }, { "epoch": 0.7919645897173987, "grad_norm": 1.9996017217636108, "learning_rate": 9.681897646392192e-06, "loss": 0.8993, "mean_token_accuracy": 0.7308981120586395, "step": 1163 }, { "epoch": 0.7926455566905005, "grad_norm": 1.7081462144851685, "learning_rate": 9.681106503737019e-06, "loss": 0.9625, "mean_token_accuracy": 0.70111483335495, "step": 1164 }, { "epoch": 0.7933265236636023, "grad_norm": 1.8016529083251953, "learning_rate": 9.680314410895182e-06, "loss": 0.8632, "mean_token_accuracy": 0.7213779985904694, "step": 1165 }, { "epoch": 0.7940074906367042, "grad_norm": 1.694303274154663, "learning_rate": 9.679521368027464e-06, "loss": 0.9667, "mean_token_accuracy": 0.7029579281806946, "step": 1166 }, { "epoch": 0.7946884576098059, "grad_norm": 1.9086657762527466, "learning_rate": 9.678727375294842e-06, "loss": 0.8958, "mean_token_accuracy": 0.7242754697799683, "step": 1167 }, { "epoch": 0.7953694245829077, "grad_norm": 1.7697991132736206, "learning_rate": 9.677932432858481e-06, "loss": 0.8682, "mean_token_accuracy": 0.724299281835556, "step": 1168 }, { "epoch": 0.7960503915560095, "grad_norm": 1.7666630744934082, "learning_rate": 9.677136540879742e-06, "loss": 0.9796, "mean_token_accuracy": 0.7089228928089142, "step": 1169 }, { "epoch": 0.7967313585291114, "grad_norm": 2.001152515411377, "learning_rate": 9.676339699520175e-06, "loss": 0.9424, "mean_token_accuracy": 0.7243922352790833, "step": 1170 }, { "epoch": 0.7974123255022132, "grad_norm": 1.9606746435165405, "learning_rate": 9.67554190894153e-06, "loss": 0.9279, "mean_token_accuracy": 0.7091791331768036, "step": 1171 }, { "epoch": 0.7980932924753149, "grad_norm": 1.9192875623703003, "learning_rate": 9.67474316930574e-06, "loss": 0.9878, "mean_token_accuracy": 0.6863990426063538, "step": 1172 }, { "epoch": 0.7987742594484167, "grad_norm": 1.9293631315231323, "learning_rate": 9.673943480774942e-06, "loss": 0.8961, "mean_token_accuracy": 0.7247385382652283, "step": 1173 }, { "epoch": 0.7994552264215186, "grad_norm": 1.8832898139953613, "learning_rate": 9.673142843511456e-06, "loss": 0.8848, "mean_token_accuracy": 0.7252858281135559, "step": 1174 }, { "epoch": 0.8001361933946204, "grad_norm": 1.6893119812011719, "learning_rate": 9.672341257677798e-06, "loss": 0.941, "mean_token_accuracy": 0.7127213180065155, "step": 1175 }, { "epoch": 0.8008171603677222, "grad_norm": 1.7490373849868774, "learning_rate": 9.671538723436676e-06, "loss": 0.9826, "mean_token_accuracy": 0.6974765360355377, "step": 1176 }, { "epoch": 0.8014981273408239, "grad_norm": 1.83999502658844, "learning_rate": 9.670735240950993e-06, "loss": 0.9546, "mean_token_accuracy": 0.7113222181797028, "step": 1177 }, { "epoch": 0.8021790943139258, "grad_norm": 1.704540729522705, "learning_rate": 9.66993081038384e-06, "loss": 0.9886, "mean_token_accuracy": 0.7055400311946869, "step": 1178 }, { "epoch": 0.8028600612870276, "grad_norm": 1.636008858680725, "learning_rate": 9.669125431898505e-06, "loss": 1.0461, "mean_token_accuracy": 0.6903463304042816, "step": 1179 }, { "epoch": 0.8035410282601294, "grad_norm": 1.6785374879837036, "learning_rate": 9.668319105658467e-06, "loss": 0.9963, "mean_token_accuracy": 0.6930195391178131, "step": 1180 }, { "epoch": 0.8042219952332312, "grad_norm": 1.7162607908248901, "learning_rate": 9.667511831827395e-06, "loss": 0.9766, "mean_token_accuracy": 0.7174504399299622, "step": 1181 }, { "epoch": 0.804902962206333, "grad_norm": 1.7870893478393555, "learning_rate": 9.666703610569153e-06, "loss": 0.8896, "mean_token_accuracy": 0.7319836914539337, "step": 1182 }, { "epoch": 0.8055839291794348, "grad_norm": 1.7789217233657837, "learning_rate": 9.665894442047796e-06, "loss": 1.0171, "mean_token_accuracy": 0.6973325312137604, "step": 1183 }, { "epoch": 0.8062648961525366, "grad_norm": 1.5866150856018066, "learning_rate": 9.665084326427575e-06, "loss": 1.0069, "mean_token_accuracy": 0.70192089676857, "step": 1184 }, { "epoch": 0.8069458631256384, "grad_norm": 1.8565729856491089, "learning_rate": 9.664273263872923e-06, "loss": 0.9036, "mean_token_accuracy": 0.7151606380939484, "step": 1185 }, { "epoch": 0.8076268300987403, "grad_norm": 1.816664457321167, "learning_rate": 9.663461254548478e-06, "loss": 1.0256, "mean_token_accuracy": 0.69109708070755, "step": 1186 }, { "epoch": 0.808307797071842, "grad_norm": 1.921326994895935, "learning_rate": 9.662648298619061e-06, "loss": 0.9637, "mean_token_accuracy": 0.6906255185604095, "step": 1187 }, { "epoch": 0.8089887640449438, "grad_norm": 1.7069050073623657, "learning_rate": 9.661834396249695e-06, "loss": 0.9588, "mean_token_accuracy": 0.7077401578426361, "step": 1188 }, { "epoch": 0.8096697310180456, "grad_norm": 1.7490392923355103, "learning_rate": 9.661019547605579e-06, "loss": 0.9135, "mean_token_accuracy": 0.7172181904315948, "step": 1189 }, { "epoch": 0.8103506979911475, "grad_norm": 1.9585872888565063, "learning_rate": 9.66020375285212e-06, "loss": 0.9206, "mean_token_accuracy": 0.7160815894603729, "step": 1190 }, { "epoch": 0.8110316649642493, "grad_norm": 2.076305389404297, "learning_rate": 9.65938701215491e-06, "loss": 0.8848, "mean_token_accuracy": 0.726393073797226, "step": 1191 }, { "epoch": 0.811712631937351, "grad_norm": 1.8622196912765503, "learning_rate": 9.658569325679733e-06, "loss": 0.8968, "mean_token_accuracy": 0.7213950455188751, "step": 1192 }, { "epoch": 0.8123935989104528, "grad_norm": 1.969014286994934, "learning_rate": 9.657750693592564e-06, "loss": 0.8636, "mean_token_accuracy": 0.7330661416053772, "step": 1193 }, { "epoch": 0.8130745658835546, "grad_norm": 1.678364872932434, "learning_rate": 9.656931116059574e-06, "loss": 1.0257, "mean_token_accuracy": 0.6952763795852661, "step": 1194 }, { "epoch": 0.8137555328566565, "grad_norm": 1.8877832889556885, "learning_rate": 9.656110593247124e-06, "loss": 0.9479, "mean_token_accuracy": 0.6995028555393219, "step": 1195 }, { "epoch": 0.8144364998297583, "grad_norm": 1.7947499752044678, "learning_rate": 9.655289125321766e-06, "loss": 0.8647, "mean_token_accuracy": 0.7389724254608154, "step": 1196 }, { "epoch": 0.81511746680286, "grad_norm": 1.924430251121521, "learning_rate": 9.654466712450241e-06, "loss": 0.8322, "mean_token_accuracy": 0.7412619292736053, "step": 1197 }, { "epoch": 0.8157984337759618, "grad_norm": 1.9835816621780396, "learning_rate": 9.65364335479949e-06, "loss": 0.8978, "mean_token_accuracy": 0.7271039485931396, "step": 1198 }, { "epoch": 0.8164794007490637, "grad_norm": 1.8842840194702148, "learning_rate": 9.652819052536638e-06, "loss": 0.8995, "mean_token_accuracy": 0.7140736877918243, "step": 1199 }, { "epoch": 0.8171603677221655, "grad_norm": 1.7646900415420532, "learning_rate": 9.651993805829007e-06, "loss": 0.9938, "mean_token_accuracy": 0.689206063747406, "step": 1200 }, { "epoch": 0.8178413346952673, "grad_norm": 1.7477885484695435, "learning_rate": 9.651167614844104e-06, "loss": 0.9356, "mean_token_accuracy": 0.7155735194683075, "step": 1201 }, { "epoch": 0.818522301668369, "grad_norm": 1.690820336341858, "learning_rate": 9.650340479749637e-06, "loss": 0.9529, "mean_token_accuracy": 0.7074473798274994, "step": 1202 }, { "epoch": 0.8192032686414709, "grad_norm": 1.5903432369232178, "learning_rate": 9.649512400713497e-06, "loss": 1.0246, "mean_token_accuracy": 0.6919549405574799, "step": 1203 }, { "epoch": 0.8198842356145727, "grad_norm": 1.5902278423309326, "learning_rate": 9.648683377903773e-06, "loss": 0.9461, "mean_token_accuracy": 0.690496176481247, "step": 1204 }, { "epoch": 0.8205652025876745, "grad_norm": 1.750913143157959, "learning_rate": 9.64785341148874e-06, "loss": 0.8686, "mean_token_accuracy": 0.7220234870910645, "step": 1205 }, { "epoch": 0.8212461695607763, "grad_norm": 1.8789281845092773, "learning_rate": 9.647022501636872e-06, "loss": 0.919, "mean_token_accuracy": 0.7113080322742462, "step": 1206 }, { "epoch": 0.821927136533878, "grad_norm": 1.6797380447387695, "learning_rate": 9.646190648516824e-06, "loss": 0.9154, "mean_token_accuracy": 0.7084365785121918, "step": 1207 }, { "epoch": 0.8226081035069799, "grad_norm": 1.7031970024108887, "learning_rate": 9.645357852297452e-06, "loss": 1.0753, "mean_token_accuracy": 0.6659196019172668, "step": 1208 }, { "epoch": 0.8232890704800817, "grad_norm": 1.7323874235153198, "learning_rate": 9.644524113147797e-06, "loss": 0.9433, "mean_token_accuracy": 0.7054809629917145, "step": 1209 }, { "epoch": 0.8239700374531835, "grad_norm": 1.7774523496627808, "learning_rate": 9.643689431237098e-06, "loss": 0.8807, "mean_token_accuracy": 0.7282648980617523, "step": 1210 }, { "epoch": 0.8246510044262854, "grad_norm": 1.7525460720062256, "learning_rate": 9.642853806734779e-06, "loss": 0.9911, "mean_token_accuracy": 0.6797225773334503, "step": 1211 }, { "epoch": 0.8253319713993871, "grad_norm": 1.749721646308899, "learning_rate": 9.642017239810457e-06, "loss": 0.9693, "mean_token_accuracy": 0.7084324657917023, "step": 1212 }, { "epoch": 0.8260129383724889, "grad_norm": 1.5996593236923218, "learning_rate": 9.641179730633945e-06, "loss": 1.0053, "mean_token_accuracy": 0.6937741339206696, "step": 1213 }, { "epoch": 0.8266939053455907, "grad_norm": 1.8799188137054443, "learning_rate": 9.64034127937524e-06, "loss": 0.8742, "mean_token_accuracy": 0.7288071513175964, "step": 1214 }, { "epoch": 0.8273748723186926, "grad_norm": 1.7242133617401123, "learning_rate": 9.639501886204534e-06, "loss": 0.9653, "mean_token_accuracy": 0.701635092496872, "step": 1215 }, { "epoch": 0.8280558392917944, "grad_norm": 1.9787721633911133, "learning_rate": 9.63866155129221e-06, "loss": 0.8305, "mean_token_accuracy": 0.7380355298519135, "step": 1216 }, { "epoch": 0.8287368062648961, "grad_norm": 1.6938034296035767, "learning_rate": 9.637820274808843e-06, "loss": 1.012, "mean_token_accuracy": 0.6916789412498474, "step": 1217 }, { "epoch": 0.8294177732379979, "grad_norm": 2.015446662902832, "learning_rate": 9.636978056925197e-06, "loss": 0.8319, "mean_token_accuracy": 0.7501365542411804, "step": 1218 }, { "epoch": 0.8300987402110998, "grad_norm": 1.7930023670196533, "learning_rate": 9.63613489781223e-06, "loss": 0.8947, "mean_token_accuracy": 0.7220171689987183, "step": 1219 }, { "epoch": 0.8307797071842016, "grad_norm": 1.7701555490493774, "learning_rate": 9.635290797641087e-06, "loss": 0.8884, "mean_token_accuracy": 0.7221902906894684, "step": 1220 }, { "epoch": 0.8314606741573034, "grad_norm": 1.8755931854248047, "learning_rate": 9.634445756583108e-06, "loss": 0.8965, "mean_token_accuracy": 0.7267372012138367, "step": 1221 }, { "epoch": 0.8321416411304052, "grad_norm": 1.6122400760650635, "learning_rate": 9.633599774809822e-06, "loss": 0.974, "mean_token_accuracy": 0.7026757597923279, "step": 1222 }, { "epoch": 0.832822608103507, "grad_norm": 1.6400855779647827, "learning_rate": 9.632752852492946e-06, "loss": 0.8617, "mean_token_accuracy": 0.7128010392189026, "step": 1223 }, { "epoch": 0.8335035750766088, "grad_norm": 1.7315094470977783, "learning_rate": 9.631904989804395e-06, "loss": 0.944, "mean_token_accuracy": 0.6935459971427917, "step": 1224 }, { "epoch": 0.8341845420497106, "grad_norm": 1.6716303825378418, "learning_rate": 9.631056186916271e-06, "loss": 0.981, "mean_token_accuracy": 0.7100951373577118, "step": 1225 }, { "epoch": 0.8348655090228124, "grad_norm": 1.7162458896636963, "learning_rate": 9.630206444000865e-06, "loss": 0.9395, "mean_token_accuracy": 0.7076989710330963, "step": 1226 }, { "epoch": 0.8355464759959143, "grad_norm": 1.7531684637069702, "learning_rate": 9.629355761230663e-06, "loss": 0.9926, "mean_token_accuracy": 0.7043269574642181, "step": 1227 }, { "epoch": 0.836227442969016, "grad_norm": 1.9473564624786377, "learning_rate": 9.628504138778336e-06, "loss": 0.9266, "mean_token_accuracy": 0.694770336151123, "step": 1228 }, { "epoch": 0.8369084099421178, "grad_norm": 1.8572648763656616, "learning_rate": 9.62765157681675e-06, "loss": 0.8547, "mean_token_accuracy": 0.7423422038555145, "step": 1229 }, { "epoch": 0.8375893769152196, "grad_norm": 1.750510334968567, "learning_rate": 9.626798075518964e-06, "loss": 0.8993, "mean_token_accuracy": 0.722957044839859, "step": 1230 }, { "epoch": 0.8382703438883214, "grad_norm": 1.7096363306045532, "learning_rate": 9.625943635058222e-06, "loss": 1.0305, "mean_token_accuracy": 0.7038693726062775, "step": 1231 }, { "epoch": 0.8389513108614233, "grad_norm": 1.9138621091842651, "learning_rate": 9.62508825560796e-06, "loss": 0.9112, "mean_token_accuracy": 0.7059042751789093, "step": 1232 }, { "epoch": 0.839632277834525, "grad_norm": 1.7933839559555054, "learning_rate": 9.624231937341808e-06, "loss": 0.9139, "mean_token_accuracy": 0.7123423218727112, "step": 1233 }, { "epoch": 0.8403132448076268, "grad_norm": 1.9560068845748901, "learning_rate": 9.623374680433585e-06, "loss": 0.8595, "mean_token_accuracy": 0.7385145723819733, "step": 1234 }, { "epoch": 0.8409942117807286, "grad_norm": 1.755037784576416, "learning_rate": 9.622516485057299e-06, "loss": 0.9635, "mean_token_accuracy": 0.7023265063762665, "step": 1235 }, { "epoch": 0.8416751787538305, "grad_norm": 1.9257049560546875, "learning_rate": 9.621657351387146e-06, "loss": 0.9074, "mean_token_accuracy": 0.7241981327533722, "step": 1236 }, { "epoch": 0.8423561457269323, "grad_norm": 1.9444955587387085, "learning_rate": 9.620797279597522e-06, "loss": 0.8974, "mean_token_accuracy": 0.7209935486316681, "step": 1237 }, { "epoch": 0.843037112700034, "grad_norm": 1.7784205675125122, "learning_rate": 9.619936269863002e-06, "loss": 1.0673, "mean_token_accuracy": 0.6788965463638306, "step": 1238 }, { "epoch": 0.8437180796731358, "grad_norm": 1.9321285486221313, "learning_rate": 9.619074322358359e-06, "loss": 0.8938, "mean_token_accuracy": 0.72782102227211, "step": 1239 }, { "epoch": 0.8443990466462377, "grad_norm": 1.825711965560913, "learning_rate": 9.618211437258554e-06, "loss": 0.903, "mean_token_accuracy": 0.7206749618053436, "step": 1240 }, { "epoch": 0.8450800136193395, "grad_norm": 1.800168514251709, "learning_rate": 9.617347614738738e-06, "loss": 1.0135, "mean_token_accuracy": 0.6802629232406616, "step": 1241 }, { "epoch": 0.8457609805924413, "grad_norm": 1.724875569343567, "learning_rate": 9.616482854974251e-06, "loss": 0.9292, "mean_token_accuracy": 0.7199026644229889, "step": 1242 }, { "epoch": 0.846441947565543, "grad_norm": 1.9037199020385742, "learning_rate": 9.61561715814063e-06, "loss": 0.8281, "mean_token_accuracy": 0.7475929260253906, "step": 1243 }, { "epoch": 0.8471229145386449, "grad_norm": 1.9153722524642944, "learning_rate": 9.614750524413591e-06, "loss": 0.8308, "mean_token_accuracy": 0.7366803288459778, "step": 1244 }, { "epoch": 0.8478038815117467, "grad_norm": 1.8300285339355469, "learning_rate": 9.61388295396905e-06, "loss": 0.8474, "mean_token_accuracy": 0.7286835312843323, "step": 1245 }, { "epoch": 0.8484848484848485, "grad_norm": 1.8957891464233398, "learning_rate": 9.613014446983106e-06, "loss": 0.9223, "mean_token_accuracy": 0.7224587202072144, "step": 1246 }, { "epoch": 0.8491658154579503, "grad_norm": 1.7288044691085815, "learning_rate": 9.612145003632054e-06, "loss": 0.9179, "mean_token_accuracy": 0.7229567468166351, "step": 1247 }, { "epoch": 0.849846782431052, "grad_norm": 1.8277229070663452, "learning_rate": 9.611274624092376e-06, "loss": 0.9777, "mean_token_accuracy": 0.6908672451972961, "step": 1248 }, { "epoch": 0.8505277494041539, "grad_norm": 1.9088060855865479, "learning_rate": 9.610403308540745e-06, "loss": 0.9524, "mean_token_accuracy": 0.7041372060775757, "step": 1249 }, { "epoch": 0.8512087163772557, "grad_norm": 1.851757287979126, "learning_rate": 9.609531057154022e-06, "loss": 0.922, "mean_token_accuracy": 0.7115908861160278, "step": 1250 }, { "epoch": 0.8518896833503575, "grad_norm": 1.9038527011871338, "learning_rate": 9.608657870109263e-06, "loss": 0.8145, "mean_token_accuracy": 0.7494410872459412, "step": 1251 }, { "epoch": 0.8525706503234594, "grad_norm": 1.9456313848495483, "learning_rate": 9.607783747583705e-06, "loss": 0.9058, "mean_token_accuracy": 0.7222963273525238, "step": 1252 }, { "epoch": 0.8532516172965611, "grad_norm": 1.6846987009048462, "learning_rate": 9.606908689754785e-06, "loss": 0.8955, "mean_token_accuracy": 0.7073348164558411, "step": 1253 }, { "epoch": 0.8539325842696629, "grad_norm": 1.6237199306488037, "learning_rate": 9.606032696800124e-06, "loss": 0.9149, "mean_token_accuracy": 0.7135954797267914, "step": 1254 }, { "epoch": 0.8546135512427647, "grad_norm": 1.8586081266403198, "learning_rate": 9.605155768897533e-06, "loss": 0.9199, "mean_token_accuracy": 0.7183062732219696, "step": 1255 }, { "epoch": 0.8552945182158666, "grad_norm": 1.7763636112213135, "learning_rate": 9.604277906225015e-06, "loss": 0.9294, "mean_token_accuracy": 0.7021408975124359, "step": 1256 }, { "epoch": 0.8559754851889684, "grad_norm": 1.8389960527420044, "learning_rate": 9.603399108960759e-06, "loss": 0.9181, "mean_token_accuracy": 0.71418496966362, "step": 1257 }, { "epoch": 0.8566564521620701, "grad_norm": 1.8045028448104858, "learning_rate": 9.602519377283149e-06, "loss": 0.9455, "mean_token_accuracy": 0.7122480869293213, "step": 1258 }, { "epoch": 0.8573374191351719, "grad_norm": 1.6869255304336548, "learning_rate": 9.601638711370755e-06, "loss": 0.9829, "mean_token_accuracy": 0.7060867846012115, "step": 1259 }, { "epoch": 0.8580183861082737, "grad_norm": 1.8165297508239746, "learning_rate": 9.600757111402336e-06, "loss": 0.8987, "mean_token_accuracy": 0.7209956645965576, "step": 1260 }, { "epoch": 0.8586993530813756, "grad_norm": 1.7073521614074707, "learning_rate": 9.599874577556845e-06, "loss": 1.0167, "mean_token_accuracy": 0.6789763569831848, "step": 1261 }, { "epoch": 0.8593803200544774, "grad_norm": 1.7308367490768433, "learning_rate": 9.598991110013418e-06, "loss": 0.9739, "mean_token_accuracy": 0.701383113861084, "step": 1262 }, { "epoch": 0.8600612870275791, "grad_norm": 1.7371035814285278, "learning_rate": 9.59810670895139e-06, "loss": 0.9608, "mean_token_accuracy": 0.71408411860466, "step": 1263 }, { "epoch": 0.8607422540006809, "grad_norm": 1.814604640007019, "learning_rate": 9.597221374550272e-06, "loss": 0.9801, "mean_token_accuracy": 0.7058002054691315, "step": 1264 }, { "epoch": 0.8614232209737828, "grad_norm": 1.809842824935913, "learning_rate": 9.596335106989777e-06, "loss": 0.9559, "mean_token_accuracy": 0.7045382559299469, "step": 1265 }, { "epoch": 0.8621041879468846, "grad_norm": 1.6497564315795898, "learning_rate": 9.595447906449805e-06, "loss": 0.9617, "mean_token_accuracy": 0.7044810652732849, "step": 1266 }, { "epoch": 0.8627851549199864, "grad_norm": 1.7969720363616943, "learning_rate": 9.594559773110436e-06, "loss": 0.8384, "mean_token_accuracy": 0.7219753265380859, "step": 1267 }, { "epoch": 0.8634661218930881, "grad_norm": 1.9356120824813843, "learning_rate": 9.593670707151952e-06, "loss": 0.9296, "mean_token_accuracy": 0.7002801299095154, "step": 1268 }, { "epoch": 0.86414708886619, "grad_norm": 1.5909727811813354, "learning_rate": 9.592780708754815e-06, "loss": 1.0389, "mean_token_accuracy": 0.6955714821815491, "step": 1269 }, { "epoch": 0.8648280558392918, "grad_norm": 1.796966314315796, "learning_rate": 9.591889778099682e-06, "loss": 0.9099, "mean_token_accuracy": 0.7187400460243225, "step": 1270 }, { "epoch": 0.8655090228123936, "grad_norm": 2.050086259841919, "learning_rate": 9.590997915367395e-06, "loss": 0.9022, "mean_token_accuracy": 0.7192157804965973, "step": 1271 }, { "epoch": 0.8661899897854954, "grad_norm": 1.9071494340896606, "learning_rate": 9.59010512073899e-06, "loss": 0.7484, "mean_token_accuracy": 0.7705189883708954, "step": 1272 }, { "epoch": 0.8668709567585972, "grad_norm": 1.8600503206253052, "learning_rate": 9.589211394395689e-06, "loss": 0.8864, "mean_token_accuracy": 0.7312121391296387, "step": 1273 }, { "epoch": 0.867551923731699, "grad_norm": 1.733795404434204, "learning_rate": 9.588316736518902e-06, "loss": 0.9558, "mean_token_accuracy": 0.7108137905597687, "step": 1274 }, { "epoch": 0.8682328907048008, "grad_norm": 1.7386653423309326, "learning_rate": 9.58742114729023e-06, "loss": 0.793, "mean_token_accuracy": 0.75159552693367, "step": 1275 }, { "epoch": 0.8689138576779026, "grad_norm": 1.7997620105743408, "learning_rate": 9.586524626891462e-06, "loss": 0.8505, "mean_token_accuracy": 0.7343891561031342, "step": 1276 }, { "epoch": 0.8695948246510045, "grad_norm": 1.748645305633545, "learning_rate": 9.58562717550458e-06, "loss": 0.9086, "mean_token_accuracy": 0.6994458436965942, "step": 1277 }, { "epoch": 0.8702757916241062, "grad_norm": 1.5830680131912231, "learning_rate": 9.584728793311748e-06, "loss": 1.1121, "mean_token_accuracy": 0.6594838201999664, "step": 1278 }, { "epoch": 0.870956758597208, "grad_norm": 1.6131870746612549, "learning_rate": 9.583829480495325e-06, "loss": 0.939, "mean_token_accuracy": 0.6978248953819275, "step": 1279 }, { "epoch": 0.8716377255703098, "grad_norm": 1.7852433919906616, "learning_rate": 9.582929237237856e-06, "loss": 0.9144, "mean_token_accuracy": 0.7210577130317688, "step": 1280 }, { "epoch": 0.8723186925434117, "grad_norm": 1.719745397567749, "learning_rate": 9.582028063722072e-06, "loss": 0.8791, "mean_token_accuracy": 0.7112641930580139, "step": 1281 }, { "epoch": 0.8729996595165135, "grad_norm": 1.7482811212539673, "learning_rate": 9.581125960130904e-06, "loss": 0.8522, "mean_token_accuracy": 0.7306362390518188, "step": 1282 }, { "epoch": 0.8736806264896153, "grad_norm": 1.9030932188034058, "learning_rate": 9.580222926647455e-06, "loss": 0.8849, "mean_token_accuracy": 0.7229642570018768, "step": 1283 }, { "epoch": 0.874361593462717, "grad_norm": 1.7977255582809448, "learning_rate": 9.579318963455033e-06, "loss": 0.9234, "mean_token_accuracy": 0.7085744738578796, "step": 1284 }, { "epoch": 0.8750425604358189, "grad_norm": 1.95963454246521, "learning_rate": 9.578414070737124e-06, "loss": 0.8487, "mean_token_accuracy": 0.742483526468277, "step": 1285 }, { "epoch": 0.8757235274089207, "grad_norm": 1.8200833797454834, "learning_rate": 9.577508248677404e-06, "loss": 0.9165, "mean_token_accuracy": 0.7130768895149231, "step": 1286 }, { "epoch": 0.8764044943820225, "grad_norm": 1.8470381498336792, "learning_rate": 9.576601497459745e-06, "loss": 0.8824, "mean_token_accuracy": 0.715479165315628, "step": 1287 }, { "epoch": 0.8770854613551243, "grad_norm": 1.8056962490081787, "learning_rate": 9.575693817268199e-06, "loss": 0.9883, "mean_token_accuracy": 0.7010474801063538, "step": 1288 }, { "epoch": 0.877766428328226, "grad_norm": 1.5777684450149536, "learning_rate": 9.57478520828701e-06, "loss": 0.9679, "mean_token_accuracy": 0.6914712488651276, "step": 1289 }, { "epoch": 0.8784473953013279, "grad_norm": 1.7301387786865234, "learning_rate": 9.573875670700611e-06, "loss": 0.9968, "mean_token_accuracy": 0.6961714327335358, "step": 1290 }, { "epoch": 0.8791283622744297, "grad_norm": 1.7702703475952148, "learning_rate": 9.572965204693625e-06, "loss": 0.9797, "mean_token_accuracy": 0.7040505409240723, "step": 1291 }, { "epoch": 0.8798093292475315, "grad_norm": 1.6675519943237305, "learning_rate": 9.572053810450859e-06, "loss": 0.9541, "mean_token_accuracy": 0.7145614326000214, "step": 1292 }, { "epoch": 0.8804902962206334, "grad_norm": 1.7657772302627563, "learning_rate": 9.571141488157311e-06, "loss": 0.8826, "mean_token_accuracy": 0.7263154685497284, "step": 1293 }, { "epoch": 0.8811712631937351, "grad_norm": 1.9213142395019531, "learning_rate": 9.570228237998167e-06, "loss": 0.9495, "mean_token_accuracy": 0.7077927589416504, "step": 1294 }, { "epoch": 0.8818522301668369, "grad_norm": 1.8291773796081543, "learning_rate": 9.569314060158804e-06, "loss": 0.98, "mean_token_accuracy": 0.698774129152298, "step": 1295 }, { "epoch": 0.8825331971399387, "grad_norm": 1.7054400444030762, "learning_rate": 9.568398954824782e-06, "loss": 0.9858, "mean_token_accuracy": 0.7051801085472107, "step": 1296 }, { "epoch": 0.8832141641130405, "grad_norm": 1.8478219509124756, "learning_rate": 9.567482922181855e-06, "loss": 0.8565, "mean_token_accuracy": 0.7359857857227325, "step": 1297 }, { "epoch": 0.8838951310861424, "grad_norm": 1.6342017650604248, "learning_rate": 9.566565962415958e-06, "loss": 1.0046, "mean_token_accuracy": 0.6887188851833344, "step": 1298 }, { "epoch": 0.8845760980592441, "grad_norm": 1.7492485046386719, "learning_rate": 9.565648075713224e-06, "loss": 1.0001, "mean_token_accuracy": 0.6914623379707336, "step": 1299 }, { "epoch": 0.8852570650323459, "grad_norm": 1.8332442045211792, "learning_rate": 9.564729262259963e-06, "loss": 1.0191, "mean_token_accuracy": 0.688644528388977, "step": 1300 }, { "epoch": 0.8859380320054477, "grad_norm": 1.8758291006088257, "learning_rate": 9.563809522242686e-06, "loss": 1.0047, "mean_token_accuracy": 0.7018884420394897, "step": 1301 }, { "epoch": 0.8866189989785496, "grad_norm": 1.7795690298080444, "learning_rate": 9.562888855848078e-06, "loss": 0.8751, "mean_token_accuracy": 0.7321815192699432, "step": 1302 }, { "epoch": 0.8872999659516514, "grad_norm": 1.7450295686721802, "learning_rate": 9.561967263263021e-06, "loss": 0.9252, "mean_token_accuracy": 0.7063422203063965, "step": 1303 }, { "epoch": 0.8879809329247531, "grad_norm": 1.6732677221298218, "learning_rate": 9.561044744674586e-06, "loss": 1.0289, "mean_token_accuracy": 0.6824813485145569, "step": 1304 }, { "epoch": 0.8886618998978549, "grad_norm": 1.7312692403793335, "learning_rate": 9.560121300270025e-06, "loss": 0.8928, "mean_token_accuracy": 0.7174566388130188, "step": 1305 }, { "epoch": 0.8893428668709568, "grad_norm": 1.8757474422454834, "learning_rate": 9.559196930236783e-06, "loss": 0.8633, "mean_token_accuracy": 0.7301332056522369, "step": 1306 }, { "epoch": 0.8900238338440586, "grad_norm": 1.6410812139511108, "learning_rate": 9.558271634762493e-06, "loss": 0.9386, "mean_token_accuracy": 0.7111132442951202, "step": 1307 }, { "epoch": 0.8907048008171604, "grad_norm": 1.8763980865478516, "learning_rate": 9.557345414034974e-06, "loss": 0.872, "mean_token_accuracy": 0.7335589528083801, "step": 1308 }, { "epoch": 0.8913857677902621, "grad_norm": 1.9127739667892456, "learning_rate": 9.55641826824223e-06, "loss": 0.8545, "mean_token_accuracy": 0.719176858663559, "step": 1309 }, { "epoch": 0.892066734763364, "grad_norm": 1.8355860710144043, "learning_rate": 9.555490197572464e-06, "loss": 0.9246, "mean_token_accuracy": 0.7121459245681763, "step": 1310 }, { "epoch": 0.8927477017364658, "grad_norm": 1.768162488937378, "learning_rate": 9.55456120221405e-06, "loss": 0.9536, "mean_token_accuracy": 0.7036756575107574, "step": 1311 }, { "epoch": 0.8934286687095676, "grad_norm": 1.6705138683319092, "learning_rate": 9.553631282355566e-06, "loss": 0.8666, "mean_token_accuracy": 0.7374596297740936, "step": 1312 }, { "epoch": 0.8941096356826694, "grad_norm": 2.017153024673462, "learning_rate": 9.552700438185765e-06, "loss": 0.8376, "mean_token_accuracy": 0.7487968802452087, "step": 1313 }, { "epoch": 0.8947906026557712, "grad_norm": 1.9194785356521606, "learning_rate": 9.551768669893594e-06, "loss": 0.9432, "mean_token_accuracy": 0.7130188643932343, "step": 1314 }, { "epoch": 0.895471569628873, "grad_norm": 1.7185858488082886, "learning_rate": 9.550835977668188e-06, "loss": 0.9712, "mean_token_accuracy": 0.7136608064174652, "step": 1315 }, { "epoch": 0.8961525366019748, "grad_norm": 1.781015396118164, "learning_rate": 9.549902361698868e-06, "loss": 1.0328, "mean_token_accuracy": 0.685062825679779, "step": 1316 }, { "epoch": 0.8968335035750766, "grad_norm": 1.858420491218567, "learning_rate": 9.548967822175142e-06, "loss": 0.8817, "mean_token_accuracy": 0.7297031879425049, "step": 1317 }, { "epoch": 0.8975144705481785, "grad_norm": 1.6796718835830688, "learning_rate": 9.548032359286704e-06, "loss": 0.9983, "mean_token_accuracy": 0.6973167061805725, "step": 1318 }, { "epoch": 0.8981954375212802, "grad_norm": 1.7884842157363892, "learning_rate": 9.54709597322344e-06, "loss": 0.9209, "mean_token_accuracy": 0.71080082654953, "step": 1319 }, { "epoch": 0.898876404494382, "grad_norm": 1.7249722480773926, "learning_rate": 9.54615866417542e-06, "loss": 0.8107, "mean_token_accuracy": 0.7486619055271149, "step": 1320 }, { "epoch": 0.8995573714674838, "grad_norm": 1.7357466220855713, "learning_rate": 9.5452204323329e-06, "loss": 0.9718, "mean_token_accuracy": 0.7017837762832642, "step": 1321 }, { "epoch": 0.9002383384405857, "grad_norm": 1.7768385410308838, "learning_rate": 9.544281277886328e-06, "loss": 0.9347, "mean_token_accuracy": 0.687223345041275, "step": 1322 }, { "epoch": 0.9009193054136875, "grad_norm": 1.8065022230148315, "learning_rate": 9.543341201026339e-06, "loss": 0.934, "mean_token_accuracy": 0.7154054939746857, "step": 1323 }, { "epoch": 0.9016002723867892, "grad_norm": 1.7846463918685913, "learning_rate": 9.54240020194375e-06, "loss": 0.9456, "mean_token_accuracy": 0.7057912349700928, "step": 1324 }, { "epoch": 0.902281239359891, "grad_norm": 1.752097487449646, "learning_rate": 9.541458280829566e-06, "loss": 0.8963, "mean_token_accuracy": 0.7202872037887573, "step": 1325 }, { "epoch": 0.9029622063329928, "grad_norm": 1.7589963674545288, "learning_rate": 9.540515437874982e-06, "loss": 0.9763, "mean_token_accuracy": 0.7012647688388824, "step": 1326 }, { "epoch": 0.9036431733060947, "grad_norm": 1.7143704891204834, "learning_rate": 9.539571673271384e-06, "loss": 0.9749, "mean_token_accuracy": 0.7076754570007324, "step": 1327 }, { "epoch": 0.9043241402791965, "grad_norm": 1.6671109199523926, "learning_rate": 9.538626987210335e-06, "loss": 0.8937, "mean_token_accuracy": 0.714063972234726, "step": 1328 }, { "epoch": 0.9050051072522982, "grad_norm": 1.5903246402740479, "learning_rate": 9.537681379883594e-06, "loss": 0.918, "mean_token_accuracy": 0.7075797617435455, "step": 1329 }, { "epoch": 0.9056860742254, "grad_norm": 1.7641247510910034, "learning_rate": 9.536734851483102e-06, "loss": 0.9169, "mean_token_accuracy": 0.7231203317642212, "step": 1330 }, { "epoch": 0.9063670411985019, "grad_norm": 1.7209962606430054, "learning_rate": 9.535787402200992e-06, "loss": 0.8211, "mean_token_accuracy": 0.7185207009315491, "step": 1331 }, { "epoch": 0.9070480081716037, "grad_norm": 1.7949016094207764, "learning_rate": 9.534839032229574e-06, "loss": 0.91, "mean_token_accuracy": 0.7160229086875916, "step": 1332 }, { "epoch": 0.9077289751447055, "grad_norm": 1.8708707094192505, "learning_rate": 9.533889741761355e-06, "loss": 0.9338, "mean_token_accuracy": 0.7162827253341675, "step": 1333 }, { "epoch": 0.9084099421178072, "grad_norm": 1.637528657913208, "learning_rate": 9.532939530989027e-06, "loss": 1.0012, "mean_token_accuracy": 0.6911028325557709, "step": 1334 }, { "epoch": 0.9090909090909091, "grad_norm": 1.7770684957504272, "learning_rate": 9.531988400105462e-06, "loss": 1.0325, "mean_token_accuracy": 0.67650106549263, "step": 1335 }, { "epoch": 0.9097718760640109, "grad_norm": 1.6854019165039062, "learning_rate": 9.531036349303729e-06, "loss": 0.9743, "mean_token_accuracy": 0.7058890461921692, "step": 1336 }, { "epoch": 0.9104528430371127, "grad_norm": 1.4888265132904053, "learning_rate": 9.530083378777075e-06, "loss": 0.99, "mean_token_accuracy": 0.7015166878700256, "step": 1337 }, { "epoch": 0.9111338100102145, "grad_norm": 1.799291729927063, "learning_rate": 9.529129488718938e-06, "loss": 0.9625, "mean_token_accuracy": 0.7084206938743591, "step": 1338 }, { "epoch": 0.9118147769833163, "grad_norm": 1.751110315322876, "learning_rate": 9.528174679322943e-06, "loss": 0.911, "mean_token_accuracy": 0.6984183490276337, "step": 1339 }, { "epoch": 0.9124957439564181, "grad_norm": 1.654313087463379, "learning_rate": 9.527218950782898e-06, "loss": 1.0132, "mean_token_accuracy": 0.6969050168991089, "step": 1340 }, { "epoch": 0.9131767109295199, "grad_norm": 1.61264169216156, "learning_rate": 9.526262303292801e-06, "loss": 1.0032, "mean_token_accuracy": 0.691008597612381, "step": 1341 }, { "epoch": 0.9138576779026217, "grad_norm": 1.7383400201797485, "learning_rate": 9.525304737046838e-06, "loss": 1.0701, "mean_token_accuracy": 0.6834848523139954, "step": 1342 }, { "epoch": 0.9145386448757236, "grad_norm": 1.799468994140625, "learning_rate": 9.524346252239377e-06, "loss": 0.8918, "mean_token_accuracy": 0.73136305809021, "step": 1343 }, { "epoch": 0.9152196118488254, "grad_norm": 1.739681363105774, "learning_rate": 9.523386849064972e-06, "loss": 0.8964, "mean_token_accuracy": 0.7292089760303497, "step": 1344 }, { "epoch": 0.9159005788219271, "grad_norm": 1.795621633529663, "learning_rate": 9.522426527718369e-06, "loss": 0.8804, "mean_token_accuracy": 0.7286984920501709, "step": 1345 }, { "epoch": 0.9165815457950289, "grad_norm": 2.032900094985962, "learning_rate": 9.521465288394498e-06, "loss": 0.8558, "mean_token_accuracy": 0.7381353080272675, "step": 1346 }, { "epoch": 0.9172625127681308, "grad_norm": 1.7226364612579346, "learning_rate": 9.52050313128847e-06, "loss": 0.8765, "mean_token_accuracy": 0.7260130047798157, "step": 1347 }, { "epoch": 0.9179434797412326, "grad_norm": 1.6414341926574707, "learning_rate": 9.519540056595593e-06, "loss": 0.9184, "mean_token_accuracy": 0.7071200013160706, "step": 1348 }, { "epoch": 0.9186244467143344, "grad_norm": 1.873913049697876, "learning_rate": 9.518576064511349e-06, "loss": 0.862, "mean_token_accuracy": 0.7423662841320038, "step": 1349 }, { "epoch": 0.9193054136874361, "grad_norm": 1.6670517921447754, "learning_rate": 9.517611155231417e-06, "loss": 0.927, "mean_token_accuracy": 0.7011347711086273, "step": 1350 }, { "epoch": 0.919986380660538, "grad_norm": 1.6588120460510254, "learning_rate": 9.516645328951657e-06, "loss": 0.8948, "mean_token_accuracy": 0.7320249676704407, "step": 1351 }, { "epoch": 0.9206673476336398, "grad_norm": 1.5606603622436523, "learning_rate": 9.515678585868113e-06, "loss": 1.0094, "mean_token_accuracy": 0.6876223087310791, "step": 1352 }, { "epoch": 0.9213483146067416, "grad_norm": 1.7628477811813354, "learning_rate": 9.514710926177019e-06, "loss": 0.9491, "mean_token_accuracy": 0.7163446247577667, "step": 1353 }, { "epoch": 0.9220292815798434, "grad_norm": 1.7022026777267456, "learning_rate": 9.513742350074795e-06, "loss": 0.9151, "mean_token_accuracy": 0.705122172832489, "step": 1354 }, { "epoch": 0.9227102485529451, "grad_norm": 1.704234004020691, "learning_rate": 9.512772857758044e-06, "loss": 0.9081, "mean_token_accuracy": 0.7250590920448303, "step": 1355 }, { "epoch": 0.923391215526047, "grad_norm": 1.783499002456665, "learning_rate": 9.511802449423557e-06, "loss": 0.8735, "mean_token_accuracy": 0.7244462668895721, "step": 1356 }, { "epoch": 0.9240721824991488, "grad_norm": 1.6807365417480469, "learning_rate": 9.510831125268314e-06, "loss": 0.944, "mean_token_accuracy": 0.716223418712616, "step": 1357 }, { "epoch": 0.9247531494722506, "grad_norm": 1.884333848953247, "learning_rate": 9.509858885489473e-06, "loss": 0.9517, "mean_token_accuracy": 0.6913713216781616, "step": 1358 }, { "epoch": 0.9254341164453525, "grad_norm": 1.7600163221359253, "learning_rate": 9.508885730284386e-06, "loss": 1.0172, "mean_token_accuracy": 0.6848571002483368, "step": 1359 }, { "epoch": 0.9261150834184542, "grad_norm": 1.873063325881958, "learning_rate": 9.507911659850585e-06, "loss": 0.9832, "mean_token_accuracy": 0.6919971406459808, "step": 1360 }, { "epoch": 0.926796050391556, "grad_norm": 1.7677823305130005, "learning_rate": 9.50693667438579e-06, "loss": 0.9464, "mean_token_accuracy": 0.7034784555435181, "step": 1361 }, { "epoch": 0.9274770173646578, "grad_norm": 1.8684087991714478, "learning_rate": 9.505960774087907e-06, "loss": 0.8242, "mean_token_accuracy": 0.7383258938789368, "step": 1362 }, { "epoch": 0.9281579843377596, "grad_norm": 1.7691553831100464, "learning_rate": 9.50498395915503e-06, "loss": 0.8506, "mean_token_accuracy": 0.7368349432945251, "step": 1363 }, { "epoch": 0.9288389513108615, "grad_norm": 1.6196272373199463, "learning_rate": 9.504006229785435e-06, "loss": 0.8632, "mean_token_accuracy": 0.7316581606864929, "step": 1364 }, { "epoch": 0.9295199182839632, "grad_norm": 1.703822374343872, "learning_rate": 9.503027586177582e-06, "loss": 0.9801, "mean_token_accuracy": 0.7055411040782928, "step": 1365 }, { "epoch": 0.930200885257065, "grad_norm": 1.960737943649292, "learning_rate": 9.502048028530122e-06, "loss": 0.8813, "mean_token_accuracy": 0.7304438948631287, "step": 1366 }, { "epoch": 0.9308818522301668, "grad_norm": 1.6655274629592896, "learning_rate": 9.50106755704189e-06, "loss": 0.9377, "mean_token_accuracy": 0.6937447786331177, "step": 1367 }, { "epoch": 0.9315628192032687, "grad_norm": 1.7508503198623657, "learning_rate": 9.500086171911902e-06, "loss": 0.8807, "mean_token_accuracy": 0.7120013535022736, "step": 1368 }, { "epoch": 0.9322437861763705, "grad_norm": 1.8378853797912598, "learning_rate": 9.499103873339365e-06, "loss": 0.9032, "mean_token_accuracy": 0.7231522500514984, "step": 1369 }, { "epoch": 0.9329247531494722, "grad_norm": 1.7637168169021606, "learning_rate": 9.498120661523668e-06, "loss": 0.9408, "mean_token_accuracy": 0.6973828375339508, "step": 1370 }, { "epoch": 0.933605720122574, "grad_norm": 1.614024043083191, "learning_rate": 9.49713653666439e-06, "loss": 1.0666, "mean_token_accuracy": 0.6816617548465729, "step": 1371 }, { "epoch": 0.9342866870956759, "grad_norm": 1.794519305229187, "learning_rate": 9.496151498961288e-06, "loss": 0.9358, "mean_token_accuracy": 0.708680659532547, "step": 1372 }, { "epoch": 0.9349676540687777, "grad_norm": 1.6954325437545776, "learning_rate": 9.495165548614308e-06, "loss": 0.9116, "mean_token_accuracy": 0.7253568470478058, "step": 1373 }, { "epoch": 0.9356486210418795, "grad_norm": 1.6646203994750977, "learning_rate": 9.494178685823586e-06, "loss": 0.8382, "mean_token_accuracy": 0.7422456741333008, "step": 1374 }, { "epoch": 0.9363295880149812, "grad_norm": 1.6028856039047241, "learning_rate": 9.493190910789434e-06, "loss": 0.9347, "mean_token_accuracy": 0.7159057557582855, "step": 1375 }, { "epoch": 0.937010554988083, "grad_norm": 1.7889769077301025, "learning_rate": 9.492202223712358e-06, "loss": 0.8806, "mean_token_accuracy": 0.73721644282341, "step": 1376 }, { "epoch": 0.9376915219611849, "grad_norm": 1.7294344902038574, "learning_rate": 9.491212624793041e-06, "loss": 0.945, "mean_token_accuracy": 0.7071895599365234, "step": 1377 }, { "epoch": 0.9383724889342867, "grad_norm": 1.589557409286499, "learning_rate": 9.490222114232357e-06, "loss": 0.9531, "mean_token_accuracy": 0.7036537528038025, "step": 1378 }, { "epoch": 0.9390534559073885, "grad_norm": 1.7099226713180542, "learning_rate": 9.489230692231363e-06, "loss": 1.0582, "mean_token_accuracy": 0.6904714703559875, "step": 1379 }, { "epoch": 0.9397344228804902, "grad_norm": 1.7695024013519287, "learning_rate": 9.488238358991302e-06, "loss": 0.9665, "mean_token_accuracy": 0.7041118443012238, "step": 1380 }, { "epoch": 0.9404153898535921, "grad_norm": 1.6446332931518555, "learning_rate": 9.4872451147136e-06, "loss": 0.9261, "mean_token_accuracy": 0.7058785259723663, "step": 1381 }, { "epoch": 0.9410963568266939, "grad_norm": 1.6836919784545898, "learning_rate": 9.48625095959987e-06, "loss": 0.9685, "mean_token_accuracy": 0.7095093131065369, "step": 1382 }, { "epoch": 0.9417773237997957, "grad_norm": 1.6533249616622925, "learning_rate": 9.485255893851906e-06, "loss": 0.908, "mean_token_accuracy": 0.7156163156032562, "step": 1383 }, { "epoch": 0.9424582907728976, "grad_norm": 1.663745641708374, "learning_rate": 9.484259917671695e-06, "loss": 0.9057, "mean_token_accuracy": 0.7077637910842896, "step": 1384 }, { "epoch": 0.9431392577459993, "grad_norm": 1.6761473417282104, "learning_rate": 9.483263031261397e-06, "loss": 0.8853, "mean_token_accuracy": 0.7248724102973938, "step": 1385 }, { "epoch": 0.9438202247191011, "grad_norm": 1.8022072315216064, "learning_rate": 9.48226523482337e-06, "loss": 0.9314, "mean_token_accuracy": 0.7158766686916351, "step": 1386 }, { "epoch": 0.9445011916922029, "grad_norm": 1.766812801361084, "learning_rate": 9.481266528560147e-06, "loss": 0.9197, "mean_token_accuracy": 0.706868588924408, "step": 1387 }, { "epoch": 0.9451821586653048, "grad_norm": 1.8442860841751099, "learning_rate": 9.480266912674447e-06, "loss": 0.9233, "mean_token_accuracy": 0.7168764770030975, "step": 1388 }, { "epoch": 0.9458631256384066, "grad_norm": 1.6160005331039429, "learning_rate": 9.47926638736918e-06, "loss": 0.942, "mean_token_accuracy": 0.7094301283359528, "step": 1389 }, { "epoch": 0.9465440926115083, "grad_norm": 1.6811944246292114, "learning_rate": 9.47826495284743e-06, "loss": 0.9844, "mean_token_accuracy": 0.6943905353546143, "step": 1390 }, { "epoch": 0.9472250595846101, "grad_norm": 1.7083064317703247, "learning_rate": 9.477262609312476e-06, "loss": 0.9401, "mean_token_accuracy": 0.7108964622020721, "step": 1391 }, { "epoch": 0.947906026557712, "grad_norm": 1.854647159576416, "learning_rate": 9.476259356967776e-06, "loss": 0.9071, "mean_token_accuracy": 0.7064099311828613, "step": 1392 }, { "epoch": 0.9485869935308138, "grad_norm": 1.6735953092575073, "learning_rate": 9.475255196016972e-06, "loss": 0.8345, "mean_token_accuracy": 0.7411974370479584, "step": 1393 }, { "epoch": 0.9492679605039156, "grad_norm": 1.7410629987716675, "learning_rate": 9.474250126663896e-06, "loss": 0.7818, "mean_token_accuracy": 0.7572416365146637, "step": 1394 }, { "epoch": 0.9499489274770173, "grad_norm": 1.7858902215957642, "learning_rate": 9.473244149112556e-06, "loss": 0.8767, "mean_token_accuracy": 0.7342624366283417, "step": 1395 }, { "epoch": 0.9506298944501191, "grad_norm": 1.9585216045379639, "learning_rate": 9.47223726356715e-06, "loss": 0.987, "mean_token_accuracy": 0.7050594389438629, "step": 1396 }, { "epoch": 0.951310861423221, "grad_norm": 1.650991439819336, "learning_rate": 9.471229470232058e-06, "loss": 1.0031, "mean_token_accuracy": 0.6917406916618347, "step": 1397 }, { "epoch": 0.9519918283963228, "grad_norm": 1.790953278541565, "learning_rate": 9.470220769311848e-06, "loss": 0.909, "mean_token_accuracy": 0.728043407201767, "step": 1398 }, { "epoch": 0.9526727953694246, "grad_norm": 1.756285309791565, "learning_rate": 9.469211161011267e-06, "loss": 0.9321, "mean_token_accuracy": 0.7154143154621124, "step": 1399 }, { "epoch": 0.9533537623425263, "grad_norm": 1.8589266538619995, "learning_rate": 9.468200645535251e-06, "loss": 0.9435, "mean_token_accuracy": 0.7243132293224335, "step": 1400 }, { "epoch": 0.9540347293156282, "grad_norm": 1.7047231197357178, "learning_rate": 9.467189223088915e-06, "loss": 0.9674, "mean_token_accuracy": 0.7076576948165894, "step": 1401 }, { "epoch": 0.95471569628873, "grad_norm": 1.858126163482666, "learning_rate": 9.466176893877563e-06, "loss": 0.9044, "mean_token_accuracy": 0.7142670154571533, "step": 1402 }, { "epoch": 0.9553966632618318, "grad_norm": 1.9765174388885498, "learning_rate": 9.46516365810668e-06, "loss": 0.8563, "mean_token_accuracy": 0.7337316274642944, "step": 1403 }, { "epoch": 0.9560776302349336, "grad_norm": 1.7359815835952759, "learning_rate": 9.464149515981936e-06, "loss": 0.9738, "mean_token_accuracy": 0.7082333862781525, "step": 1404 }, { "epoch": 0.9567585972080354, "grad_norm": 1.8007320165634155, "learning_rate": 9.463134467709187e-06, "loss": 0.9628, "mean_token_accuracy": 0.7005479335784912, "step": 1405 }, { "epoch": 0.9574395641811372, "grad_norm": 1.6704788208007812, "learning_rate": 9.462118513494467e-06, "loss": 1.0108, "mean_token_accuracy": 0.6976579427719116, "step": 1406 }, { "epoch": 0.958120531154239, "grad_norm": 1.8170368671417236, "learning_rate": 9.461101653544001e-06, "loss": 0.9363, "mean_token_accuracy": 0.714349776506424, "step": 1407 }, { "epoch": 0.9588014981273408, "grad_norm": 1.6351209878921509, "learning_rate": 9.460083888064194e-06, "loss": 0.9028, "mean_token_accuracy": 0.7218720018863678, "step": 1408 }, { "epoch": 0.9594824651004427, "grad_norm": 1.7019774913787842, "learning_rate": 9.459065217261635e-06, "loss": 0.9086, "mean_token_accuracy": 0.7159274518489838, "step": 1409 }, { "epoch": 0.9601634320735445, "grad_norm": 1.7704575061798096, "learning_rate": 9.458045641343096e-06, "loss": 1.0322, "mean_token_accuracy": 0.6901247501373291, "step": 1410 }, { "epoch": 0.9608443990466462, "grad_norm": 1.7433433532714844, "learning_rate": 9.457025160515536e-06, "loss": 0.9306, "mean_token_accuracy": 0.7056136429309845, "step": 1411 }, { "epoch": 0.961525366019748, "grad_norm": 1.902492642402649, "learning_rate": 9.456003774986096e-06, "loss": 0.8325, "mean_token_accuracy": 0.7342240810394287, "step": 1412 }, { "epoch": 0.9622063329928499, "grad_norm": 1.9608193635940552, "learning_rate": 9.454981484962097e-06, "loss": 0.8096, "mean_token_accuracy": 0.7458337545394897, "step": 1413 }, { "epoch": 0.9628872999659517, "grad_norm": 1.800539493560791, "learning_rate": 9.453958290651051e-06, "loss": 0.8594, "mean_token_accuracy": 0.7331580221652985, "step": 1414 }, { "epoch": 0.9635682669390535, "grad_norm": 1.662778377532959, "learning_rate": 9.452934192260645e-06, "loss": 0.9604, "mean_token_accuracy": 0.6862944662570953, "step": 1415 }, { "epoch": 0.9642492339121552, "grad_norm": 1.994627833366394, "learning_rate": 9.451909189998756e-06, "loss": 0.9756, "mean_token_accuracy": 0.6907395422458649, "step": 1416 }, { "epoch": 0.964930200885257, "grad_norm": 1.6409798860549927, "learning_rate": 9.450883284073444e-06, "loss": 0.9035, "mean_token_accuracy": 0.7191045880317688, "step": 1417 }, { "epoch": 0.9656111678583589, "grad_norm": 1.6834880113601685, "learning_rate": 9.449856474692951e-06, "loss": 0.8426, "mean_token_accuracy": 0.7172749638557434, "step": 1418 }, { "epoch": 0.9662921348314607, "grad_norm": 1.8293122053146362, "learning_rate": 9.448828762065698e-06, "loss": 0.8636, "mean_token_accuracy": 0.7270095646381378, "step": 1419 }, { "epoch": 0.9669731018045625, "grad_norm": 1.7684545516967773, "learning_rate": 9.447800146400297e-06, "loss": 0.8833, "mean_token_accuracy": 0.7270847260951996, "step": 1420 }, { "epoch": 0.9676540687776642, "grad_norm": 1.5930824279785156, "learning_rate": 9.44677062790554e-06, "loss": 1.0944, "mean_token_accuracy": 0.6758308112621307, "step": 1421 }, { "epoch": 0.9683350357507661, "grad_norm": 1.7869551181793213, "learning_rate": 9.4457402067904e-06, "loss": 0.9716, "mean_token_accuracy": 0.6855579316616058, "step": 1422 }, { "epoch": 0.9690160027238679, "grad_norm": 1.7898876667022705, "learning_rate": 9.444708883264036e-06, "loss": 0.9006, "mean_token_accuracy": 0.7248052060604095, "step": 1423 }, { "epoch": 0.9696969696969697, "grad_norm": 1.757073998451233, "learning_rate": 9.443676657535792e-06, "loss": 0.8794, "mean_token_accuracy": 0.7217016220092773, "step": 1424 }, { "epoch": 0.9703779366700716, "grad_norm": 1.735515832901001, "learning_rate": 9.44264352981519e-06, "loss": 0.9616, "mean_token_accuracy": 0.710945725440979, "step": 1425 }, { "epoch": 0.9710589036431733, "grad_norm": 1.758655071258545, "learning_rate": 9.441609500311936e-06, "loss": 0.9238, "mean_token_accuracy": 0.7131350040435791, "step": 1426 }, { "epoch": 0.9717398706162751, "grad_norm": 1.9050201177597046, "learning_rate": 9.440574569235925e-06, "loss": 0.8875, "mean_token_accuracy": 0.7326708137989044, "step": 1427 }, { "epoch": 0.9724208375893769, "grad_norm": 1.6968010663986206, "learning_rate": 9.439538736797227e-06, "loss": 0.9775, "mean_token_accuracy": 0.7064845561981201, "step": 1428 }, { "epoch": 0.9731018045624787, "grad_norm": 1.7630928754806519, "learning_rate": 9.438502003206103e-06, "loss": 0.9041, "mean_token_accuracy": 0.7243589162826538, "step": 1429 }, { "epoch": 0.9737827715355806, "grad_norm": 1.9552353620529175, "learning_rate": 9.43746436867299e-06, "loss": 0.773, "mean_token_accuracy": 0.7590298354625702, "step": 1430 }, { "epoch": 0.9744637385086823, "grad_norm": 1.633804440498352, "learning_rate": 9.436425833408509e-06, "loss": 1.0393, "mean_token_accuracy": 0.6936219036579132, "step": 1431 }, { "epoch": 0.9751447054817841, "grad_norm": 1.6176239252090454, "learning_rate": 9.435386397623469e-06, "loss": 1.0389, "mean_token_accuracy": 0.6789788007736206, "step": 1432 }, { "epoch": 0.9758256724548859, "grad_norm": 1.7643330097198486, "learning_rate": 9.434346061528856e-06, "loss": 0.8286, "mean_token_accuracy": 0.7505448162555695, "step": 1433 }, { "epoch": 0.9765066394279878, "grad_norm": 1.7557414770126343, "learning_rate": 9.43330482533584e-06, "loss": 0.9286, "mean_token_accuracy": 0.7153541743755341, "step": 1434 }, { "epoch": 0.9771876064010896, "grad_norm": 1.7853325605392456, "learning_rate": 9.432262689255777e-06, "loss": 0.9104, "mean_token_accuracy": 0.7236232459545135, "step": 1435 }, { "epoch": 0.9778685733741913, "grad_norm": 1.850640892982483, "learning_rate": 9.4312196535002e-06, "loss": 0.8954, "mean_token_accuracy": 0.7079088687896729, "step": 1436 }, { "epoch": 0.9785495403472931, "grad_norm": 1.7933961153030396, "learning_rate": 9.430175718280832e-06, "loss": 0.9223, "mean_token_accuracy": 0.7195059061050415, "step": 1437 }, { "epoch": 0.979230507320395, "grad_norm": 1.5514497756958008, "learning_rate": 9.429130883809572e-06, "loss": 1.09, "mean_token_accuracy": 0.6641381084918976, "step": 1438 }, { "epoch": 0.9799114742934968, "grad_norm": 1.6370185613632202, "learning_rate": 9.428085150298505e-06, "loss": 0.963, "mean_token_accuracy": 0.7059635519981384, "step": 1439 }, { "epoch": 0.9805924412665986, "grad_norm": 1.672754168510437, "learning_rate": 9.427038517959897e-06, "loss": 0.8732, "mean_token_accuracy": 0.7399011850357056, "step": 1440 }, { "epoch": 0.9812734082397003, "grad_norm": 1.5920825004577637, "learning_rate": 9.425990987006196e-06, "loss": 0.9735, "mean_token_accuracy": 0.6993723511695862, "step": 1441 }, { "epoch": 0.9819543752128022, "grad_norm": 1.7651885747909546, "learning_rate": 9.424942557650035e-06, "loss": 1.0049, "mean_token_accuracy": 0.7050550580024719, "step": 1442 }, { "epoch": 0.982635342185904, "grad_norm": 1.8577635288238525, "learning_rate": 9.423893230104226e-06, "loss": 0.9105, "mean_token_accuracy": 0.7006041407585144, "step": 1443 }, { "epoch": 0.9833163091590058, "grad_norm": 1.946603536605835, "learning_rate": 9.422843004581765e-06, "loss": 0.9835, "mean_token_accuracy": 0.6994627118110657, "step": 1444 }, { "epoch": 0.9839972761321076, "grad_norm": 1.6945401430130005, "learning_rate": 9.421791881295832e-06, "loss": 0.8804, "mean_token_accuracy": 0.6926760673522949, "step": 1445 }, { "epoch": 0.9846782431052093, "grad_norm": 1.8106458187103271, "learning_rate": 9.420739860459786e-06, "loss": 0.8679, "mean_token_accuracy": 0.7198711037635803, "step": 1446 }, { "epoch": 0.9853592100783112, "grad_norm": 1.673215627670288, "learning_rate": 9.419686942287171e-06, "loss": 0.9088, "mean_token_accuracy": 0.7161553502082825, "step": 1447 }, { "epoch": 0.986040177051413, "grad_norm": 1.6412218809127808, "learning_rate": 9.418633126991713e-06, "loss": 1.0405, "mean_token_accuracy": 0.6950632333755493, "step": 1448 }, { "epoch": 0.9867211440245148, "grad_norm": 1.7640122175216675, "learning_rate": 9.417578414787316e-06, "loss": 0.9054, "mean_token_accuracy": 0.721777468919754, "step": 1449 }, { "epoch": 0.9874021109976167, "grad_norm": 1.9882481098175049, "learning_rate": 9.416522805888072e-06, "loss": 0.8131, "mean_token_accuracy": 0.7488832175731659, "step": 1450 }, { "epoch": 0.9880830779707184, "grad_norm": 1.8828333616256714, "learning_rate": 9.415466300508247e-06, "loss": 0.8889, "mean_token_accuracy": 0.7287221550941467, "step": 1451 }, { "epoch": 0.9887640449438202, "grad_norm": 1.4910868406295776, "learning_rate": 9.414408898862298e-06, "loss": 0.9936, "mean_token_accuracy": 0.6912838816642761, "step": 1452 }, { "epoch": 0.989445011916922, "grad_norm": 1.9933604001998901, "learning_rate": 9.41335060116486e-06, "loss": 0.8934, "mean_token_accuracy": 0.7303038835525513, "step": 1453 }, { "epoch": 0.9901259788900239, "grad_norm": 1.893162727355957, "learning_rate": 9.412291407630749e-06, "loss": 0.8777, "mean_token_accuracy": 0.7321167290210724, "step": 1454 }, { "epoch": 0.9908069458631257, "grad_norm": 1.5464998483657837, "learning_rate": 9.411231318474963e-06, "loss": 0.9151, "mean_token_accuracy": 0.7189997732639313, "step": 1455 }, { "epoch": 0.9914879128362274, "grad_norm": 1.6364637613296509, "learning_rate": 9.410170333912683e-06, "loss": 0.9043, "mean_token_accuracy": 0.7263066470623016, "step": 1456 }, { "epoch": 0.9921688798093292, "grad_norm": 1.8725471496582031, "learning_rate": 9.409108454159272e-06, "loss": 0.912, "mean_token_accuracy": 0.7348898649215698, "step": 1457 }, { "epoch": 0.992849846782431, "grad_norm": 1.9037343263626099, "learning_rate": 9.408045679430274e-06, "loss": 0.8261, "mean_token_accuracy": 0.7453885674476624, "step": 1458 }, { "epoch": 0.9935308137555329, "grad_norm": 1.6754775047302246, "learning_rate": 9.406982009941413e-06, "loss": 0.9346, "mean_token_accuracy": 0.7107999622821808, "step": 1459 }, { "epoch": 0.9942117807286347, "grad_norm": 1.7819559574127197, "learning_rate": 9.405917445908595e-06, "loss": 0.9092, "mean_token_accuracy": 0.7170052826404572, "step": 1460 }, { "epoch": 0.9948927477017364, "grad_norm": 1.7203037738800049, "learning_rate": 9.404851987547913e-06, "loss": 0.9634, "mean_token_accuracy": 0.7097218930721283, "step": 1461 }, { "epoch": 0.9955737146748382, "grad_norm": 1.7841590642929077, "learning_rate": 9.403785635075634e-06, "loss": 0.9213, "mean_token_accuracy": 0.7213341593742371, "step": 1462 }, { "epoch": 0.9962546816479401, "grad_norm": 1.9072598218917847, "learning_rate": 9.402718388708212e-06, "loss": 0.9663, "mean_token_accuracy": 0.6949611306190491, "step": 1463 }, { "epoch": 0.9969356486210419, "grad_norm": 1.5994915962219238, "learning_rate": 9.401650248662278e-06, "loss": 1.0211, "mean_token_accuracy": 0.677570104598999, "step": 1464 }, { "epoch": 0.9976166155941437, "grad_norm": 1.6920274496078491, "learning_rate": 9.400581215154647e-06, "loss": 0.944, "mean_token_accuracy": 0.6925908327102661, "step": 1465 }, { "epoch": 0.9982975825672454, "grad_norm": 1.6356055736541748, "learning_rate": 9.399511288402318e-06, "loss": 0.9578, "mean_token_accuracy": 0.7075184285640717, "step": 1466 }, { "epoch": 0.9989785495403473, "grad_norm": 1.7898842096328735, "learning_rate": 9.398440468622465e-06, "loss": 0.9047, "mean_token_accuracy": 0.7111682891845703, "step": 1467 }, { "epoch": 0.9996595165134491, "grad_norm": 1.684064269065857, "learning_rate": 9.397368756032445e-06, "loss": 0.9774, "mean_token_accuracy": 0.6897006630897522, "step": 1468 }, { "epoch": 1.0, "grad_norm": 2.5639841556549072, "learning_rate": 9.396296150849804e-06, "loss": 0.8759, "mean_token_accuracy": 0.7297232151031494, "step": 1469 }, { "epoch": 1.0006809669731018, "grad_norm": 2.0237302780151367, "learning_rate": 9.395222653292258e-06, "loss": 0.8378, "mean_token_accuracy": 0.7197279930114746, "step": 1470 }, { "epoch": 1.0013619339462037, "grad_norm": 1.6518430709838867, "learning_rate": 9.39414826357771e-06, "loss": 0.7437, "mean_token_accuracy": 0.7423594295978546, "step": 1471 }, { "epoch": 1.0020429009193055, "grad_norm": 1.5960484743118286, "learning_rate": 9.393072981924245e-06, "loss": 0.8444, "mean_token_accuracy": 0.7318470180034637, "step": 1472 }, { "epoch": 1.0027238678924073, "grad_norm": 1.7186223268508911, "learning_rate": 9.391996808550126e-06, "loss": 0.7428, "mean_token_accuracy": 0.756513237953186, "step": 1473 }, { "epoch": 1.0034048348655091, "grad_norm": 1.5017465353012085, "learning_rate": 9.390919743673796e-06, "loss": 0.6829, "mean_token_accuracy": 0.7812684178352356, "step": 1474 }, { "epoch": 1.0040858018386107, "grad_norm": 1.4585151672363281, "learning_rate": 9.389841787513885e-06, "loss": 0.6868, "mean_token_accuracy": 0.7684345245361328, "step": 1475 }, { "epoch": 1.0047667688117126, "grad_norm": 1.8864206075668335, "learning_rate": 9.3887629402892e-06, "loss": 0.6661, "mean_token_accuracy": 0.7786177098751068, "step": 1476 }, { "epoch": 1.0054477357848144, "grad_norm": 1.6847424507141113, "learning_rate": 9.387683202218725e-06, "loss": 0.8267, "mean_token_accuracy": 0.741792619228363, "step": 1477 }, { "epoch": 1.0061287027579162, "grad_norm": 1.6311753988265991, "learning_rate": 9.386602573521632e-06, "loss": 0.7952, "mean_token_accuracy": 0.7219721674919128, "step": 1478 }, { "epoch": 1.006809669731018, "grad_norm": 1.9457333087921143, "learning_rate": 9.385521054417272e-06, "loss": 0.7683, "mean_token_accuracy": 0.7400089502334595, "step": 1479 }, { "epoch": 1.0074906367041199, "grad_norm": 1.7204160690307617, "learning_rate": 9.384438645125172e-06, "loss": 0.7899, "mean_token_accuracy": 0.7229034900665283, "step": 1480 }, { "epoch": 1.0081716036772217, "grad_norm": 1.6766293048858643, "learning_rate": 9.383355345865046e-06, "loss": 0.6291, "mean_token_accuracy": 0.7959899008274078, "step": 1481 }, { "epoch": 1.0088525706503235, "grad_norm": 1.8749576807022095, "learning_rate": 9.382271156856784e-06, "loss": 0.7218, "mean_token_accuracy": 0.7669570744037628, "step": 1482 }, { "epoch": 1.0095335376234253, "grad_norm": 1.8255558013916016, "learning_rate": 9.381186078320457e-06, "loss": 0.6721, "mean_token_accuracy": 0.7803369164466858, "step": 1483 }, { "epoch": 1.0102145045965272, "grad_norm": 1.8329168558120728, "learning_rate": 9.38010011047632e-06, "loss": 0.8002, "mean_token_accuracy": 0.7288993299007416, "step": 1484 }, { "epoch": 1.0108954715696288, "grad_norm": 1.7767970561981201, "learning_rate": 9.379013253544807e-06, "loss": 0.6954, "mean_token_accuracy": 0.7734760046005249, "step": 1485 }, { "epoch": 1.0115764385427306, "grad_norm": 1.7294045686721802, "learning_rate": 9.37792550774653e-06, "loss": 0.815, "mean_token_accuracy": 0.7313277125358582, "step": 1486 }, { "epoch": 1.0122574055158324, "grad_norm": 1.6139864921569824, "learning_rate": 9.376836873302285e-06, "loss": 0.7622, "mean_token_accuracy": 0.7471030950546265, "step": 1487 }, { "epoch": 1.0129383724889343, "grad_norm": 1.8537720441818237, "learning_rate": 9.375747350433044e-06, "loss": 0.5861, "mean_token_accuracy": 0.8043071627616882, "step": 1488 }, { "epoch": 1.013619339462036, "grad_norm": 1.8860059976577759, "learning_rate": 9.374656939359964e-06, "loss": 0.7473, "mean_token_accuracy": 0.7497865259647369, "step": 1489 }, { "epoch": 1.014300306435138, "grad_norm": 1.784018635749817, "learning_rate": 9.37356564030438e-06, "loss": 0.8403, "mean_token_accuracy": 0.7333140671253204, "step": 1490 }, { "epoch": 1.0149812734082397, "grad_norm": 2.0446290969848633, "learning_rate": 9.372473453487808e-06, "loss": 0.6696, "mean_token_accuracy": 0.7850201427936554, "step": 1491 }, { "epoch": 1.0156622403813416, "grad_norm": 1.7837902307510376, "learning_rate": 9.371380379131942e-06, "loss": 0.6753, "mean_token_accuracy": 0.7895945012569427, "step": 1492 }, { "epoch": 1.0163432073544434, "grad_norm": 1.7967807054519653, "learning_rate": 9.37028641745866e-06, "loss": 0.6872, "mean_token_accuracy": 0.7719338238239288, "step": 1493 }, { "epoch": 1.0170241743275452, "grad_norm": 1.9302057027816772, "learning_rate": 9.369191568690017e-06, "loss": 0.8428, "mean_token_accuracy": 0.7069588303565979, "step": 1494 }, { "epoch": 1.0177051413006468, "grad_norm": 1.6871150732040405, "learning_rate": 9.36809583304825e-06, "loss": 0.916, "mean_token_accuracy": 0.7209416329860687, "step": 1495 }, { "epoch": 1.0183861082737486, "grad_norm": 1.9317740201950073, "learning_rate": 9.366999210755774e-06, "loss": 0.6576, "mean_token_accuracy": 0.7717374265193939, "step": 1496 }, { "epoch": 1.0190670752468505, "grad_norm": 1.6651209592819214, "learning_rate": 9.365901702035183e-06, "loss": 0.7415, "mean_token_accuracy": 0.7447335422039032, "step": 1497 }, { "epoch": 1.0197480422199523, "grad_norm": 1.7661545276641846, "learning_rate": 9.364803307109258e-06, "loss": 0.8456, "mean_token_accuracy": 0.7389750182628632, "step": 1498 }, { "epoch": 1.0204290091930541, "grad_norm": 1.7154121398925781, "learning_rate": 9.36370402620095e-06, "loss": 0.7974, "mean_token_accuracy": 0.7401939332485199, "step": 1499 }, { "epoch": 1.021109976166156, "grad_norm": 1.7382234334945679, "learning_rate": 9.362603859533401e-06, "loss": 0.7248, "mean_token_accuracy": 0.7474386096000671, "step": 1500 }, { "epoch": 1.0217909431392578, "grad_norm": 1.6274300813674927, "learning_rate": 9.361502807329918e-06, "loss": 0.7538, "mean_token_accuracy": 0.748777836561203, "step": 1501 }, { "epoch": 1.0224719101123596, "grad_norm": 1.6125389337539673, "learning_rate": 9.360400869814004e-06, "loss": 0.8044, "mean_token_accuracy": 0.7323842346668243, "step": 1502 }, { "epoch": 1.0231528770854614, "grad_norm": 1.9158130884170532, "learning_rate": 9.359298047209329e-06, "loss": 0.8373, "mean_token_accuracy": 0.7314241826534271, "step": 1503 }, { "epoch": 1.0238338440585633, "grad_norm": 1.5729140043258667, "learning_rate": 9.358194339739748e-06, "loss": 0.7317, "mean_token_accuracy": 0.7430703639984131, "step": 1504 }, { "epoch": 1.0245148110316649, "grad_norm": 1.836482048034668, "learning_rate": 9.357089747629298e-06, "loss": 0.6877, "mean_token_accuracy": 0.7779473066329956, "step": 1505 }, { "epoch": 1.0251957780047667, "grad_norm": 1.6471978425979614, "learning_rate": 9.355984271102191e-06, "loss": 0.7802, "mean_token_accuracy": 0.7417094111442566, "step": 1506 }, { "epoch": 1.0258767449778685, "grad_norm": 1.6099820137023926, "learning_rate": 9.35487791038282e-06, "loss": 0.5883, "mean_token_accuracy": 0.8000064492225647, "step": 1507 }, { "epoch": 1.0265577119509703, "grad_norm": 1.7541922330856323, "learning_rate": 9.35377066569576e-06, "loss": 0.6352, "mean_token_accuracy": 0.7891330718994141, "step": 1508 }, { "epoch": 1.0272386789240722, "grad_norm": 1.8489017486572266, "learning_rate": 9.35266253726576e-06, "loss": 0.6542, "mean_token_accuracy": 0.7717653512954712, "step": 1509 }, { "epoch": 1.027919645897174, "grad_norm": 1.8463900089263916, "learning_rate": 9.351553525317754e-06, "loss": 0.671, "mean_token_accuracy": 0.7930231392383575, "step": 1510 }, { "epoch": 1.0286006128702758, "grad_norm": 1.7078421115875244, "learning_rate": 9.350443630076853e-06, "loss": 0.776, "mean_token_accuracy": 0.7419396042823792, "step": 1511 }, { "epoch": 1.0292815798433776, "grad_norm": 1.7840994596481323, "learning_rate": 9.349332851768346e-06, "loss": 0.7183, "mean_token_accuracy": 0.7551460564136505, "step": 1512 }, { "epoch": 1.0299625468164795, "grad_norm": 1.8586567640304565, "learning_rate": 9.348221190617703e-06, "loss": 0.7, "mean_token_accuracy": 0.7452987134456635, "step": 1513 }, { "epoch": 1.0306435137895813, "grad_norm": 1.7037534713745117, "learning_rate": 9.347108646850572e-06, "loss": 0.7937, "mean_token_accuracy": 0.747356116771698, "step": 1514 }, { "epoch": 1.031324480762683, "grad_norm": 1.625988483428955, "learning_rate": 9.345995220692782e-06, "loss": 0.7582, "mean_token_accuracy": 0.7518436908721924, "step": 1515 }, { "epoch": 1.0320054477357847, "grad_norm": 1.7044615745544434, "learning_rate": 9.34488091237034e-06, "loss": 0.7104, "mean_token_accuracy": 0.7699385285377502, "step": 1516 }, { "epoch": 1.0326864147088866, "grad_norm": 1.8133108615875244, "learning_rate": 9.343765722109431e-06, "loss": 0.718, "mean_token_accuracy": 0.7654089331626892, "step": 1517 }, { "epoch": 1.0333673816819884, "grad_norm": 1.7529785633087158, "learning_rate": 9.34264965013642e-06, "loss": 0.8717, "mean_token_accuracy": 0.6864257454872131, "step": 1518 }, { "epoch": 1.0340483486550902, "grad_norm": 1.771345853805542, "learning_rate": 9.341532696677851e-06, "loss": 0.7436, "mean_token_accuracy": 0.7519786953926086, "step": 1519 }, { "epoch": 1.034729315628192, "grad_norm": 1.8184717893600464, "learning_rate": 9.340414861960449e-06, "loss": 0.8008, "mean_token_accuracy": 0.748275637626648, "step": 1520 }, { "epoch": 1.0354102826012939, "grad_norm": 1.6916621923446655, "learning_rate": 9.339296146211114e-06, "loss": 0.7699, "mean_token_accuracy": 0.7324450016021729, "step": 1521 }, { "epoch": 1.0360912495743957, "grad_norm": 1.8414846658706665, "learning_rate": 9.338176549656928e-06, "loss": 0.8249, "mean_token_accuracy": 0.7331790626049042, "step": 1522 }, { "epoch": 1.0367722165474975, "grad_norm": 1.924532413482666, "learning_rate": 9.337056072525148e-06, "loss": 0.6977, "mean_token_accuracy": 0.7679703235626221, "step": 1523 }, { "epoch": 1.0374531835205993, "grad_norm": 1.6999350786209106, "learning_rate": 9.335934715043216e-06, "loss": 0.7704, "mean_token_accuracy": 0.743678092956543, "step": 1524 }, { "epoch": 1.038134150493701, "grad_norm": 1.7321314811706543, "learning_rate": 9.334812477438744e-06, "loss": 0.7212, "mean_token_accuracy": 0.7527982890605927, "step": 1525 }, { "epoch": 1.0388151174668028, "grad_norm": 1.6982226371765137, "learning_rate": 9.33368935993953e-06, "loss": 0.8378, "mean_token_accuracy": 0.729876309633255, "step": 1526 }, { "epoch": 1.0394960844399046, "grad_norm": 1.886973261833191, "learning_rate": 9.332565362773551e-06, "loss": 0.7298, "mean_token_accuracy": 0.7642487585544586, "step": 1527 }, { "epoch": 1.0401770514130064, "grad_norm": 1.6399240493774414, "learning_rate": 9.331440486168954e-06, "loss": 0.7126, "mean_token_accuracy": 0.7664419114589691, "step": 1528 }, { "epoch": 1.0408580183861083, "grad_norm": 1.7787235975265503, "learning_rate": 9.330314730354075e-06, "loss": 0.8424, "mean_token_accuracy": 0.7220813035964966, "step": 1529 }, { "epoch": 1.04153898535921, "grad_norm": 1.8022748231887817, "learning_rate": 9.329188095557421e-06, "loss": 0.6526, "mean_token_accuracy": 0.7874112725257874, "step": 1530 }, { "epoch": 1.042219952332312, "grad_norm": 1.7300959825515747, "learning_rate": 9.328060582007683e-06, "loss": 0.6796, "mean_token_accuracy": 0.763645350933075, "step": 1531 }, { "epoch": 1.0429009193054137, "grad_norm": 1.6946189403533936, "learning_rate": 9.326932189933724e-06, "loss": 0.6989, "mean_token_accuracy": 0.7587964832782745, "step": 1532 }, { "epoch": 1.0435818862785156, "grad_norm": 1.7149584293365479, "learning_rate": 9.32580291956459e-06, "loss": 0.7533, "mean_token_accuracy": 0.743162989616394, "step": 1533 }, { "epoch": 1.0442628532516174, "grad_norm": 1.7092517614364624, "learning_rate": 9.324672771129506e-06, "loss": 0.812, "mean_token_accuracy": 0.7442255020141602, "step": 1534 }, { "epoch": 1.0449438202247192, "grad_norm": 1.758368968963623, "learning_rate": 9.323541744857869e-06, "loss": 0.6406, "mean_token_accuracy": 0.7746928930282593, "step": 1535 }, { "epoch": 1.0456247871978208, "grad_norm": 1.8917335271835327, "learning_rate": 9.322409840979262e-06, "loss": 0.6618, "mean_token_accuracy": 0.7663267254829407, "step": 1536 }, { "epoch": 1.0463057541709226, "grad_norm": 1.9135643243789673, "learning_rate": 9.321277059723443e-06, "loss": 0.5758, "mean_token_accuracy": 0.8113524913787842, "step": 1537 }, { "epoch": 1.0469867211440245, "grad_norm": 1.9711480140686035, "learning_rate": 9.320143401320346e-06, "loss": 0.6875, "mean_token_accuracy": 0.766429603099823, "step": 1538 }, { "epoch": 1.0476676881171263, "grad_norm": 1.9847933053970337, "learning_rate": 9.319008866000085e-06, "loss": 0.7185, "mean_token_accuracy": 0.7580244243144989, "step": 1539 }, { "epoch": 1.0483486550902281, "grad_norm": 1.7126102447509766, "learning_rate": 9.317873453992952e-06, "loss": 0.7801, "mean_token_accuracy": 0.7372893393039703, "step": 1540 }, { "epoch": 1.04902962206333, "grad_norm": 1.573413610458374, "learning_rate": 9.316737165529418e-06, "loss": 0.8959, "mean_token_accuracy": 0.7181865870952606, "step": 1541 }, { "epoch": 1.0497105890364318, "grad_norm": 1.6708916425704956, "learning_rate": 9.315600000840129e-06, "loss": 0.7487, "mean_token_accuracy": 0.762346625328064, "step": 1542 }, { "epoch": 1.0503915560095336, "grad_norm": 1.873494029045105, "learning_rate": 9.314461960155909e-06, "loss": 0.6894, "mean_token_accuracy": 0.7767453193664551, "step": 1543 }, { "epoch": 1.0510725229826354, "grad_norm": 1.6392396688461304, "learning_rate": 9.313323043707765e-06, "loss": 0.7509, "mean_token_accuracy": 0.7514075040817261, "step": 1544 }, { "epoch": 1.0517534899557373, "grad_norm": 1.6885908842086792, "learning_rate": 9.312183251726876e-06, "loss": 0.7447, "mean_token_accuracy": 0.7468406856060028, "step": 1545 }, { "epoch": 1.0524344569288389, "grad_norm": 1.6594871282577515, "learning_rate": 9.311042584444601e-06, "loss": 0.7003, "mean_token_accuracy": 0.7614539563655853, "step": 1546 }, { "epoch": 1.0531154239019407, "grad_norm": 1.6411163806915283, "learning_rate": 9.309901042092477e-06, "loss": 0.8531, "mean_token_accuracy": 0.7360460758209229, "step": 1547 }, { "epoch": 1.0537963908750425, "grad_norm": 1.8711851835250854, "learning_rate": 9.308758624902216e-06, "loss": 0.7399, "mean_token_accuracy": 0.7615758776664734, "step": 1548 }, { "epoch": 1.0544773578481443, "grad_norm": 1.6329760551452637, "learning_rate": 9.307615333105713e-06, "loss": 0.7856, "mean_token_accuracy": 0.7388091683387756, "step": 1549 }, { "epoch": 1.0551583248212462, "grad_norm": 1.6575957536697388, "learning_rate": 9.306471166935036e-06, "loss": 0.7808, "mean_token_accuracy": 0.7290811836719513, "step": 1550 }, { "epoch": 1.055839291794348, "grad_norm": 1.561772108078003, "learning_rate": 9.30532612662243e-06, "loss": 0.8034, "mean_token_accuracy": 0.713568389415741, "step": 1551 }, { "epoch": 1.0565202587674498, "grad_norm": 1.6982227563858032, "learning_rate": 9.304180212400322e-06, "loss": 0.8048, "mean_token_accuracy": 0.7330443859100342, "step": 1552 }, { "epoch": 1.0572012257405516, "grad_norm": 1.6699105501174927, "learning_rate": 9.30303342450131e-06, "loss": 0.7878, "mean_token_accuracy": 0.7149074077606201, "step": 1553 }, { "epoch": 1.0578821927136535, "grad_norm": 1.9948756694793701, "learning_rate": 9.301885763158173e-06, "loss": 0.7031, "mean_token_accuracy": 0.7488657236099243, "step": 1554 }, { "epoch": 1.0585631596867553, "grad_norm": 1.802850604057312, "learning_rate": 9.300737228603874e-06, "loss": 0.6999, "mean_token_accuracy": 0.7637290358543396, "step": 1555 }, { "epoch": 1.059244126659857, "grad_norm": 1.9225070476531982, "learning_rate": 9.299587821071536e-06, "loss": 0.5693, "mean_token_accuracy": 0.8116478621959686, "step": 1556 }, { "epoch": 1.0599250936329587, "grad_norm": 2.050874948501587, "learning_rate": 9.298437540794479e-06, "loss": 0.627, "mean_token_accuracy": 0.8015241622924805, "step": 1557 }, { "epoch": 1.0606060606060606, "grad_norm": 1.758666753768921, "learning_rate": 9.297286388006184e-06, "loss": 0.6544, "mean_token_accuracy": 0.7957689464092255, "step": 1558 }, { "epoch": 1.0612870275791624, "grad_norm": 1.7303470373153687, "learning_rate": 9.296134362940318e-06, "loss": 0.6667, "mean_token_accuracy": 0.7845681607723236, "step": 1559 }, { "epoch": 1.0619679945522642, "grad_norm": 1.7246448993682861, "learning_rate": 9.294981465830724e-06, "loss": 0.6962, "mean_token_accuracy": 0.7753237783908844, "step": 1560 }, { "epoch": 1.062648961525366, "grad_norm": 1.8388001918792725, "learning_rate": 9.293827696911419e-06, "loss": 0.7898, "mean_token_accuracy": 0.7271365225315094, "step": 1561 }, { "epoch": 1.0633299284984679, "grad_norm": 1.772353172302246, "learning_rate": 9.292673056416602e-06, "loss": 0.6914, "mean_token_accuracy": 0.7739920020103455, "step": 1562 }, { "epoch": 1.0640108954715697, "grad_norm": 1.8098013401031494, "learning_rate": 9.291517544580642e-06, "loss": 0.6471, "mean_token_accuracy": 0.7720178365707397, "step": 1563 }, { "epoch": 1.0646918624446715, "grad_norm": 1.6944950819015503, "learning_rate": 9.290361161638093e-06, "loss": 0.7614, "mean_token_accuracy": 0.7405329048633575, "step": 1564 }, { "epoch": 1.0653728294177733, "grad_norm": 1.8462865352630615, "learning_rate": 9.289203907823676e-06, "loss": 0.637, "mean_token_accuracy": 0.797063946723938, "step": 1565 }, { "epoch": 1.066053796390875, "grad_norm": 1.9285938739776611, "learning_rate": 9.288045783372299e-06, "loss": 0.7942, "mean_token_accuracy": 0.7449604570865631, "step": 1566 }, { "epoch": 1.0667347633639768, "grad_norm": 1.5260117053985596, "learning_rate": 9.286886788519041e-06, "loss": 0.928, "mean_token_accuracy": 0.7120823264122009, "step": 1567 }, { "epoch": 1.0674157303370786, "grad_norm": 1.774540901184082, "learning_rate": 9.285726923499156e-06, "loss": 0.7474, "mean_token_accuracy": 0.7552771866321564, "step": 1568 }, { "epoch": 1.0680966973101804, "grad_norm": 1.7216379642486572, "learning_rate": 9.284566188548082e-06, "loss": 0.7168, "mean_token_accuracy": 0.7724968194961548, "step": 1569 }, { "epoch": 1.0687776642832822, "grad_norm": 1.69661283493042, "learning_rate": 9.283404583901423e-06, "loss": 0.7442, "mean_token_accuracy": 0.7594690024852753, "step": 1570 }, { "epoch": 1.069458631256384, "grad_norm": 1.8947396278381348, "learning_rate": 9.282242109794968e-06, "loss": 0.7452, "mean_token_accuracy": 0.7487689256668091, "step": 1571 }, { "epoch": 1.070139598229486, "grad_norm": 1.792999505996704, "learning_rate": 9.281078766464682e-06, "loss": 0.6779, "mean_token_accuracy": 0.7498167753219604, "step": 1572 }, { "epoch": 1.0708205652025877, "grad_norm": 2.03233003616333, "learning_rate": 9.279914554146704e-06, "loss": 0.5561, "mean_token_accuracy": 0.8189840316772461, "step": 1573 }, { "epoch": 1.0715015321756896, "grad_norm": 1.7960734367370605, "learning_rate": 9.278749473077344e-06, "loss": 0.6773, "mean_token_accuracy": 0.7704671919345856, "step": 1574 }, { "epoch": 1.0721824991487914, "grad_norm": 1.7402386665344238, "learning_rate": 9.277583523493101e-06, "loss": 0.7311, "mean_token_accuracy": 0.761003851890564, "step": 1575 }, { "epoch": 1.072863466121893, "grad_norm": 1.774526834487915, "learning_rate": 9.276416705630641e-06, "loss": 0.7257, "mean_token_accuracy": 0.7353731095790863, "step": 1576 }, { "epoch": 1.0735444330949948, "grad_norm": 1.720393419265747, "learning_rate": 9.275249019726809e-06, "loss": 0.7076, "mean_token_accuracy": 0.7457854151725769, "step": 1577 }, { "epoch": 1.0742254000680966, "grad_norm": 1.7599313259124756, "learning_rate": 9.274080466018624e-06, "loss": 0.7931, "mean_token_accuracy": 0.7362770140171051, "step": 1578 }, { "epoch": 1.0749063670411985, "grad_norm": 2.022830009460449, "learning_rate": 9.272911044743283e-06, "loss": 0.7578, "mean_token_accuracy": 0.751206636428833, "step": 1579 }, { "epoch": 1.0755873340143003, "grad_norm": 1.5911816358566284, "learning_rate": 9.271740756138161e-06, "loss": 0.8937, "mean_token_accuracy": 0.7347811460494995, "step": 1580 }, { "epoch": 1.0762683009874021, "grad_norm": 1.7251553535461426, "learning_rate": 9.270569600440807e-06, "loss": 0.6277, "mean_token_accuracy": 0.7921494841575623, "step": 1581 }, { "epoch": 1.076949267960504, "grad_norm": 1.5936013460159302, "learning_rate": 9.269397577888945e-06, "loss": 0.8211, "mean_token_accuracy": 0.7322792112827301, "step": 1582 }, { "epoch": 1.0776302349336058, "grad_norm": 1.7832216024398804, "learning_rate": 9.268224688720475e-06, "loss": 0.6825, "mean_token_accuracy": 0.7672508955001831, "step": 1583 }, { "epoch": 1.0783112019067076, "grad_norm": 1.8482004404067993, "learning_rate": 9.267050933173475e-06, "loss": 0.7431, "mean_token_accuracy": 0.7550114989280701, "step": 1584 }, { "epoch": 1.0789921688798094, "grad_norm": 1.7614045143127441, "learning_rate": 9.265876311486202e-06, "loss": 0.7104, "mean_token_accuracy": 0.7717466056346893, "step": 1585 }, { "epoch": 1.0796731358529112, "grad_norm": 1.714539885520935, "learning_rate": 9.264700823897077e-06, "loss": 0.7561, "mean_token_accuracy": 0.7424389123916626, "step": 1586 }, { "epoch": 1.0803541028260129, "grad_norm": 1.8530720472335815, "learning_rate": 9.263524470644708e-06, "loss": 0.7814, "mean_token_accuracy": 0.7499648928642273, "step": 1587 }, { "epoch": 1.0810350697991147, "grad_norm": 1.7798383235931396, "learning_rate": 9.262347251967878e-06, "loss": 0.703, "mean_token_accuracy": 0.7718378305435181, "step": 1588 }, { "epoch": 1.0817160367722165, "grad_norm": 1.7468485832214355, "learning_rate": 9.261169168105539e-06, "loss": 0.6755, "mean_token_accuracy": 0.7740812301635742, "step": 1589 }, { "epoch": 1.0823970037453183, "grad_norm": 1.538449764251709, "learning_rate": 9.259990219296824e-06, "loss": 0.8106, "mean_token_accuracy": 0.7293069660663605, "step": 1590 }, { "epoch": 1.0830779707184202, "grad_norm": 1.7727200984954834, "learning_rate": 9.25881040578104e-06, "loss": 0.7588, "mean_token_accuracy": 0.7646459341049194, "step": 1591 }, { "epoch": 1.083758937691522, "grad_norm": 1.6496165990829468, "learning_rate": 9.257629727797666e-06, "loss": 0.7493, "mean_token_accuracy": 0.7442460358142853, "step": 1592 }, { "epoch": 1.0844399046646238, "grad_norm": 1.7600301504135132, "learning_rate": 9.256448185586367e-06, "loss": 0.6635, "mean_token_accuracy": 0.7641317844390869, "step": 1593 }, { "epoch": 1.0851208716377256, "grad_norm": 1.8424314260482788, "learning_rate": 9.255265779386968e-06, "loss": 0.7457, "mean_token_accuracy": 0.753203421831131, "step": 1594 }, { "epoch": 1.0858018386108275, "grad_norm": 2.00875186920166, "learning_rate": 9.254082509439485e-06, "loss": 0.7688, "mean_token_accuracy": 0.7432229220867157, "step": 1595 }, { "epoch": 1.086482805583929, "grad_norm": 1.7717180252075195, "learning_rate": 9.252898375984099e-06, "loss": 0.7876, "mean_token_accuracy": 0.7465777397155762, "step": 1596 }, { "epoch": 1.087163772557031, "grad_norm": 1.783960223197937, "learning_rate": 9.251713379261168e-06, "loss": 0.6961, "mean_token_accuracy": 0.7725102603435516, "step": 1597 }, { "epoch": 1.0878447395301327, "grad_norm": 2.074099063873291, "learning_rate": 9.25052751951123e-06, "loss": 0.6223, "mean_token_accuracy": 0.8022877871990204, "step": 1598 }, { "epoch": 1.0885257065032345, "grad_norm": 1.8769726753234863, "learning_rate": 9.24934079697499e-06, "loss": 0.6443, "mean_token_accuracy": 0.7771853804588318, "step": 1599 }, { "epoch": 1.0892066734763364, "grad_norm": 1.81005859375, "learning_rate": 9.248153211893339e-06, "loss": 0.6323, "mean_token_accuracy": 0.7912357151508331, "step": 1600 }, { "epoch": 1.0898876404494382, "grad_norm": 1.7516051530838013, "learning_rate": 9.246964764507333e-06, "loss": 0.7592, "mean_token_accuracy": 0.7594552338123322, "step": 1601 }, { "epoch": 1.09056860742254, "grad_norm": 1.8281399011611938, "learning_rate": 9.245775455058207e-06, "loss": 0.6812, "mean_token_accuracy": 0.7791709899902344, "step": 1602 }, { "epoch": 1.0912495743956419, "grad_norm": 1.7867846488952637, "learning_rate": 9.244585283787373e-06, "loss": 0.7471, "mean_token_accuracy": 0.7653115391731262, "step": 1603 }, { "epoch": 1.0919305413687437, "grad_norm": 1.7456731796264648, "learning_rate": 9.243394250936415e-06, "loss": 0.7288, "mean_token_accuracy": 0.7547584772109985, "step": 1604 }, { "epoch": 1.0926115083418455, "grad_norm": 1.8598979711532593, "learning_rate": 9.242202356747092e-06, "loss": 0.7519, "mean_token_accuracy": 0.754321426153183, "step": 1605 }, { "epoch": 1.0932924753149473, "grad_norm": 1.7798606157302856, "learning_rate": 9.24100960146134e-06, "loss": 0.7258, "mean_token_accuracy": 0.7536269426345825, "step": 1606 }, { "epoch": 1.093973442288049, "grad_norm": 1.6769988536834717, "learning_rate": 9.239815985321266e-06, "loss": 0.7898, "mean_token_accuracy": 0.7519420683383942, "step": 1607 }, { "epoch": 1.0946544092611508, "grad_norm": 1.7907606363296509, "learning_rate": 9.238621508569157e-06, "loss": 0.6202, "mean_token_accuracy": 0.8004415333271027, "step": 1608 }, { "epoch": 1.0953353762342526, "grad_norm": 1.7644190788269043, "learning_rate": 9.237426171447471e-06, "loss": 0.772, "mean_token_accuracy": 0.7440113127231598, "step": 1609 }, { "epoch": 1.0960163432073544, "grad_norm": 1.9621171951293945, "learning_rate": 9.236229974198841e-06, "loss": 0.6336, "mean_token_accuracy": 0.7974292933940887, "step": 1610 }, { "epoch": 1.0966973101804562, "grad_norm": 1.9772958755493164, "learning_rate": 9.235032917066077e-06, "loss": 0.6684, "mean_token_accuracy": 0.7867651283740997, "step": 1611 }, { "epoch": 1.097378277153558, "grad_norm": 1.7795015573501587, "learning_rate": 9.233835000292159e-06, "loss": 0.7103, "mean_token_accuracy": 0.7544711828231812, "step": 1612 }, { "epoch": 1.09805924412666, "grad_norm": 1.7249113321304321, "learning_rate": 9.232636224120246e-06, "loss": 0.7932, "mean_token_accuracy": 0.7161034345626831, "step": 1613 }, { "epoch": 1.0987402110997617, "grad_norm": 1.8614548444747925, "learning_rate": 9.231436588793667e-06, "loss": 0.7176, "mean_token_accuracy": 0.7617781162261963, "step": 1614 }, { "epoch": 1.0994211780728635, "grad_norm": 1.5727465152740479, "learning_rate": 9.230236094555933e-06, "loss": 0.7629, "mean_token_accuracy": 0.754871279001236, "step": 1615 }, { "epoch": 1.1001021450459654, "grad_norm": 1.7196457386016846, "learning_rate": 9.22903474165072e-06, "loss": 0.8498, "mean_token_accuracy": 0.7442688047885895, "step": 1616 }, { "epoch": 1.100783112019067, "grad_norm": 1.9090144634246826, "learning_rate": 9.227832530321883e-06, "loss": 0.6925, "mean_token_accuracy": 0.7632534205913544, "step": 1617 }, { "epoch": 1.1014640789921688, "grad_norm": 1.5784202814102173, "learning_rate": 9.226629460813452e-06, "loss": 0.8686, "mean_token_accuracy": 0.7350661158561707, "step": 1618 }, { "epoch": 1.1021450459652706, "grad_norm": 1.6122418642044067, "learning_rate": 9.22542553336963e-06, "loss": 0.7667, "mean_token_accuracy": 0.7675730884075165, "step": 1619 }, { "epoch": 1.1028260129383725, "grad_norm": 1.878004550933838, "learning_rate": 9.224220748234794e-06, "loss": 0.6251, "mean_token_accuracy": 0.7942375838756561, "step": 1620 }, { "epoch": 1.1035069799114743, "grad_norm": 1.8259024620056152, "learning_rate": 9.223015105653497e-06, "loss": 0.6869, "mean_token_accuracy": 0.7713882625102997, "step": 1621 }, { "epoch": 1.104187946884576, "grad_norm": 1.6730172634124756, "learning_rate": 9.221808605870462e-06, "loss": 0.7398, "mean_token_accuracy": 0.748197078704834, "step": 1622 }, { "epoch": 1.104868913857678, "grad_norm": 1.7784345149993896, "learning_rate": 9.220601249130588e-06, "loss": 0.6802, "mean_token_accuracy": 0.767779529094696, "step": 1623 }, { "epoch": 1.1055498808307798, "grad_norm": 1.8248170614242554, "learning_rate": 9.21939303567895e-06, "loss": 0.8019, "mean_token_accuracy": 0.7340686917304993, "step": 1624 }, { "epoch": 1.1062308478038816, "grad_norm": 1.74620521068573, "learning_rate": 9.218183965760794e-06, "loss": 0.7004, "mean_token_accuracy": 0.7624540328979492, "step": 1625 }, { "epoch": 1.1069118147769834, "grad_norm": 1.6386545896530151, "learning_rate": 9.216974039621544e-06, "loss": 0.7107, "mean_token_accuracy": 0.7704310119152069, "step": 1626 }, { "epoch": 1.107592781750085, "grad_norm": 1.7813504934310913, "learning_rate": 9.21576325750679e-06, "loss": 0.7954, "mean_token_accuracy": 0.7357569336891174, "step": 1627 }, { "epoch": 1.1082737487231868, "grad_norm": 1.5672457218170166, "learning_rate": 9.214551619662305e-06, "loss": 0.7636, "mean_token_accuracy": 0.7230709791183472, "step": 1628 }, { "epoch": 1.1089547156962887, "grad_norm": 1.9435734748840332, "learning_rate": 9.21333912633403e-06, "loss": 0.7549, "mean_token_accuracy": 0.7524317800998688, "step": 1629 }, { "epoch": 1.1096356826693905, "grad_norm": 1.7214710712432861, "learning_rate": 9.212125777768078e-06, "loss": 0.5905, "mean_token_accuracy": 0.8032031059265137, "step": 1630 }, { "epoch": 1.1103166496424923, "grad_norm": 1.8096989393234253, "learning_rate": 9.210911574210742e-06, "loss": 0.6878, "mean_token_accuracy": 0.7487408518791199, "step": 1631 }, { "epoch": 1.1109976166155942, "grad_norm": 1.7792199850082397, "learning_rate": 9.209696515908485e-06, "loss": 0.9409, "mean_token_accuracy": 0.7054384052753448, "step": 1632 }, { "epoch": 1.111678583588696, "grad_norm": 1.7773070335388184, "learning_rate": 9.208480603107942e-06, "loss": 0.7993, "mean_token_accuracy": 0.7323169112205505, "step": 1633 }, { "epoch": 1.1123595505617978, "grad_norm": 1.6520267724990845, "learning_rate": 9.207263836055923e-06, "loss": 0.7326, "mean_token_accuracy": 0.7600584328174591, "step": 1634 }, { "epoch": 1.1130405175348996, "grad_norm": 1.744375228881836, "learning_rate": 9.206046214999412e-06, "loss": 0.6576, "mean_token_accuracy": 0.7830063104629517, "step": 1635 }, { "epoch": 1.1137214845080015, "grad_norm": 1.7664602994918823, "learning_rate": 9.204827740185566e-06, "loss": 0.7127, "mean_token_accuracy": 0.7693211436271667, "step": 1636 }, { "epoch": 1.1144024514811033, "grad_norm": 1.9303375482559204, "learning_rate": 9.203608411861717e-06, "loss": 0.7803, "mean_token_accuracy": 0.7478379011154175, "step": 1637 }, { "epoch": 1.1150834184542049, "grad_norm": 1.853297472000122, "learning_rate": 9.202388230275365e-06, "loss": 0.7168, "mean_token_accuracy": 0.7590545415878296, "step": 1638 }, { "epoch": 1.1157643854273067, "grad_norm": 1.869146466255188, "learning_rate": 9.201167195674188e-06, "loss": 0.767, "mean_token_accuracy": 0.7588397264480591, "step": 1639 }, { "epoch": 1.1164453524004085, "grad_norm": 1.6446363925933838, "learning_rate": 9.199945308306037e-06, "loss": 0.8003, "mean_token_accuracy": 0.7200289368629456, "step": 1640 }, { "epoch": 1.1171263193735104, "grad_norm": 1.6132283210754395, "learning_rate": 9.198722568418933e-06, "loss": 0.8283, "mean_token_accuracy": 0.7096248269081116, "step": 1641 }, { "epoch": 1.1178072863466122, "grad_norm": 1.9261544942855835, "learning_rate": 9.19749897626107e-06, "loss": 0.7217, "mean_token_accuracy": 0.7621451318264008, "step": 1642 }, { "epoch": 1.118488253319714, "grad_norm": 1.8324838876724243, "learning_rate": 9.196274532080821e-06, "loss": 0.7449, "mean_token_accuracy": 0.7446739673614502, "step": 1643 }, { "epoch": 1.1191692202928158, "grad_norm": 1.6738505363464355, "learning_rate": 9.195049236126726e-06, "loss": 0.6992, "mean_token_accuracy": 0.7761678695678711, "step": 1644 }, { "epoch": 1.1198501872659177, "grad_norm": 1.745863437652588, "learning_rate": 9.193823088647498e-06, "loss": 0.7758, "mean_token_accuracy": 0.7480656206607819, "step": 1645 }, { "epoch": 1.1205311542390195, "grad_norm": 1.7119745016098022, "learning_rate": 9.192596089892027e-06, "loss": 0.7582, "mean_token_accuracy": 0.751409262418747, "step": 1646 }, { "epoch": 1.121212121212121, "grad_norm": 1.7870410680770874, "learning_rate": 9.191368240109374e-06, "loss": 0.7379, "mean_token_accuracy": 0.7650189697742462, "step": 1647 }, { "epoch": 1.121893088185223, "grad_norm": 1.6793134212493896, "learning_rate": 9.190139539548768e-06, "loss": 0.8318, "mean_token_accuracy": 0.7592829763889313, "step": 1648 }, { "epoch": 1.1225740551583248, "grad_norm": 1.7119742631912231, "learning_rate": 9.188909988459618e-06, "loss": 0.8467, "mean_token_accuracy": 0.7345155775547028, "step": 1649 }, { "epoch": 1.1232550221314266, "grad_norm": 1.8254917860031128, "learning_rate": 9.187679587091503e-06, "loss": 0.6553, "mean_token_accuracy": 0.7788478136062622, "step": 1650 }, { "epoch": 1.1239359891045284, "grad_norm": 1.856095790863037, "learning_rate": 9.186448335694172e-06, "loss": 0.6822, "mean_token_accuracy": 0.7737314105033875, "step": 1651 }, { "epoch": 1.1246169560776302, "grad_norm": 1.9040418863296509, "learning_rate": 9.185216234517546e-06, "loss": 0.6942, "mean_token_accuracy": 0.7643031179904938, "step": 1652 }, { "epoch": 1.125297923050732, "grad_norm": 1.849622130393982, "learning_rate": 9.183983283811729e-06, "loss": 0.649, "mean_token_accuracy": 0.7715820968151093, "step": 1653 }, { "epoch": 1.125978890023834, "grad_norm": 1.8212757110595703, "learning_rate": 9.18274948382698e-06, "loss": 0.6529, "mean_token_accuracy": 0.7827580273151398, "step": 1654 }, { "epoch": 1.1266598569969357, "grad_norm": 1.8000478744506836, "learning_rate": 9.181514834813747e-06, "loss": 0.652, "mean_token_accuracy": 0.7656265199184418, "step": 1655 }, { "epoch": 1.1273408239700375, "grad_norm": 1.7409135103225708, "learning_rate": 9.180279337022641e-06, "loss": 0.6446, "mean_token_accuracy": 0.7792328894138336, "step": 1656 }, { "epoch": 1.1280217909431394, "grad_norm": 1.9711140394210815, "learning_rate": 9.179042990704446e-06, "loss": 0.6531, "mean_token_accuracy": 0.772796630859375, "step": 1657 }, { "epoch": 1.128702757916241, "grad_norm": 1.7748358249664307, "learning_rate": 9.177805796110122e-06, "loss": 0.8388, "mean_token_accuracy": 0.7299023270606995, "step": 1658 }, { "epoch": 1.1293837248893428, "grad_norm": 1.667945146560669, "learning_rate": 9.176567753490795e-06, "loss": 0.798, "mean_token_accuracy": 0.7407715618610382, "step": 1659 }, { "epoch": 1.1300646918624446, "grad_norm": 1.8221325874328613, "learning_rate": 9.175328863097772e-06, "loss": 0.669, "mean_token_accuracy": 0.7793433666229248, "step": 1660 }, { "epoch": 1.1307456588355465, "grad_norm": 1.7944607734680176, "learning_rate": 9.174089125182526e-06, "loss": 0.8477, "mean_token_accuracy": 0.7296790182590485, "step": 1661 }, { "epoch": 1.1314266258086483, "grad_norm": 1.6887222528457642, "learning_rate": 9.1728485399967e-06, "loss": 0.6759, "mean_token_accuracy": 0.7731229662895203, "step": 1662 }, { "epoch": 1.13210759278175, "grad_norm": 1.6953223943710327, "learning_rate": 9.171607107792113e-06, "loss": 0.7743, "mean_token_accuracy": 0.7259563505649567, "step": 1663 }, { "epoch": 1.132788559754852, "grad_norm": 1.873039722442627, "learning_rate": 9.170364828820759e-06, "loss": 0.7082, "mean_token_accuracy": 0.7703516185283661, "step": 1664 }, { "epoch": 1.1334695267279538, "grad_norm": 1.6973975896835327, "learning_rate": 9.169121703334796e-06, "loss": 0.7133, "mean_token_accuracy": 0.754806786775589, "step": 1665 }, { "epoch": 1.1341504937010556, "grad_norm": 1.8627499341964722, "learning_rate": 9.167877731586559e-06, "loss": 0.6168, "mean_token_accuracy": 0.7969554364681244, "step": 1666 }, { "epoch": 1.1348314606741572, "grad_norm": 1.594435691833496, "learning_rate": 9.166632913828553e-06, "loss": 0.9524, "mean_token_accuracy": 0.7188757956027985, "step": 1667 }, { "epoch": 1.135512427647259, "grad_norm": 1.6940069198608398, "learning_rate": 9.165387250313455e-06, "loss": 0.6028, "mean_token_accuracy": 0.7677764296531677, "step": 1668 }, { "epoch": 1.1361933946203608, "grad_norm": 1.9061802625656128, "learning_rate": 9.164140741294116e-06, "loss": 0.834, "mean_token_accuracy": 0.730589747428894, "step": 1669 }, { "epoch": 1.1368743615934627, "grad_norm": 1.7641454935073853, "learning_rate": 9.162893387023554e-06, "loss": 0.9949, "mean_token_accuracy": 0.6989951729774475, "step": 1670 }, { "epoch": 1.1375553285665645, "grad_norm": 1.8391046524047852, "learning_rate": 9.161645187754963e-06, "loss": 0.6637, "mean_token_accuracy": 0.7769548296928406, "step": 1671 }, { "epoch": 1.1382362955396663, "grad_norm": 1.947153925895691, "learning_rate": 9.160396143741708e-06, "loss": 0.6296, "mean_token_accuracy": 0.7894448935985565, "step": 1672 }, { "epoch": 1.1389172625127681, "grad_norm": 1.8267202377319336, "learning_rate": 9.15914625523732e-06, "loss": 0.7332, "mean_token_accuracy": 0.7511670887470245, "step": 1673 }, { "epoch": 1.13959822948587, "grad_norm": 1.6801257133483887, "learning_rate": 9.157895522495508e-06, "loss": 0.7927, "mean_token_accuracy": 0.7052841782569885, "step": 1674 }, { "epoch": 1.1402791964589718, "grad_norm": 1.6461892127990723, "learning_rate": 9.156643945770149e-06, "loss": 0.7615, "mean_token_accuracy": 0.7493312656879425, "step": 1675 }, { "epoch": 1.1409601634320736, "grad_norm": 1.750585675239563, "learning_rate": 9.155391525315295e-06, "loss": 0.7422, "mean_token_accuracy": 0.7301076650619507, "step": 1676 }, { "epoch": 1.1416411304051755, "grad_norm": 1.7811098098754883, "learning_rate": 9.154138261385164e-06, "loss": 0.6511, "mean_token_accuracy": 0.7722989320755005, "step": 1677 }, { "epoch": 1.142322097378277, "grad_norm": 1.8756171464920044, "learning_rate": 9.152884154234147e-06, "loss": 0.6788, "mean_token_accuracy": 0.7790284752845764, "step": 1678 }, { "epoch": 1.1430030643513789, "grad_norm": 1.6806576251983643, "learning_rate": 9.151629204116807e-06, "loss": 0.7489, "mean_token_accuracy": 0.7608038485050201, "step": 1679 }, { "epoch": 1.1436840313244807, "grad_norm": 1.8867864608764648, "learning_rate": 9.15037341128788e-06, "loss": 0.8668, "mean_token_accuracy": 0.7275677025318146, "step": 1680 }, { "epoch": 1.1443649982975825, "grad_norm": 1.9065130949020386, "learning_rate": 9.149116776002271e-06, "loss": 0.8737, "mean_token_accuracy": 0.7174364030361176, "step": 1681 }, { "epoch": 1.1450459652706844, "grad_norm": 1.6392512321472168, "learning_rate": 9.147859298515053e-06, "loss": 0.7136, "mean_token_accuracy": 0.7494046986103058, "step": 1682 }, { "epoch": 1.1457269322437862, "grad_norm": 1.6398322582244873, "learning_rate": 9.146600979081477e-06, "loss": 0.7492, "mean_token_accuracy": 0.7446482479572296, "step": 1683 }, { "epoch": 1.146407899216888, "grad_norm": 1.908312201499939, "learning_rate": 9.145341817956958e-06, "loss": 0.6379, "mean_token_accuracy": 0.7920015454292297, "step": 1684 }, { "epoch": 1.1470888661899898, "grad_norm": 1.8874269723892212, "learning_rate": 9.144081815397086e-06, "loss": 0.6755, "mean_token_accuracy": 0.7748969495296478, "step": 1685 }, { "epoch": 1.1477698331630917, "grad_norm": 1.6475504636764526, "learning_rate": 9.14282097165762e-06, "loss": 0.7217, "mean_token_accuracy": 0.7765801548957825, "step": 1686 }, { "epoch": 1.1484508001361933, "grad_norm": 1.8352504968643188, "learning_rate": 9.14155928699449e-06, "loss": 0.6471, "mean_token_accuracy": 0.7830113768577576, "step": 1687 }, { "epoch": 1.1491317671092953, "grad_norm": 1.6970614194869995, "learning_rate": 9.140296761663799e-06, "loss": 0.729, "mean_token_accuracy": 0.7658594846725464, "step": 1688 }, { "epoch": 1.149812734082397, "grad_norm": 1.7200261354446411, "learning_rate": 9.139033395921814e-06, "loss": 0.7674, "mean_token_accuracy": 0.7262404561042786, "step": 1689 }, { "epoch": 1.1504937010554988, "grad_norm": 1.6510827541351318, "learning_rate": 9.137769190024983e-06, "loss": 0.7659, "mean_token_accuracy": 0.7313656508922577, "step": 1690 }, { "epoch": 1.1511746680286006, "grad_norm": 1.7508851289749146, "learning_rate": 9.136504144229915e-06, "loss": 0.6485, "mean_token_accuracy": 0.7827011346817017, "step": 1691 }, { "epoch": 1.1518556350017024, "grad_norm": 1.6581170558929443, "learning_rate": 9.135238258793394e-06, "loss": 0.7643, "mean_token_accuracy": 0.7588267922401428, "step": 1692 }, { "epoch": 1.1525366019748042, "grad_norm": 1.720718264579773, "learning_rate": 9.133971533972375e-06, "loss": 0.7906, "mean_token_accuracy": 0.7392296195030212, "step": 1693 }, { "epoch": 1.153217568947906, "grad_norm": 1.704024076461792, "learning_rate": 9.132703970023979e-06, "loss": 0.7536, "mean_token_accuracy": 0.7335870265960693, "step": 1694 }, { "epoch": 1.1538985359210079, "grad_norm": 1.8948887586593628, "learning_rate": 9.131435567205502e-06, "loss": 0.6515, "mean_token_accuracy": 0.790409505367279, "step": 1695 }, { "epoch": 1.1545795028941097, "grad_norm": 1.8681459426879883, "learning_rate": 9.130166325774411e-06, "loss": 0.6951, "mean_token_accuracy": 0.7810002565383911, "step": 1696 }, { "epoch": 1.1552604698672115, "grad_norm": 1.8436418771743774, "learning_rate": 9.128896245988338e-06, "loss": 0.6073, "mean_token_accuracy": 0.8005259335041046, "step": 1697 }, { "epoch": 1.1559414368403131, "grad_norm": 1.8314249515533447, "learning_rate": 9.127625328105089e-06, "loss": 0.6938, "mean_token_accuracy": 0.7613458633422852, "step": 1698 }, { "epoch": 1.156622403813415, "grad_norm": 1.7211278676986694, "learning_rate": 9.126353572382639e-06, "loss": 0.6061, "mean_token_accuracy": 0.7842853963375092, "step": 1699 }, { "epoch": 1.1573033707865168, "grad_norm": 1.7254050970077515, "learning_rate": 9.125080979079133e-06, "loss": 0.7395, "mean_token_accuracy": 0.7549102008342743, "step": 1700 }, { "epoch": 1.1579843377596186, "grad_norm": 1.9382566213607788, "learning_rate": 9.123807548452886e-06, "loss": 0.6856, "mean_token_accuracy": 0.7903409898281097, "step": 1701 }, { "epoch": 1.1586653047327204, "grad_norm": 1.851680874824524, "learning_rate": 9.122533280762384e-06, "loss": 0.7635, "mean_token_accuracy": 0.7658926844596863, "step": 1702 }, { "epoch": 1.1593462717058223, "grad_norm": 1.596719741821289, "learning_rate": 9.121258176266283e-06, "loss": 0.856, "mean_token_accuracy": 0.7227761447429657, "step": 1703 }, { "epoch": 1.160027238678924, "grad_norm": 1.770680546760559, "learning_rate": 9.119982235223406e-06, "loss": 0.6015, "mean_token_accuracy": 0.8012022376060486, "step": 1704 }, { "epoch": 1.160708205652026, "grad_norm": 1.7409433126449585, "learning_rate": 9.118705457892748e-06, "loss": 0.6873, "mean_token_accuracy": 0.7715353667736053, "step": 1705 }, { "epoch": 1.1613891726251278, "grad_norm": 2.02060866355896, "learning_rate": 9.117427844533474e-06, "loss": 0.6673, "mean_token_accuracy": 0.7881543040275574, "step": 1706 }, { "epoch": 1.1620701395982296, "grad_norm": 1.858594298362732, "learning_rate": 9.116149395404919e-06, "loss": 0.6735, "mean_token_accuracy": 0.7685970067977905, "step": 1707 }, { "epoch": 1.1627511065713314, "grad_norm": 1.7103222608566284, "learning_rate": 9.114870110766586e-06, "loss": 0.7927, "mean_token_accuracy": 0.740854948759079, "step": 1708 }, { "epoch": 1.163432073544433, "grad_norm": 1.9415504932403564, "learning_rate": 9.11358999087815e-06, "loss": 0.6874, "mean_token_accuracy": 0.772412896156311, "step": 1709 }, { "epoch": 1.1641130405175348, "grad_norm": 1.7260396480560303, "learning_rate": 9.11230903599945e-06, "loss": 0.6369, "mean_token_accuracy": 0.784630537033081, "step": 1710 }, { "epoch": 1.1647940074906367, "grad_norm": 1.671645998954773, "learning_rate": 9.111027246390504e-06, "loss": 0.6894, "mean_token_accuracy": 0.7779784202575684, "step": 1711 }, { "epoch": 1.1654749744637385, "grad_norm": 1.8079560995101929, "learning_rate": 9.10974462231149e-06, "loss": 0.6303, "mean_token_accuracy": 0.7893673181533813, "step": 1712 }, { "epoch": 1.1661559414368403, "grad_norm": 1.8324302434921265, "learning_rate": 9.10846116402276e-06, "loss": 0.767, "mean_token_accuracy": 0.7445246577262878, "step": 1713 }, { "epoch": 1.1668369084099421, "grad_norm": 1.6440763473510742, "learning_rate": 9.107176871784835e-06, "loss": 0.6567, "mean_token_accuracy": 0.7604553997516632, "step": 1714 }, { "epoch": 1.167517875383044, "grad_norm": 1.769466757774353, "learning_rate": 9.105891745858406e-06, "loss": 0.5846, "mean_token_accuracy": 0.8072551190853119, "step": 1715 }, { "epoch": 1.1681988423561458, "grad_norm": 1.6821858882904053, "learning_rate": 9.104605786504332e-06, "loss": 0.9059, "mean_token_accuracy": 0.724162757396698, "step": 1716 }, { "epoch": 1.1688798093292476, "grad_norm": 1.7298167943954468, "learning_rate": 9.103318993983639e-06, "loss": 0.7458, "mean_token_accuracy": 0.7576910853385925, "step": 1717 }, { "epoch": 1.1695607763023492, "grad_norm": 1.9507659673690796, "learning_rate": 9.102031368557527e-06, "loss": 0.7981, "mean_token_accuracy": 0.7434285581111908, "step": 1718 }, { "epoch": 1.170241743275451, "grad_norm": 1.8676860332489014, "learning_rate": 9.10074291048736e-06, "loss": 0.6205, "mean_token_accuracy": 0.7980692684650421, "step": 1719 }, { "epoch": 1.1709227102485529, "grad_norm": 1.8508920669555664, "learning_rate": 9.099453620034676e-06, "loss": 0.675, "mean_token_accuracy": 0.7781087756156921, "step": 1720 }, { "epoch": 1.1716036772216547, "grad_norm": 1.986057162284851, "learning_rate": 9.09816349746118e-06, "loss": 0.672, "mean_token_accuracy": 0.7862632274627686, "step": 1721 }, { "epoch": 1.1722846441947565, "grad_norm": 1.8808400630950928, "learning_rate": 9.096872543028742e-06, "loss": 0.7078, "mean_token_accuracy": 0.7532941997051239, "step": 1722 }, { "epoch": 1.1729656111678584, "grad_norm": 1.6992182731628418, "learning_rate": 9.095580756999406e-06, "loss": 0.7256, "mean_token_accuracy": 0.7644773423671722, "step": 1723 }, { "epoch": 1.1736465781409602, "grad_norm": 1.7101001739501953, "learning_rate": 9.094288139635386e-06, "loss": 0.9095, "mean_token_accuracy": 0.7205009758472443, "step": 1724 }, { "epoch": 1.174327545114062, "grad_norm": 1.584560513496399, "learning_rate": 9.092994691199057e-06, "loss": 0.7112, "mean_token_accuracy": 0.7595758736133575, "step": 1725 }, { "epoch": 1.1750085120871638, "grad_norm": 1.795163631439209, "learning_rate": 9.091700411952971e-06, "loss": 0.7016, "mean_token_accuracy": 0.7621840536594391, "step": 1726 }, { "epoch": 1.1756894790602657, "grad_norm": 1.90579354763031, "learning_rate": 9.090405302159843e-06, "loss": 0.7881, "mean_token_accuracy": 0.7462532818317413, "step": 1727 }, { "epoch": 1.1763704460333675, "grad_norm": 1.6993192434310913, "learning_rate": 9.089109362082558e-06, "loss": 0.6546, "mean_token_accuracy": 0.7768180966377258, "step": 1728 }, { "epoch": 1.177051413006469, "grad_norm": 1.749767541885376, "learning_rate": 9.087812591984174e-06, "loss": 0.7994, "mean_token_accuracy": 0.7450532019138336, "step": 1729 }, { "epoch": 1.177732379979571, "grad_norm": 1.7987041473388672, "learning_rate": 9.086514992127909e-06, "loss": 0.7527, "mean_token_accuracy": 0.755636066198349, "step": 1730 }, { "epoch": 1.1784133469526727, "grad_norm": 1.759169340133667, "learning_rate": 9.08521656277716e-06, "loss": 0.7915, "mean_token_accuracy": 0.7412112355232239, "step": 1731 }, { "epoch": 1.1790943139257746, "grad_norm": 1.9028658866882324, "learning_rate": 9.08391730419548e-06, "loss": 0.7195, "mean_token_accuracy": 0.765324205160141, "step": 1732 }, { "epoch": 1.1797752808988764, "grad_norm": 1.9854817390441895, "learning_rate": 9.082617216646601e-06, "loss": 0.6074, "mean_token_accuracy": 0.8001163303852081, "step": 1733 }, { "epoch": 1.1804562478719782, "grad_norm": 2.0250842571258545, "learning_rate": 9.081316300394417e-06, "loss": 0.6789, "mean_token_accuracy": 0.7771673798561096, "step": 1734 }, { "epoch": 1.18113721484508, "grad_norm": 1.872612714767456, "learning_rate": 9.080014555702993e-06, "loss": 0.6813, "mean_token_accuracy": 0.7692495286464691, "step": 1735 }, { "epoch": 1.1818181818181819, "grad_norm": 1.769277572631836, "learning_rate": 9.078711982836566e-06, "loss": 0.6657, "mean_token_accuracy": 0.7856216132640839, "step": 1736 }, { "epoch": 1.1824991487912837, "grad_norm": 2.069793224334717, "learning_rate": 9.077408582059527e-06, "loss": 0.743, "mean_token_accuracy": 0.7583776414394379, "step": 1737 }, { "epoch": 1.1831801157643853, "grad_norm": 1.9063910245895386, "learning_rate": 9.076104353636454e-06, "loss": 0.6152, "mean_token_accuracy": 0.8050388693809509, "step": 1738 }, { "epoch": 1.1838610827374871, "grad_norm": 1.786989450454712, "learning_rate": 9.07479929783208e-06, "loss": 0.7335, "mean_token_accuracy": 0.7600734531879425, "step": 1739 }, { "epoch": 1.184542049710589, "grad_norm": 1.8654767274856567, "learning_rate": 9.073493414911308e-06, "loss": 0.7418, "mean_token_accuracy": 0.7405271232128143, "step": 1740 }, { "epoch": 1.1852230166836908, "grad_norm": 1.7453440427780151, "learning_rate": 9.072186705139212e-06, "loss": 0.7627, "mean_token_accuracy": 0.7333837151527405, "step": 1741 }, { "epoch": 1.1859039836567926, "grad_norm": 1.847487449645996, "learning_rate": 9.070879168781033e-06, "loss": 0.6825, "mean_token_accuracy": 0.7686491012573242, "step": 1742 }, { "epoch": 1.1865849506298944, "grad_norm": 1.7204887866973877, "learning_rate": 9.069570806102177e-06, "loss": 0.7042, "mean_token_accuracy": 0.7684272825717926, "step": 1743 }, { "epoch": 1.1872659176029963, "grad_norm": 1.7699694633483887, "learning_rate": 9.068261617368221e-06, "loss": 0.6965, "mean_token_accuracy": 0.7626553177833557, "step": 1744 }, { "epoch": 1.187946884576098, "grad_norm": 1.9767454862594604, "learning_rate": 9.06695160284491e-06, "loss": 0.768, "mean_token_accuracy": 0.7461754977703094, "step": 1745 }, { "epoch": 1.1886278515492, "grad_norm": 1.792629361152649, "learning_rate": 9.065640762798153e-06, "loss": 0.7414, "mean_token_accuracy": 0.7600275874137878, "step": 1746 }, { "epoch": 1.1893088185223017, "grad_norm": 1.7185975313186646, "learning_rate": 9.06432909749403e-06, "loss": 0.6983, "mean_token_accuracy": 0.767496258020401, "step": 1747 }, { "epoch": 1.1899897854954036, "grad_norm": 1.6368910074234009, "learning_rate": 9.063016607198785e-06, "loss": 0.7511, "mean_token_accuracy": 0.7525335252285004, "step": 1748 }, { "epoch": 1.1906707524685052, "grad_norm": 1.7826359272003174, "learning_rate": 9.061703292178836e-06, "loss": 0.7053, "mean_token_accuracy": 0.7628914415836334, "step": 1749 }, { "epoch": 1.191351719441607, "grad_norm": 1.7700186967849731, "learning_rate": 9.060389152700759e-06, "loss": 0.7002, "mean_token_accuracy": 0.7797207236289978, "step": 1750 }, { "epoch": 1.1920326864147088, "grad_norm": 1.5138884782791138, "learning_rate": 9.059074189031308e-06, "loss": 0.8678, "mean_token_accuracy": 0.7191881239414215, "step": 1751 }, { "epoch": 1.1927136533878107, "grad_norm": 1.8188151121139526, "learning_rate": 9.057758401437393e-06, "loss": 0.8241, "mean_token_accuracy": 0.7227032780647278, "step": 1752 }, { "epoch": 1.1933946203609125, "grad_norm": 1.7882380485534668, "learning_rate": 9.056441790186099e-06, "loss": 0.6821, "mean_token_accuracy": 0.7732020616531372, "step": 1753 }, { "epoch": 1.1940755873340143, "grad_norm": 1.9419894218444824, "learning_rate": 9.05512435554468e-06, "loss": 0.6768, "mean_token_accuracy": 0.7637686431407928, "step": 1754 }, { "epoch": 1.1947565543071161, "grad_norm": 1.6551188230514526, "learning_rate": 9.053806097780548e-06, "loss": 0.8012, "mean_token_accuracy": 0.7510688006877899, "step": 1755 }, { "epoch": 1.195437521280218, "grad_norm": 1.7080632448196411, "learning_rate": 9.052487017161289e-06, "loss": 0.8344, "mean_token_accuracy": 0.7197174429893494, "step": 1756 }, { "epoch": 1.1961184882533198, "grad_norm": 1.7365134954452515, "learning_rate": 9.051167113954655e-06, "loss": 0.7985, "mean_token_accuracy": 0.7571618258953094, "step": 1757 }, { "epoch": 1.1967994552264214, "grad_norm": 1.750636100769043, "learning_rate": 9.049846388428564e-06, "loss": 0.8123, "mean_token_accuracy": 0.7477938830852509, "step": 1758 }, { "epoch": 1.1974804221995234, "grad_norm": 1.6492189168930054, "learning_rate": 9.048524840851103e-06, "loss": 0.8812, "mean_token_accuracy": 0.7191182971000671, "step": 1759 }, { "epoch": 1.198161389172625, "grad_norm": 1.6672741174697876, "learning_rate": 9.04720247149052e-06, "loss": 0.7092, "mean_token_accuracy": 0.7437719106674194, "step": 1760 }, { "epoch": 1.1988423561457269, "grad_norm": 1.7529971599578857, "learning_rate": 9.04587928061524e-06, "loss": 0.8226, "mean_token_accuracy": 0.7433914244174957, "step": 1761 }, { "epoch": 1.1995233231188287, "grad_norm": 1.7426724433898926, "learning_rate": 9.044555268493842e-06, "loss": 0.6657, "mean_token_accuracy": 0.7810835540294647, "step": 1762 }, { "epoch": 1.2002042900919305, "grad_norm": 1.9403858184814453, "learning_rate": 9.043230435395083e-06, "loss": 0.7617, "mean_token_accuracy": 0.7546471655368805, "step": 1763 }, { "epoch": 1.2008852570650324, "grad_norm": 1.6609998941421509, "learning_rate": 9.04190478158788e-06, "loss": 0.7335, "mean_token_accuracy": 0.7520596981048584, "step": 1764 }, { "epoch": 1.2015662240381342, "grad_norm": 1.62490713596344, "learning_rate": 9.040578307341322e-06, "loss": 0.7458, "mean_token_accuracy": 0.7669457197189331, "step": 1765 }, { "epoch": 1.202247191011236, "grad_norm": 1.9062113761901855, "learning_rate": 9.039251012924655e-06, "loss": 0.7523, "mean_token_accuracy": 0.735367476940155, "step": 1766 }, { "epoch": 1.2029281579843378, "grad_norm": 1.806621789932251, "learning_rate": 9.037922898607303e-06, "loss": 0.6722, "mean_token_accuracy": 0.7731513977050781, "step": 1767 }, { "epoch": 1.2036091249574397, "grad_norm": 1.8057165145874023, "learning_rate": 9.036593964658849e-06, "loss": 0.6066, "mean_token_accuracy": 0.7978222966194153, "step": 1768 }, { "epoch": 1.2042900919305413, "grad_norm": 1.7555705308914185, "learning_rate": 9.035264211349047e-06, "loss": 0.7978, "mean_token_accuracy": 0.7250668704509735, "step": 1769 }, { "epoch": 1.204971058903643, "grad_norm": 1.7848775386810303, "learning_rate": 9.03393363894781e-06, "loss": 0.8335, "mean_token_accuracy": 0.7302711009979248, "step": 1770 }, { "epoch": 1.205652025876745, "grad_norm": 1.7191060781478882, "learning_rate": 9.032602247725224e-06, "loss": 0.7141, "mean_token_accuracy": 0.771205484867096, "step": 1771 }, { "epoch": 1.2063329928498467, "grad_norm": 1.7228500843048096, "learning_rate": 9.03127003795154e-06, "loss": 0.8066, "mean_token_accuracy": 0.7211948335170746, "step": 1772 }, { "epoch": 1.2070139598229486, "grad_norm": 1.904587745666504, "learning_rate": 9.029937009897176e-06, "loss": 0.7677, "mean_token_accuracy": 0.7526505887508392, "step": 1773 }, { "epoch": 1.2076949267960504, "grad_norm": 1.940287709236145, "learning_rate": 9.028603163832711e-06, "loss": 0.6372, "mean_token_accuracy": 0.7914485335350037, "step": 1774 }, { "epoch": 1.2083758937691522, "grad_norm": 1.6906038522720337, "learning_rate": 9.027268500028897e-06, "loss": 0.6547, "mean_token_accuracy": 0.788988471031189, "step": 1775 }, { "epoch": 1.209056860742254, "grad_norm": 1.7292590141296387, "learning_rate": 9.025933018756646e-06, "loss": 0.7399, "mean_token_accuracy": 0.7410309910774231, "step": 1776 }, { "epoch": 1.2097378277153559, "grad_norm": 1.702305555343628, "learning_rate": 9.02459672028704e-06, "loss": 0.7649, "mean_token_accuracy": 0.7600404918193817, "step": 1777 }, { "epoch": 1.2104187946884577, "grad_norm": 1.6830917596817017, "learning_rate": 9.023259604891324e-06, "loss": 0.7192, "mean_token_accuracy": 0.7681789100170135, "step": 1778 }, { "epoch": 1.2110997616615595, "grad_norm": 1.6245981454849243, "learning_rate": 9.02192167284091e-06, "loss": 0.7671, "mean_token_accuracy": 0.7510569989681244, "step": 1779 }, { "epoch": 1.2117807286346611, "grad_norm": 1.7491657733917236, "learning_rate": 9.020582924407378e-06, "loss": 0.6964, "mean_token_accuracy": 0.7505424618721008, "step": 1780 }, { "epoch": 1.212461695607763, "grad_norm": 1.8597620725631714, "learning_rate": 9.019243359862469e-06, "loss": 0.6346, "mean_token_accuracy": 0.7757901251316071, "step": 1781 }, { "epoch": 1.2131426625808648, "grad_norm": 1.8093851804733276, "learning_rate": 9.017902979478093e-06, "loss": 0.7253, "mean_token_accuracy": 0.7640716731548309, "step": 1782 }, { "epoch": 1.2138236295539666, "grad_norm": 1.9647167921066284, "learning_rate": 9.016561783526328e-06, "loss": 0.7922, "mean_token_accuracy": 0.7605278491973877, "step": 1783 }, { "epoch": 1.2145045965270684, "grad_norm": 1.831082820892334, "learning_rate": 9.015219772279408e-06, "loss": 0.8479, "mean_token_accuracy": 0.7509850561618805, "step": 1784 }, { "epoch": 1.2151855635001703, "grad_norm": 1.7656781673431396, "learning_rate": 9.013876946009748e-06, "loss": 0.6746, "mean_token_accuracy": 0.7668471336364746, "step": 1785 }, { "epoch": 1.215866530473272, "grad_norm": 1.7804994583129883, "learning_rate": 9.01253330498991e-06, "loss": 0.6796, "mean_token_accuracy": 0.7621986567974091, "step": 1786 }, { "epoch": 1.216547497446374, "grad_norm": 1.7044222354888916, "learning_rate": 9.011188849492638e-06, "loss": 0.7119, "mean_token_accuracy": 0.7640595138072968, "step": 1787 }, { "epoch": 1.2172284644194757, "grad_norm": 1.7886691093444824, "learning_rate": 9.00984357979083e-06, "loss": 0.7733, "mean_token_accuracy": 0.7437613308429718, "step": 1788 }, { "epoch": 1.2179094313925773, "grad_norm": 1.9008708000183105, "learning_rate": 9.008497496157554e-06, "loss": 0.7727, "mean_token_accuracy": 0.7270643413066864, "step": 1789 }, { "epoch": 1.2185903983656792, "grad_norm": 1.9679324626922607, "learning_rate": 9.007150598866042e-06, "loss": 0.7597, "mean_token_accuracy": 0.7549166083335876, "step": 1790 }, { "epoch": 1.219271365338781, "grad_norm": 1.9894566535949707, "learning_rate": 9.005802888189694e-06, "loss": 0.6759, "mean_token_accuracy": 0.7764894366264343, "step": 1791 }, { "epoch": 1.2199523323118828, "grad_norm": 1.8717480897903442, "learning_rate": 9.00445436440207e-06, "loss": 0.7497, "mean_token_accuracy": 0.7524578869342804, "step": 1792 }, { "epoch": 1.2206332992849847, "grad_norm": 1.7770695686340332, "learning_rate": 9.003105027776901e-06, "loss": 0.7959, "mean_token_accuracy": 0.7514853477478027, "step": 1793 }, { "epoch": 1.2213142662580865, "grad_norm": 1.838653326034546, "learning_rate": 9.001754878588079e-06, "loss": 0.7007, "mean_token_accuracy": 0.7756403088569641, "step": 1794 }, { "epoch": 1.2219952332311883, "grad_norm": 2.0486457347869873, "learning_rate": 9.00040391710966e-06, "loss": 0.5857, "mean_token_accuracy": 0.806473582983017, "step": 1795 }, { "epoch": 1.2226762002042901, "grad_norm": 1.7230045795440674, "learning_rate": 8.999052143615866e-06, "loss": 0.7516, "mean_token_accuracy": 0.7467727661132812, "step": 1796 }, { "epoch": 1.223357167177392, "grad_norm": 1.8030668497085571, "learning_rate": 8.99769955838109e-06, "loss": 0.6416, "mean_token_accuracy": 0.7760827541351318, "step": 1797 }, { "epoch": 1.2240381341504938, "grad_norm": 1.7492269277572632, "learning_rate": 8.996346161679877e-06, "loss": 0.7883, "mean_token_accuracy": 0.7530811131000519, "step": 1798 }, { "epoch": 1.2247191011235956, "grad_norm": 1.818162202835083, "learning_rate": 8.994991953786949e-06, "loss": 0.7255, "mean_token_accuracy": 0.7495923340320587, "step": 1799 }, { "epoch": 1.2254000680966972, "grad_norm": 1.8517234325408936, "learning_rate": 8.993636934977186e-06, "loss": 0.6866, "mean_token_accuracy": 0.7693005800247192, "step": 1800 }, { "epoch": 1.226081035069799, "grad_norm": 1.8627450466156006, "learning_rate": 8.992281105525635e-06, "loss": 0.6093, "mean_token_accuracy": 0.8061238825321198, "step": 1801 }, { "epoch": 1.2267620020429009, "grad_norm": 1.7507283687591553, "learning_rate": 8.990924465707504e-06, "loss": 0.6606, "mean_token_accuracy": 0.7697960734367371, "step": 1802 }, { "epoch": 1.2274429690160027, "grad_norm": 1.7730413675308228, "learning_rate": 8.989567015798173e-06, "loss": 0.6604, "mean_token_accuracy": 0.7768855094909668, "step": 1803 }, { "epoch": 1.2281239359891045, "grad_norm": 1.982144832611084, "learning_rate": 8.988208756073177e-06, "loss": 0.692, "mean_token_accuracy": 0.7720184326171875, "step": 1804 }, { "epoch": 1.2288049029622063, "grad_norm": 1.6614881753921509, "learning_rate": 8.986849686808226e-06, "loss": 0.7784, "mean_token_accuracy": 0.7518456280231476, "step": 1805 }, { "epoch": 1.2294858699353082, "grad_norm": 1.7016164064407349, "learning_rate": 8.985489808279181e-06, "loss": 0.7468, "mean_token_accuracy": 0.7576333284378052, "step": 1806 }, { "epoch": 1.23016683690841, "grad_norm": 1.8613407611846924, "learning_rate": 8.984129120762079e-06, "loss": 0.7052, "mean_token_accuracy": 0.7652458250522614, "step": 1807 }, { "epoch": 1.2308478038815118, "grad_norm": 1.8126311302185059, "learning_rate": 8.982767624533118e-06, "loss": 0.6753, "mean_token_accuracy": 0.7729280591011047, "step": 1808 }, { "epoch": 1.2315287708546134, "grad_norm": 1.8283745050430298, "learning_rate": 8.981405319868657e-06, "loss": 0.673, "mean_token_accuracy": 0.7765382528305054, "step": 1809 }, { "epoch": 1.2322097378277155, "grad_norm": 1.8992139101028442, "learning_rate": 8.98004220704522e-06, "loss": 0.6138, "mean_token_accuracy": 0.7941725552082062, "step": 1810 }, { "epoch": 1.232890704800817, "grad_norm": 1.9361796379089355, "learning_rate": 8.978678286339499e-06, "loss": 0.6129, "mean_token_accuracy": 0.7985895872116089, "step": 1811 }, { "epoch": 1.233571671773919, "grad_norm": 1.7746326923370361, "learning_rate": 8.977313558028347e-06, "loss": 0.779, "mean_token_accuracy": 0.7382631003856659, "step": 1812 }, { "epoch": 1.2342526387470207, "grad_norm": 1.8557454347610474, "learning_rate": 8.975948022388778e-06, "loss": 0.7178, "mean_token_accuracy": 0.7565895915031433, "step": 1813 }, { "epoch": 1.2349336057201226, "grad_norm": 1.8685427904129028, "learning_rate": 8.974581679697975e-06, "loss": 0.7308, "mean_token_accuracy": 0.7493669986724854, "step": 1814 }, { "epoch": 1.2356145726932244, "grad_norm": 1.9432954788208008, "learning_rate": 8.973214530233285e-06, "loss": 0.6945, "mean_token_accuracy": 0.7660083770751953, "step": 1815 }, { "epoch": 1.2362955396663262, "grad_norm": 1.6565096378326416, "learning_rate": 8.971846574272214e-06, "loss": 0.8011, "mean_token_accuracy": 0.7460070848464966, "step": 1816 }, { "epoch": 1.236976506639428, "grad_norm": 1.8150756359100342, "learning_rate": 8.970477812092434e-06, "loss": 0.7747, "mean_token_accuracy": 0.7488499283790588, "step": 1817 }, { "epoch": 1.2376574736125299, "grad_norm": 1.826412558555603, "learning_rate": 8.969108243971781e-06, "loss": 0.5503, "mean_token_accuracy": 0.7883681356906891, "step": 1818 }, { "epoch": 1.2383384405856317, "grad_norm": 1.8693950176239014, "learning_rate": 8.967737870188257e-06, "loss": 0.6563, "mean_token_accuracy": 0.7809664607048035, "step": 1819 }, { "epoch": 1.2390194075587333, "grad_norm": 1.6830534934997559, "learning_rate": 8.966366691020022e-06, "loss": 0.8238, "mean_token_accuracy": 0.727841854095459, "step": 1820 }, { "epoch": 1.2397003745318351, "grad_norm": 1.8000109195709229, "learning_rate": 8.964994706745404e-06, "loss": 0.7002, "mean_token_accuracy": 0.7710365951061249, "step": 1821 }, { "epoch": 1.240381341504937, "grad_norm": 1.762335181236267, "learning_rate": 8.963621917642892e-06, "loss": 0.6354, "mean_token_accuracy": 0.7776910960674286, "step": 1822 }, { "epoch": 1.2410623084780388, "grad_norm": 1.6829533576965332, "learning_rate": 8.96224832399114e-06, "loss": 0.8036, "mean_token_accuracy": 0.7445551753044128, "step": 1823 }, { "epoch": 1.2417432754511406, "grad_norm": 1.7306574583053589, "learning_rate": 8.960873926068967e-06, "loss": 0.6949, "mean_token_accuracy": 0.7713648080825806, "step": 1824 }, { "epoch": 1.2424242424242424, "grad_norm": 1.7456127405166626, "learning_rate": 8.959498724155349e-06, "loss": 0.749, "mean_token_accuracy": 0.7294979095458984, "step": 1825 }, { "epoch": 1.2431052093973443, "grad_norm": 1.7987291812896729, "learning_rate": 8.95812271852943e-06, "loss": 0.7896, "mean_token_accuracy": 0.7428217530250549, "step": 1826 }, { "epoch": 1.243786176370446, "grad_norm": 1.6670148372650146, "learning_rate": 8.956745909470517e-06, "loss": 0.9319, "mean_token_accuracy": 0.7072334289550781, "step": 1827 }, { "epoch": 1.244467143343548, "grad_norm": 1.6304590702056885, "learning_rate": 8.95536829725808e-06, "loss": 0.8088, "mean_token_accuracy": 0.7309253513813019, "step": 1828 }, { "epoch": 1.2451481103166497, "grad_norm": 1.792508602142334, "learning_rate": 8.95398988217175e-06, "loss": 0.6377, "mean_token_accuracy": 0.7945215702056885, "step": 1829 }, { "epoch": 1.2458290772897516, "grad_norm": 1.915236473083496, "learning_rate": 8.952610664491323e-06, "loss": 0.7711, "mean_token_accuracy": 0.7464571297168732, "step": 1830 }, { "epoch": 1.2465100442628532, "grad_norm": 1.7212415933609009, "learning_rate": 8.951230644496758e-06, "loss": 0.8052, "mean_token_accuracy": 0.723166823387146, "step": 1831 }, { "epoch": 1.247191011235955, "grad_norm": 1.6210681200027466, "learning_rate": 8.949849822468175e-06, "loss": 0.8309, "mean_token_accuracy": 0.7314178347587585, "step": 1832 }, { "epoch": 1.2478719782090568, "grad_norm": 1.7088890075683594, "learning_rate": 8.948468198685857e-06, "loss": 0.7119, "mean_token_accuracy": 0.7578313648700714, "step": 1833 }, { "epoch": 1.2485529451821586, "grad_norm": 1.6295654773712158, "learning_rate": 8.947085773430251e-06, "loss": 0.6769, "mean_token_accuracy": 0.7745909094810486, "step": 1834 }, { "epoch": 1.2492339121552605, "grad_norm": 1.6520298719406128, "learning_rate": 8.94570254698197e-06, "loss": 0.7792, "mean_token_accuracy": 0.7263862490653992, "step": 1835 }, { "epoch": 1.2499148791283623, "grad_norm": 1.771848201751709, "learning_rate": 8.94431851962178e-06, "loss": 0.7531, "mean_token_accuracy": 0.7237650454044342, "step": 1836 }, { "epoch": 1.2505958461014641, "grad_norm": 1.9032680988311768, "learning_rate": 8.942933691630618e-06, "loss": 0.7361, "mean_token_accuracy": 0.7619430124759674, "step": 1837 }, { "epoch": 1.251276813074566, "grad_norm": 1.814904808998108, "learning_rate": 8.941548063289584e-06, "loss": 0.7276, "mean_token_accuracy": 0.7620201706886292, "step": 1838 }, { "epoch": 1.2519577800476678, "grad_norm": 1.7108250856399536, "learning_rate": 8.940161634879933e-06, "loss": 0.8472, "mean_token_accuracy": 0.7238894701004028, "step": 1839 }, { "epoch": 1.2526387470207694, "grad_norm": 1.761521339416504, "learning_rate": 8.93877440668309e-06, "loss": 0.7877, "mean_token_accuracy": 0.7318826913833618, "step": 1840 }, { "epoch": 1.2533197139938714, "grad_norm": 1.8200691938400269, "learning_rate": 8.937386378980637e-06, "loss": 0.6777, "mean_token_accuracy": 0.7687727510929108, "step": 1841 }, { "epoch": 1.254000680966973, "grad_norm": 1.6731996536254883, "learning_rate": 8.93599755205432e-06, "loss": 0.7669, "mean_token_accuracy": 0.737275093793869, "step": 1842 }, { "epoch": 1.2546816479400749, "grad_norm": 1.7796034812927246, "learning_rate": 8.934607926186052e-06, "loss": 0.7434, "mean_token_accuracy": 0.7601605653762817, "step": 1843 }, { "epoch": 1.2553626149131767, "grad_norm": 1.5892688035964966, "learning_rate": 8.9332175016579e-06, "loss": 0.8286, "mean_token_accuracy": 0.7313100695610046, "step": 1844 }, { "epoch": 1.2560435818862785, "grad_norm": 1.83755362033844, "learning_rate": 8.931826278752098e-06, "loss": 0.7686, "mean_token_accuracy": 0.7571521103382111, "step": 1845 }, { "epoch": 1.2567245488593803, "grad_norm": 1.7304649353027344, "learning_rate": 8.930434257751041e-06, "loss": 0.5922, "mean_token_accuracy": 0.8011581301689148, "step": 1846 }, { "epoch": 1.2574055158324822, "grad_norm": 1.7581530809402466, "learning_rate": 8.929041438937287e-06, "loss": 0.7586, "mean_token_accuracy": 0.7552484571933746, "step": 1847 }, { "epoch": 1.258086482805584, "grad_norm": 1.8735061883926392, "learning_rate": 8.927647822593554e-06, "loss": 0.8492, "mean_token_accuracy": 0.7282113134860992, "step": 1848 }, { "epoch": 1.2587674497786856, "grad_norm": 1.7086904048919678, "learning_rate": 8.926253409002724e-06, "loss": 0.7902, "mean_token_accuracy": 0.7485077083110809, "step": 1849 }, { "epoch": 1.2594484167517876, "grad_norm": 1.9250469207763672, "learning_rate": 8.924858198447839e-06, "loss": 0.7195, "mean_token_accuracy": 0.7657827734947205, "step": 1850 }, { "epoch": 1.2601293837248893, "grad_norm": 1.7110787630081177, "learning_rate": 8.923462191212102e-06, "loss": 0.7827, "mean_token_accuracy": 0.7372272312641144, "step": 1851 }, { "epoch": 1.260810350697991, "grad_norm": 1.6457388401031494, "learning_rate": 8.922065387578881e-06, "loss": 0.8187, "mean_token_accuracy": 0.7298794388771057, "step": 1852 }, { "epoch": 1.261491317671093, "grad_norm": 1.6796797513961792, "learning_rate": 8.920667787831704e-06, "loss": 0.8454, "mean_token_accuracy": 0.7262552082538605, "step": 1853 }, { "epoch": 1.2621722846441947, "grad_norm": 1.7795770168304443, "learning_rate": 8.919269392254261e-06, "loss": 0.8114, "mean_token_accuracy": 0.7367283701896667, "step": 1854 }, { "epoch": 1.2628532516172966, "grad_norm": 1.5879627466201782, "learning_rate": 8.9178702011304e-06, "loss": 0.8262, "mean_token_accuracy": 0.7455514073371887, "step": 1855 }, { "epoch": 1.2635342185903984, "grad_norm": 1.8840993642807007, "learning_rate": 8.916470214744138e-06, "loss": 0.6008, "mean_token_accuracy": 0.8069439232349396, "step": 1856 }, { "epoch": 1.2642151855635002, "grad_norm": 1.7232149839401245, "learning_rate": 8.915069433379644e-06, "loss": 0.7657, "mean_token_accuracy": 0.7435509264469147, "step": 1857 }, { "epoch": 1.264896152536602, "grad_norm": 1.8819489479064941, "learning_rate": 8.913667857321257e-06, "loss": 0.6586, "mean_token_accuracy": 0.7895686626434326, "step": 1858 }, { "epoch": 1.2655771195097039, "grad_norm": 1.7238894701004028, "learning_rate": 8.912265486853474e-06, "loss": 0.7004, "mean_token_accuracy": 0.7640913128852844, "step": 1859 }, { "epoch": 1.2662580864828055, "grad_norm": 1.7072490453720093, "learning_rate": 8.91086232226095e-06, "loss": 0.7541, "mean_token_accuracy": 0.7625080645084381, "step": 1860 }, { "epoch": 1.2669390534559075, "grad_norm": 1.8088269233703613, "learning_rate": 8.909458363828505e-06, "loss": 0.7754, "mean_token_accuracy": 0.7406366765499115, "step": 1861 }, { "epoch": 1.2676200204290091, "grad_norm": 1.9093289375305176, "learning_rate": 8.908053611841119e-06, "loss": 0.7776, "mean_token_accuracy": 0.7525279819965363, "step": 1862 }, { "epoch": 1.268300987402111, "grad_norm": 1.7363427877426147, "learning_rate": 8.906648066583934e-06, "loss": 0.6912, "mean_token_accuracy": 0.7717678546905518, "step": 1863 }, { "epoch": 1.2689819543752128, "grad_norm": 1.7680310010910034, "learning_rate": 8.905241728342253e-06, "loss": 0.7513, "mean_token_accuracy": 0.7582981586456299, "step": 1864 }, { "epoch": 1.2696629213483146, "grad_norm": 1.7597377300262451, "learning_rate": 8.903834597401537e-06, "loss": 0.7611, "mean_token_accuracy": 0.7461023926734924, "step": 1865 }, { "epoch": 1.2703438883214164, "grad_norm": 1.830428123474121, "learning_rate": 8.90242667404741e-06, "loss": 0.7406, "mean_token_accuracy": 0.746415764093399, "step": 1866 }, { "epoch": 1.2710248552945183, "grad_norm": 1.6470613479614258, "learning_rate": 8.901017958565661e-06, "loss": 0.8111, "mean_token_accuracy": 0.7463804185390472, "step": 1867 }, { "epoch": 1.27170582226762, "grad_norm": 1.7735273838043213, "learning_rate": 8.899608451242233e-06, "loss": 0.7522, "mean_token_accuracy": 0.73903888463974, "step": 1868 }, { "epoch": 1.272386789240722, "grad_norm": 1.681498408317566, "learning_rate": 8.898198152363231e-06, "loss": 0.7834, "mean_token_accuracy": 0.7332159876823425, "step": 1869 }, { "epoch": 1.2730677562138237, "grad_norm": 1.911963939666748, "learning_rate": 8.896787062214926e-06, "loss": 0.6832, "mean_token_accuracy": 0.7709405720233917, "step": 1870 }, { "epoch": 1.2737487231869253, "grad_norm": 1.7766227722167969, "learning_rate": 8.895375181083741e-06, "loss": 0.6905, "mean_token_accuracy": 0.7735888957977295, "step": 1871 }, { "epoch": 1.2744296901600272, "grad_norm": 1.769210696220398, "learning_rate": 8.893962509256268e-06, "loss": 0.6788, "mean_token_accuracy": 0.7768207788467407, "step": 1872 }, { "epoch": 1.275110657133129, "grad_norm": 1.7188000679016113, "learning_rate": 8.892549047019257e-06, "loss": 0.6356, "mean_token_accuracy": 0.790284276008606, "step": 1873 }, { "epoch": 1.2757916241062308, "grad_norm": 1.716653823852539, "learning_rate": 8.891134794659613e-06, "loss": 0.7859, "mean_token_accuracy": 0.7372171580791473, "step": 1874 }, { "epoch": 1.2764725910793326, "grad_norm": 1.7942228317260742, "learning_rate": 8.889719752464408e-06, "loss": 0.7366, "mean_token_accuracy": 0.763150691986084, "step": 1875 }, { "epoch": 1.2771535580524345, "grad_norm": 1.7155572175979614, "learning_rate": 8.888303920720872e-06, "loss": 0.6968, "mean_token_accuracy": 0.7721517980098724, "step": 1876 }, { "epoch": 1.2778345250255363, "grad_norm": 1.8139407634735107, "learning_rate": 8.886887299716395e-06, "loss": 0.6802, "mean_token_accuracy": 0.7608284652233124, "step": 1877 }, { "epoch": 1.2785154919986381, "grad_norm": 2.0289289951324463, "learning_rate": 8.885469889738529e-06, "loss": 0.6836, "mean_token_accuracy": 0.7726547420024872, "step": 1878 }, { "epoch": 1.27919645897174, "grad_norm": 1.841859221458435, "learning_rate": 8.884051691074982e-06, "loss": 0.7832, "mean_token_accuracy": 0.7490105628967285, "step": 1879 }, { "epoch": 1.2798774259448416, "grad_norm": 1.6810497045516968, "learning_rate": 8.882632704013626e-06, "loss": 0.7119, "mean_token_accuracy": 0.7532955408096313, "step": 1880 }, { "epoch": 1.2805583929179436, "grad_norm": 1.6515531539916992, "learning_rate": 8.881212928842491e-06, "loss": 0.8286, "mean_token_accuracy": 0.7171813547611237, "step": 1881 }, { "epoch": 1.2812393598910452, "grad_norm": 1.7754255533218384, "learning_rate": 8.879792365849768e-06, "loss": 0.6924, "mean_token_accuracy": 0.7678831219673157, "step": 1882 }, { "epoch": 1.281920326864147, "grad_norm": 1.7210406064987183, "learning_rate": 8.87837101532381e-06, "loss": 0.6995, "mean_token_accuracy": 0.7560451328754425, "step": 1883 }, { "epoch": 1.2826012938372489, "grad_norm": 1.7172799110412598, "learning_rate": 8.876948877553124e-06, "loss": 0.7679, "mean_token_accuracy": 0.7357721030712128, "step": 1884 }, { "epoch": 1.2832822608103507, "grad_norm": 1.8726271390914917, "learning_rate": 8.875525952826383e-06, "loss": 0.7665, "mean_token_accuracy": 0.7500739395618439, "step": 1885 }, { "epoch": 1.2839632277834525, "grad_norm": 1.879784345626831, "learning_rate": 8.874102241432414e-06, "loss": 0.9252, "mean_token_accuracy": 0.6966257393360138, "step": 1886 }, { "epoch": 1.2846441947565543, "grad_norm": 1.773353934288025, "learning_rate": 8.872677743660209e-06, "loss": 0.7265, "mean_token_accuracy": 0.7506945431232452, "step": 1887 }, { "epoch": 1.2853251617296562, "grad_norm": 1.9048795700073242, "learning_rate": 8.871252459798918e-06, "loss": 0.7967, "mean_token_accuracy": 0.7397038638591766, "step": 1888 }, { "epoch": 1.286006128702758, "grad_norm": 1.784545660018921, "learning_rate": 8.869826390137848e-06, "loss": 0.6809, "mean_token_accuracy": 0.7727041840553284, "step": 1889 }, { "epoch": 1.2866870956758598, "grad_norm": 1.9361697435379028, "learning_rate": 8.868399534966469e-06, "loss": 0.7359, "mean_token_accuracy": 0.7670926451683044, "step": 1890 }, { "epoch": 1.2873680626489614, "grad_norm": 1.7252795696258545, "learning_rate": 8.866971894574407e-06, "loss": 0.6569, "mean_token_accuracy": 0.7844515144824982, "step": 1891 }, { "epoch": 1.2880490296220635, "grad_norm": 1.8092188835144043, "learning_rate": 8.86554346925145e-06, "loss": 0.7305, "mean_token_accuracy": 0.7587889730930328, "step": 1892 }, { "epoch": 1.288729996595165, "grad_norm": 1.5837903022766113, "learning_rate": 8.864114259287545e-06, "loss": 0.8369, "mean_token_accuracy": 0.7318941652774811, "step": 1893 }, { "epoch": 1.289410963568267, "grad_norm": 1.678444504737854, "learning_rate": 8.862684264972796e-06, "loss": 0.7348, "mean_token_accuracy": 0.752456545829773, "step": 1894 }, { "epoch": 1.2900919305413687, "grad_norm": 1.686727523803711, "learning_rate": 8.861253486597473e-06, "loss": 0.6891, "mean_token_accuracy": 0.7697069644927979, "step": 1895 }, { "epoch": 1.2907728975144706, "grad_norm": 1.7667802572250366, "learning_rate": 8.859821924451993e-06, "loss": 0.7549, "mean_token_accuracy": 0.7439951598644257, "step": 1896 }, { "epoch": 1.2914538644875724, "grad_norm": 1.844546914100647, "learning_rate": 8.858389578826945e-06, "loss": 0.6957, "mean_token_accuracy": 0.7589485347270966, "step": 1897 }, { "epoch": 1.2921348314606742, "grad_norm": 1.8690651655197144, "learning_rate": 8.856956450013068e-06, "loss": 0.6977, "mean_token_accuracy": 0.7714897990226746, "step": 1898 }, { "epoch": 1.292815798433776, "grad_norm": 1.681109070777893, "learning_rate": 8.855522538301266e-06, "loss": 0.844, "mean_token_accuracy": 0.738383561372757, "step": 1899 }, { "epoch": 1.2934967654068776, "grad_norm": 1.8881429433822632, "learning_rate": 8.854087843982597e-06, "loss": 0.7202, "mean_token_accuracy": 0.7443767488002777, "step": 1900 }, { "epoch": 1.2941777323799797, "grad_norm": 1.790487289428711, "learning_rate": 8.852652367348282e-06, "loss": 0.6486, "mean_token_accuracy": 0.7726520597934723, "step": 1901 }, { "epoch": 1.2948586993530813, "grad_norm": 1.732883095741272, "learning_rate": 8.851216108689697e-06, "loss": 0.6541, "mean_token_accuracy": 0.7831607460975647, "step": 1902 }, { "epoch": 1.2955396663261831, "grad_norm": 1.8239667415618896, "learning_rate": 8.84977906829838e-06, "loss": 0.7006, "mean_token_accuracy": 0.7691961228847504, "step": 1903 }, { "epoch": 1.296220633299285, "grad_norm": 1.6934466361999512, "learning_rate": 8.848341246466024e-06, "loss": 0.7206, "mean_token_accuracy": 0.7607898712158203, "step": 1904 }, { "epoch": 1.2969016002723868, "grad_norm": 1.7526789903640747, "learning_rate": 8.846902643484485e-06, "loss": 0.7009, "mean_token_accuracy": 0.7667503356933594, "step": 1905 }, { "epoch": 1.2975825672454886, "grad_norm": 1.779131293296814, "learning_rate": 8.845463259645774e-06, "loss": 0.7258, "mean_token_accuracy": 0.7649518847465515, "step": 1906 }, { "epoch": 1.2982635342185904, "grad_norm": 1.6796578168869019, "learning_rate": 8.844023095242065e-06, "loss": 0.7211, "mean_token_accuracy": 0.7603534460067749, "step": 1907 }, { "epoch": 1.2989445011916922, "grad_norm": 1.7980769872665405, "learning_rate": 8.842582150565685e-06, "loss": 0.6692, "mean_token_accuracy": 0.777534544467926, "step": 1908 }, { "epoch": 1.299625468164794, "grad_norm": 1.7279733419418335, "learning_rate": 8.84114042590912e-06, "loss": 0.6987, "mean_token_accuracy": 0.7540516257286072, "step": 1909 }, { "epoch": 1.300306435137896, "grad_norm": 1.7811402082443237, "learning_rate": 8.839697921565019e-06, "loss": 0.7426, "mean_token_accuracy": 0.7446891069412231, "step": 1910 }, { "epoch": 1.3009874021109975, "grad_norm": 1.6307579278945923, "learning_rate": 8.838254637826188e-06, "loss": 0.6773, "mean_token_accuracy": 0.7730073034763336, "step": 1911 }, { "epoch": 1.3016683690840996, "grad_norm": 1.8442343473434448, "learning_rate": 8.836810574985584e-06, "loss": 0.6195, "mean_token_accuracy": 0.79389488697052, "step": 1912 }, { "epoch": 1.3023493360572012, "grad_norm": 1.8182225227355957, "learning_rate": 8.835365733336329e-06, "loss": 0.6964, "mean_token_accuracy": 0.7540337145328522, "step": 1913 }, { "epoch": 1.303030303030303, "grad_norm": 1.7127459049224854, "learning_rate": 8.833920113171708e-06, "loss": 0.7938, "mean_token_accuracy": 0.7426830232143402, "step": 1914 }, { "epoch": 1.3037112700034048, "grad_norm": 1.6978070735931396, "learning_rate": 8.83247371478515e-06, "loss": 0.7879, "mean_token_accuracy": 0.745358943939209, "step": 1915 }, { "epoch": 1.3043922369765066, "grad_norm": 1.8390004634857178, "learning_rate": 8.831026538470256e-06, "loss": 0.7287, "mean_token_accuracy": 0.7544108331203461, "step": 1916 }, { "epoch": 1.3050732039496085, "grad_norm": 1.6289656162261963, "learning_rate": 8.829578584520773e-06, "loss": 0.8708, "mean_token_accuracy": 0.7234528958797455, "step": 1917 }, { "epoch": 1.3057541709227103, "grad_norm": 1.9995222091674805, "learning_rate": 8.828129853230617e-06, "loss": 0.7036, "mean_token_accuracy": 0.7715011835098267, "step": 1918 }, { "epoch": 1.3064351378958121, "grad_norm": 1.8390300273895264, "learning_rate": 8.826680344893852e-06, "loss": 0.6955, "mean_token_accuracy": 0.7586704790592194, "step": 1919 }, { "epoch": 1.3071161048689137, "grad_norm": 1.642879605293274, "learning_rate": 8.825230059804706e-06, "loss": 0.8307, "mean_token_accuracy": 0.7243013679981232, "step": 1920 }, { "epoch": 1.3077970718420158, "grad_norm": 1.6828973293304443, "learning_rate": 8.823778998257561e-06, "loss": 0.8663, "mean_token_accuracy": 0.7229560315608978, "step": 1921 }, { "epoch": 1.3084780388151174, "grad_norm": 1.8551701307296753, "learning_rate": 8.822327160546961e-06, "loss": 0.6749, "mean_token_accuracy": 0.7813259363174438, "step": 1922 }, { "epoch": 1.3091590057882192, "grad_norm": 1.8311564922332764, "learning_rate": 8.820874546967605e-06, "loss": 0.7756, "mean_token_accuracy": 0.733936607837677, "step": 1923 }, { "epoch": 1.309839972761321, "grad_norm": 1.716233253479004, "learning_rate": 8.819421157814347e-06, "loss": 0.7166, "mean_token_accuracy": 0.767715722322464, "step": 1924 }, { "epoch": 1.3105209397344229, "grad_norm": 1.79314124584198, "learning_rate": 8.817966993382202e-06, "loss": 0.6297, "mean_token_accuracy": 0.7907609343528748, "step": 1925 }, { "epoch": 1.3112019067075247, "grad_norm": 1.7578552961349487, "learning_rate": 8.81651205396634e-06, "loss": 0.7406, "mean_token_accuracy": 0.7567363083362579, "step": 1926 }, { "epoch": 1.3118828736806265, "grad_norm": 1.77146577835083, "learning_rate": 8.815056339862091e-06, "loss": 0.7402, "mean_token_accuracy": 0.755766749382019, "step": 1927 }, { "epoch": 1.3125638406537283, "grad_norm": 1.7407863140106201, "learning_rate": 8.81359985136494e-06, "loss": 0.7335, "mean_token_accuracy": 0.7418685555458069, "step": 1928 }, { "epoch": 1.3132448076268302, "grad_norm": 1.8860454559326172, "learning_rate": 8.812142588770531e-06, "loss": 0.7716, "mean_token_accuracy": 0.7556568682193756, "step": 1929 }, { "epoch": 1.313925774599932, "grad_norm": 1.9880234003067017, "learning_rate": 8.810684552374662e-06, "loss": 0.6703, "mean_token_accuracy": 0.7757235467433929, "step": 1930 }, { "epoch": 1.3146067415730336, "grad_norm": 1.7836774587631226, "learning_rate": 8.809225742473293e-06, "loss": 0.6491, "mean_token_accuracy": 0.7820946574211121, "step": 1931 }, { "epoch": 1.3152877085461356, "grad_norm": 1.8391543626785278, "learning_rate": 8.807766159362535e-06, "loss": 0.7069, "mean_token_accuracy": 0.7572974264621735, "step": 1932 }, { "epoch": 1.3159686755192372, "grad_norm": 1.7456772327423096, "learning_rate": 8.806305803338659e-06, "loss": 0.8766, "mean_token_accuracy": 0.7157209515571594, "step": 1933 }, { "epoch": 1.316649642492339, "grad_norm": 1.9141871929168701, "learning_rate": 8.804844674698097e-06, "loss": 0.6126, "mean_token_accuracy": 0.7998281419277191, "step": 1934 }, { "epoch": 1.317330609465441, "grad_norm": 1.5967057943344116, "learning_rate": 8.80338277373743e-06, "loss": 0.7149, "mean_token_accuracy": 0.7312992215156555, "step": 1935 }, { "epoch": 1.3180115764385427, "grad_norm": 1.8028663396835327, "learning_rate": 8.801920100753401e-06, "loss": 0.6345, "mean_token_accuracy": 0.7911082804203033, "step": 1936 }, { "epoch": 1.3186925434116445, "grad_norm": 1.7355409860610962, "learning_rate": 8.800456656042907e-06, "loss": 0.7582, "mean_token_accuracy": 0.7544325888156891, "step": 1937 }, { "epoch": 1.3193735103847464, "grad_norm": 1.7897602319717407, "learning_rate": 8.798992439903005e-06, "loss": 0.7677, "mean_token_accuracy": 0.7458985447883606, "step": 1938 }, { "epoch": 1.3200544773578482, "grad_norm": 1.843640685081482, "learning_rate": 8.797527452630904e-06, "loss": 0.6402, "mean_token_accuracy": 0.7806185781955719, "step": 1939 }, { "epoch": 1.32073544433095, "grad_norm": 1.9807162284851074, "learning_rate": 8.796061694523974e-06, "loss": 0.6093, "mean_token_accuracy": 0.7939153611660004, "step": 1940 }, { "epoch": 1.3214164113040519, "grad_norm": 1.7899010181427002, "learning_rate": 8.79459516587974e-06, "loss": 0.8038, "mean_token_accuracy": 0.7462877035140991, "step": 1941 }, { "epoch": 1.3220973782771535, "grad_norm": 1.7637827396392822, "learning_rate": 8.79312786699588e-06, "loss": 0.8452, "mean_token_accuracy": 0.7236028611660004, "step": 1942 }, { "epoch": 1.3227783452502553, "grad_norm": 1.7124147415161133, "learning_rate": 8.791659798170234e-06, "loss": 0.7488, "mean_token_accuracy": 0.7330819070339203, "step": 1943 }, { "epoch": 1.323459312223357, "grad_norm": 1.628735899925232, "learning_rate": 8.790190959700793e-06, "loss": 0.8085, "mean_token_accuracy": 0.7369545102119446, "step": 1944 }, { "epoch": 1.324140279196459, "grad_norm": 1.8712458610534668, "learning_rate": 8.78872135188571e-06, "loss": 0.6911, "mean_token_accuracy": 0.775566577911377, "step": 1945 }, { "epoch": 1.3248212461695608, "grad_norm": 1.81197988986969, "learning_rate": 8.787250975023288e-06, "loss": 0.7419, "mean_token_accuracy": 0.7603749930858612, "step": 1946 }, { "epoch": 1.3255022131426626, "grad_norm": 1.932469367980957, "learning_rate": 8.785779829411991e-06, "loss": 0.5677, "mean_token_accuracy": 0.818142294883728, "step": 1947 }, { "epoch": 1.3261831801157644, "grad_norm": 1.7406387329101562, "learning_rate": 8.784307915350436e-06, "loss": 0.8281, "mean_token_accuracy": 0.7421314716339111, "step": 1948 }, { "epoch": 1.3268641470888662, "grad_norm": 1.809381365776062, "learning_rate": 8.782835233137397e-06, "loss": 0.8594, "mean_token_accuracy": 0.7247633635997772, "step": 1949 }, { "epoch": 1.327545114061968, "grad_norm": 1.7031160593032837, "learning_rate": 8.781361783071804e-06, "loss": 0.7233, "mean_token_accuracy": 0.7620941996574402, "step": 1950 }, { "epoch": 1.3282260810350697, "grad_norm": 1.791658878326416, "learning_rate": 8.779887565452743e-06, "loss": 0.7731, "mean_token_accuracy": 0.7329408526420593, "step": 1951 }, { "epoch": 1.3289070480081717, "grad_norm": 1.6967908143997192, "learning_rate": 8.778412580579457e-06, "loss": 0.7898, "mean_token_accuracy": 0.7503160536289215, "step": 1952 }, { "epoch": 1.3295880149812733, "grad_norm": 1.765739917755127, "learning_rate": 8.776936828751341e-06, "loss": 0.7012, "mean_token_accuracy": 0.7532666027545929, "step": 1953 }, { "epoch": 1.3302689819543752, "grad_norm": 1.807860255241394, "learning_rate": 8.77546031026795e-06, "loss": 0.6825, "mean_token_accuracy": 0.7729334831237793, "step": 1954 }, { "epoch": 1.330949948927477, "grad_norm": 1.7892231941223145, "learning_rate": 8.773983025428989e-06, "loss": 0.7711, "mean_token_accuracy": 0.7389592826366425, "step": 1955 }, { "epoch": 1.3316309159005788, "grad_norm": 1.763679027557373, "learning_rate": 8.772504974534329e-06, "loss": 0.7123, "mean_token_accuracy": 0.7638707458972931, "step": 1956 }, { "epoch": 1.3323118828736806, "grad_norm": 1.9492566585540771, "learning_rate": 8.771026157883983e-06, "loss": 0.7267, "mean_token_accuracy": 0.7790367305278778, "step": 1957 }, { "epoch": 1.3329928498467825, "grad_norm": 1.7268896102905273, "learning_rate": 8.769546575778129e-06, "loss": 0.7447, "mean_token_accuracy": 0.7588078081607819, "step": 1958 }, { "epoch": 1.3336738168198843, "grad_norm": 1.7142763137817383, "learning_rate": 8.768066228517097e-06, "loss": 0.638, "mean_token_accuracy": 0.791984498500824, "step": 1959 }, { "epoch": 1.334354783792986, "grad_norm": 1.6823662519454956, "learning_rate": 8.766585116401374e-06, "loss": 0.6625, "mean_token_accuracy": 0.785545825958252, "step": 1960 }, { "epoch": 1.335035750766088, "grad_norm": 1.9097265005111694, "learning_rate": 8.765103239731602e-06, "loss": 0.8452, "mean_token_accuracy": 0.7241529226303101, "step": 1961 }, { "epoch": 1.3357167177391895, "grad_norm": 1.8464386463165283, "learning_rate": 8.763620598808573e-06, "loss": 0.6232, "mean_token_accuracy": 0.7996216714382172, "step": 1962 }, { "epoch": 1.3363976847122916, "grad_norm": 1.770129680633545, "learning_rate": 8.762137193933241e-06, "loss": 0.7208, "mean_token_accuracy": 0.7690006196498871, "step": 1963 }, { "epoch": 1.3370786516853932, "grad_norm": 1.8235447406768799, "learning_rate": 8.760653025406713e-06, "loss": 0.7161, "mean_token_accuracy": 0.7757138013839722, "step": 1964 }, { "epoch": 1.337759618658495, "grad_norm": 1.7251689434051514, "learning_rate": 8.759168093530252e-06, "loss": 0.6971, "mean_token_accuracy": 0.7726120054721832, "step": 1965 }, { "epoch": 1.3384405856315968, "grad_norm": 1.7589346170425415, "learning_rate": 8.75768239860527e-06, "loss": 0.6101, "mean_token_accuracy": 0.7995341718196869, "step": 1966 }, { "epoch": 1.3391215526046987, "grad_norm": 1.7637442350387573, "learning_rate": 8.756195940933342e-06, "loss": 0.7775, "mean_token_accuracy": 0.7570458352565765, "step": 1967 }, { "epoch": 1.3398025195778005, "grad_norm": 1.7228798866271973, "learning_rate": 8.754708720816191e-06, "loss": 0.8245, "mean_token_accuracy": 0.719502866268158, "step": 1968 }, { "epoch": 1.3404834865509023, "grad_norm": 1.5480629205703735, "learning_rate": 8.753220738555704e-06, "loss": 0.7299, "mean_token_accuracy": 0.7625630795955658, "step": 1969 }, { "epoch": 1.3411644535240042, "grad_norm": 1.9011061191558838, "learning_rate": 8.75173199445391e-06, "loss": 0.7213, "mean_token_accuracy": 0.7622510194778442, "step": 1970 }, { "epoch": 1.3418454204971058, "grad_norm": 1.7893602848052979, "learning_rate": 8.750242488813e-06, "loss": 0.6653, "mean_token_accuracy": 0.7874182462692261, "step": 1971 }, { "epoch": 1.3425263874702078, "grad_norm": 1.8021382093429565, "learning_rate": 8.748752221935321e-06, "loss": 0.7024, "mean_token_accuracy": 0.7667863667011261, "step": 1972 }, { "epoch": 1.3432073544433094, "grad_norm": 1.7422271966934204, "learning_rate": 8.747261194123373e-06, "loss": 0.6952, "mean_token_accuracy": 0.7690790295600891, "step": 1973 }, { "epoch": 1.3438883214164112, "grad_norm": 1.4751157760620117, "learning_rate": 8.74576940567981e-06, "loss": 0.9249, "mean_token_accuracy": 0.6977764666080475, "step": 1974 }, { "epoch": 1.344569288389513, "grad_norm": 1.8416657447814941, "learning_rate": 8.744276856907437e-06, "loss": 0.6264, "mean_token_accuracy": 0.8035623133182526, "step": 1975 }, { "epoch": 1.3452502553626149, "grad_norm": 2.0747737884521484, "learning_rate": 8.742783548109219e-06, "loss": 0.6025, "mean_token_accuracy": 0.7989988327026367, "step": 1976 }, { "epoch": 1.3459312223357167, "grad_norm": 1.723002552986145, "learning_rate": 8.741289479588271e-06, "loss": 0.7685, "mean_token_accuracy": 0.727622777223587, "step": 1977 }, { "epoch": 1.3466121893088185, "grad_norm": 1.7985351085662842, "learning_rate": 8.739794651647868e-06, "loss": 0.6449, "mean_token_accuracy": 0.7844003438949585, "step": 1978 }, { "epoch": 1.3472931562819204, "grad_norm": 1.8034104108810425, "learning_rate": 8.73829906459143e-06, "loss": 0.6854, "mean_token_accuracy": 0.7780974507331848, "step": 1979 }, { "epoch": 1.3479741232550222, "grad_norm": 1.7167818546295166, "learning_rate": 8.73680271872254e-06, "loss": 0.7213, "mean_token_accuracy": 0.7489154934883118, "step": 1980 }, { "epoch": 1.348655090228124, "grad_norm": 1.817505121231079, "learning_rate": 8.735305614344929e-06, "loss": 0.7783, "mean_token_accuracy": 0.7410545647144318, "step": 1981 }, { "epoch": 1.3493360572012256, "grad_norm": 1.7781707048416138, "learning_rate": 8.733807751762486e-06, "loss": 0.6599, "mean_token_accuracy": 0.7619809210300446, "step": 1982 }, { "epoch": 1.3500170241743277, "grad_norm": 2.0324690341949463, "learning_rate": 8.732309131279252e-06, "loss": 0.7053, "mean_token_accuracy": 0.759757786989212, "step": 1983 }, { "epoch": 1.3506979911474293, "grad_norm": 1.6947228908538818, "learning_rate": 8.73080975319942e-06, "loss": 0.8935, "mean_token_accuracy": 0.7150765657424927, "step": 1984 }, { "epoch": 1.351378958120531, "grad_norm": 1.7038638591766357, "learning_rate": 8.729309617827342e-06, "loss": 0.7667, "mean_token_accuracy": 0.746997743844986, "step": 1985 }, { "epoch": 1.352059925093633, "grad_norm": 1.7152096033096313, "learning_rate": 8.727808725467519e-06, "loss": 0.6447, "mean_token_accuracy": 0.7885347902774811, "step": 1986 }, { "epoch": 1.3527408920667348, "grad_norm": 1.5322048664093018, "learning_rate": 8.726307076424605e-06, "loss": 0.7782, "mean_token_accuracy": 0.7393187284469604, "step": 1987 }, { "epoch": 1.3534218590398366, "grad_norm": 1.553517460823059, "learning_rate": 8.724804671003413e-06, "loss": 0.8718, "mean_token_accuracy": 0.7210979461669922, "step": 1988 }, { "epoch": 1.3541028260129384, "grad_norm": 1.7706964015960693, "learning_rate": 8.723301509508903e-06, "loss": 0.8282, "mean_token_accuracy": 0.7245754897594452, "step": 1989 }, { "epoch": 1.3547837929860402, "grad_norm": 1.7709720134735107, "learning_rate": 8.721797592246196e-06, "loss": 0.6708, "mean_token_accuracy": 0.7851246297359467, "step": 1990 }, { "epoch": 1.355464759959142, "grad_norm": 1.681313395500183, "learning_rate": 8.720292919520558e-06, "loss": 0.7259, "mean_token_accuracy": 0.7449875473976135, "step": 1991 }, { "epoch": 1.356145726932244, "grad_norm": 1.8032212257385254, "learning_rate": 8.718787491637414e-06, "loss": 0.7263, "mean_token_accuracy": 0.7655259072780609, "step": 1992 }, { "epoch": 1.3568266939053455, "grad_norm": 1.734449863433838, "learning_rate": 8.717281308902343e-06, "loss": 0.8536, "mean_token_accuracy": 0.7221971750259399, "step": 1993 }, { "epoch": 1.3575076608784473, "grad_norm": 1.844112753868103, "learning_rate": 8.715774371621073e-06, "loss": 0.7269, "mean_token_accuracy": 0.7611609697341919, "step": 1994 }, { "epoch": 1.3581886278515491, "grad_norm": 1.7382960319519043, "learning_rate": 8.714266680099487e-06, "loss": 0.6258, "mean_token_accuracy": 0.7859601378440857, "step": 1995 }, { "epoch": 1.358869594824651, "grad_norm": 1.8116549253463745, "learning_rate": 8.712758234643621e-06, "loss": 0.6834, "mean_token_accuracy": 0.7682881653308868, "step": 1996 }, { "epoch": 1.3595505617977528, "grad_norm": 1.7935079336166382, "learning_rate": 8.711249035559666e-06, "loss": 0.9073, "mean_token_accuracy": 0.6996386051177979, "step": 1997 }, { "epoch": 1.3602315287708546, "grad_norm": 1.8474931716918945, "learning_rate": 8.70973908315396e-06, "loss": 0.7223, "mean_token_accuracy": 0.7534148395061493, "step": 1998 }, { "epoch": 1.3609124957439565, "grad_norm": 1.746803879737854, "learning_rate": 8.708228377733005e-06, "loss": 0.7, "mean_token_accuracy": 0.771309107542038, "step": 1999 }, { "epoch": 1.3615934627170583, "grad_norm": 1.5627562999725342, "learning_rate": 8.706716919603443e-06, "loss": 0.676, "mean_token_accuracy": 0.781534880399704, "step": 2000 }, { "epoch": 1.36227442969016, "grad_norm": 1.680046796798706, "learning_rate": 8.70520470907208e-06, "loss": 0.7666, "mean_token_accuracy": 0.7405770719051361, "step": 2001 }, { "epoch": 1.3629553966632617, "grad_norm": 1.7452867031097412, "learning_rate": 8.703691746445863e-06, "loss": 0.8273, "mean_token_accuracy": 0.7241105437278748, "step": 2002 }, { "epoch": 1.3636363636363638, "grad_norm": 1.756976842880249, "learning_rate": 8.702178032031905e-06, "loss": 0.8123, "mean_token_accuracy": 0.7291762232780457, "step": 2003 }, { "epoch": 1.3643173306094654, "grad_norm": 1.9365575313568115, "learning_rate": 8.700663566137462e-06, "loss": 0.7107, "mean_token_accuracy": 0.7665524184703827, "step": 2004 }, { "epoch": 1.3649982975825672, "grad_norm": 1.8171008825302124, "learning_rate": 8.699148349069944e-06, "loss": 0.6915, "mean_token_accuracy": 0.7777045965194702, "step": 2005 }, { "epoch": 1.365679264555669, "grad_norm": 1.9795445203781128, "learning_rate": 8.697632381136915e-06, "loss": 0.6648, "mean_token_accuracy": 0.7851660549640656, "step": 2006 }, { "epoch": 1.3663602315287708, "grad_norm": 1.660304069519043, "learning_rate": 8.696115662646093e-06, "loss": 0.7769, "mean_token_accuracy": 0.7506266534328461, "step": 2007 }, { "epoch": 1.3670411985018727, "grad_norm": 1.7262778282165527, "learning_rate": 8.694598193905346e-06, "loss": 0.5665, "mean_token_accuracy": 0.7957954704761505, "step": 2008 }, { "epoch": 1.3677221654749745, "grad_norm": 1.5972987413406372, "learning_rate": 8.693079975222694e-06, "loss": 0.7451, "mean_token_accuracy": 0.74956414103508, "step": 2009 }, { "epoch": 1.3684031324480763, "grad_norm": 1.7590484619140625, "learning_rate": 8.691561006906314e-06, "loss": 0.8543, "mean_token_accuracy": 0.7287825047969818, "step": 2010 }, { "epoch": 1.3690840994211781, "grad_norm": 1.7019118070602417, "learning_rate": 8.690041289264523e-06, "loss": 0.8496, "mean_token_accuracy": 0.7411366403102875, "step": 2011 }, { "epoch": 1.36976506639428, "grad_norm": 1.8683501482009888, "learning_rate": 8.688520822605808e-06, "loss": 0.592, "mean_token_accuracy": 0.8014708161354065, "step": 2012 }, { "epoch": 1.3704460333673816, "grad_norm": 1.7991811037063599, "learning_rate": 8.686999607238791e-06, "loss": 0.6603, "mean_token_accuracy": 0.775848776102066, "step": 2013 }, { "epoch": 1.3711270003404834, "grad_norm": 1.730696201324463, "learning_rate": 8.685477643472257e-06, "loss": 0.6965, "mean_token_accuracy": 0.7646900713443756, "step": 2014 }, { "epoch": 1.3718079673135852, "grad_norm": 1.778198003768921, "learning_rate": 8.68395493161514e-06, "loss": 0.6653, "mean_token_accuracy": 0.7651737630367279, "step": 2015 }, { "epoch": 1.372488934286687, "grad_norm": 1.7175694704055786, "learning_rate": 8.682431471976523e-06, "loss": 0.8046, "mean_token_accuracy": 0.7230232059955597, "step": 2016 }, { "epoch": 1.3731699012597889, "grad_norm": 2.0419745445251465, "learning_rate": 8.680907264865642e-06, "loss": 0.7007, "mean_token_accuracy": 0.7717506587505341, "step": 2017 }, { "epoch": 1.3738508682328907, "grad_norm": 1.7800674438476562, "learning_rate": 8.679382310591889e-06, "loss": 0.634, "mean_token_accuracy": 0.7991328835487366, "step": 2018 }, { "epoch": 1.3745318352059925, "grad_norm": 1.6735504865646362, "learning_rate": 8.677856609464802e-06, "loss": 0.7284, "mean_token_accuracy": 0.7560174465179443, "step": 2019 }, { "epoch": 1.3752128021790944, "grad_norm": 1.725569486618042, "learning_rate": 8.676330161794073e-06, "loss": 0.8813, "mean_token_accuracy": 0.7127203345298767, "step": 2020 }, { "epoch": 1.3758937691521962, "grad_norm": 1.839928388595581, "learning_rate": 8.674802967889547e-06, "loss": 0.948, "mean_token_accuracy": 0.6960031688213348, "step": 2021 }, { "epoch": 1.3765747361252978, "grad_norm": 1.6708478927612305, "learning_rate": 8.673275028061216e-06, "loss": 0.753, "mean_token_accuracy": 0.7637625336647034, "step": 2022 }, { "epoch": 1.3772557030983998, "grad_norm": 1.6110461950302124, "learning_rate": 8.671746342619231e-06, "loss": 0.7276, "mean_token_accuracy": 0.7523481547832489, "step": 2023 }, { "epoch": 1.3779366700715014, "grad_norm": 1.7444868087768555, "learning_rate": 8.670216911873886e-06, "loss": 0.6585, "mean_token_accuracy": 0.7669985294342041, "step": 2024 }, { "epoch": 1.3786176370446033, "grad_norm": 1.638556957244873, "learning_rate": 8.66868673613563e-06, "loss": 0.6898, "mean_token_accuracy": 0.758233368396759, "step": 2025 }, { "epoch": 1.379298604017705, "grad_norm": 1.586930274963379, "learning_rate": 8.667155815715065e-06, "loss": 0.8774, "mean_token_accuracy": 0.7190226018428802, "step": 2026 }, { "epoch": 1.379979570990807, "grad_norm": 1.703453779220581, "learning_rate": 8.66562415092294e-06, "loss": 0.6752, "mean_token_accuracy": 0.7840808629989624, "step": 2027 }, { "epoch": 1.3806605379639088, "grad_norm": 1.6275066137313843, "learning_rate": 8.66409174207016e-06, "loss": 0.6982, "mean_token_accuracy": 0.7774627804756165, "step": 2028 }, { "epoch": 1.3813415049370106, "grad_norm": 1.7413303852081299, "learning_rate": 8.662558589467778e-06, "loss": 0.6726, "mean_token_accuracy": 0.7670726180076599, "step": 2029 }, { "epoch": 1.3820224719101124, "grad_norm": 1.9018826484680176, "learning_rate": 8.661024693426998e-06, "loss": 0.8066, "mean_token_accuracy": 0.7206730544567108, "step": 2030 }, { "epoch": 1.3827034388832142, "grad_norm": 1.7672494649887085, "learning_rate": 8.659490054259174e-06, "loss": 0.7396, "mean_token_accuracy": 0.7580422759056091, "step": 2031 }, { "epoch": 1.383384405856316, "grad_norm": 1.636285662651062, "learning_rate": 8.657954672275815e-06, "loss": 0.7672, "mean_token_accuracy": 0.7407210469245911, "step": 2032 }, { "epoch": 1.3840653728294177, "grad_norm": 1.7890294790267944, "learning_rate": 8.656418547788574e-06, "loss": 0.6928, "mean_token_accuracy": 0.7678869068622589, "step": 2033 }, { "epoch": 1.3847463398025197, "grad_norm": 1.7257678508758545, "learning_rate": 8.654881681109263e-06, "loss": 0.8966, "mean_token_accuracy": 0.6986809372901917, "step": 2034 }, { "epoch": 1.3854273067756213, "grad_norm": 1.5877131223678589, "learning_rate": 8.653344072549837e-06, "loss": 0.8024, "mean_token_accuracy": 0.7273659110069275, "step": 2035 }, { "epoch": 1.3861082737487231, "grad_norm": 1.6738176345825195, "learning_rate": 8.651805722422406e-06, "loss": 0.9511, "mean_token_accuracy": 0.7223553955554962, "step": 2036 }, { "epoch": 1.386789240721825, "grad_norm": 1.677558183670044, "learning_rate": 8.650266631039232e-06, "loss": 0.8712, "mean_token_accuracy": 0.726225346326828, "step": 2037 }, { "epoch": 1.3874702076949268, "grad_norm": 1.6739695072174072, "learning_rate": 8.648726798712721e-06, "loss": 0.8058, "mean_token_accuracy": 0.744548887014389, "step": 2038 }, { "epoch": 1.3881511746680286, "grad_norm": 1.691565752029419, "learning_rate": 8.647186225755435e-06, "loss": 0.7343, "mean_token_accuracy": 0.7549123167991638, "step": 2039 }, { "epoch": 1.3888321416411304, "grad_norm": 1.6827250719070435, "learning_rate": 8.645644912480086e-06, "loss": 0.6414, "mean_token_accuracy": 0.7749837636947632, "step": 2040 }, { "epoch": 1.3895131086142323, "grad_norm": 1.774489402770996, "learning_rate": 8.644102859199532e-06, "loss": 0.7455, "mean_token_accuracy": 0.7597992718219757, "step": 2041 }, { "epoch": 1.3901940755873339, "grad_norm": 1.554215431213379, "learning_rate": 8.642560066226785e-06, "loss": 0.8448, "mean_token_accuracy": 0.7327285408973694, "step": 2042 }, { "epoch": 1.390875042560436, "grad_norm": 1.735182762145996, "learning_rate": 8.64101653387501e-06, "loss": 0.7756, "mean_token_accuracy": 0.7332989871501923, "step": 2043 }, { "epoch": 1.3915560095335375, "grad_norm": 1.7359379529953003, "learning_rate": 8.639472262457513e-06, "loss": 0.6704, "mean_token_accuracy": 0.7387229800224304, "step": 2044 }, { "epoch": 1.3922369765066394, "grad_norm": 1.8478822708129883, "learning_rate": 8.637927252287758e-06, "loss": 0.6284, "mean_token_accuracy": 0.8017570972442627, "step": 2045 }, { "epoch": 1.3929179434797412, "grad_norm": 1.8314194679260254, "learning_rate": 8.636381503679357e-06, "loss": 0.7837, "mean_token_accuracy": 0.7326376140117645, "step": 2046 }, { "epoch": 1.393598910452843, "grad_norm": 1.6739195585250854, "learning_rate": 8.63483501694607e-06, "loss": 0.7442, "mean_token_accuracy": 0.7514954805374146, "step": 2047 }, { "epoch": 1.3942798774259448, "grad_norm": 1.7952769994735718, "learning_rate": 8.633287792401808e-06, "loss": 0.8021, "mean_token_accuracy": 0.7444299757480621, "step": 2048 }, { "epoch": 1.3949608443990467, "grad_norm": 1.7550050020217896, "learning_rate": 8.631739830360632e-06, "loss": 0.7975, "mean_token_accuracy": 0.7299355864524841, "step": 2049 }, { "epoch": 1.3956418113721485, "grad_norm": 1.9780203104019165, "learning_rate": 8.630191131136754e-06, "loss": 0.6557, "mean_token_accuracy": 0.7754234671592712, "step": 2050 }, { "epoch": 1.3963227783452503, "grad_norm": 1.7241843938827515, "learning_rate": 8.628641695044531e-06, "loss": 0.7574, "mean_token_accuracy": 0.7537806034088135, "step": 2051 }, { "epoch": 1.3970037453183521, "grad_norm": 1.926102876663208, "learning_rate": 8.627091522398476e-06, "loss": 0.6232, "mean_token_accuracy": 0.7935283482074738, "step": 2052 }, { "epoch": 1.3976847122914537, "grad_norm": 1.7708325386047363, "learning_rate": 8.625540613513247e-06, "loss": 0.8227, "mean_token_accuracy": 0.7204873561859131, "step": 2053 }, { "epoch": 1.3983656792645558, "grad_norm": 1.7404906749725342, "learning_rate": 8.623988968703651e-06, "loss": 0.6808, "mean_token_accuracy": 0.7746590077877045, "step": 2054 }, { "epoch": 1.3990466462376574, "grad_norm": 1.988544225692749, "learning_rate": 8.622436588284649e-06, "loss": 0.6525, "mean_token_accuracy": 0.7870492935180664, "step": 2055 }, { "epoch": 1.3997276132107592, "grad_norm": 1.7317330837249756, "learning_rate": 8.620883472571346e-06, "loss": 0.753, "mean_token_accuracy": 0.7375084459781647, "step": 2056 }, { "epoch": 1.400408580183861, "grad_norm": 1.5538345575332642, "learning_rate": 8.619329621879e-06, "loss": 0.7943, "mean_token_accuracy": 0.7244156301021576, "step": 2057 }, { "epoch": 1.4010895471569629, "grad_norm": 1.717194676399231, "learning_rate": 8.617775036523014e-06, "loss": 0.672, "mean_token_accuracy": 0.7592485845088959, "step": 2058 }, { "epoch": 1.4017705141300647, "grad_norm": 1.557176947593689, "learning_rate": 8.616219716818948e-06, "loss": 0.8788, "mean_token_accuracy": 0.7362346947193146, "step": 2059 }, { "epoch": 1.4024514811031665, "grad_norm": 1.8438549041748047, "learning_rate": 8.614663663082503e-06, "loss": 0.6659, "mean_token_accuracy": 0.7795924544334412, "step": 2060 }, { "epoch": 1.4031324480762684, "grad_norm": 1.9061992168426514, "learning_rate": 8.61310687562953e-06, "loss": 0.6626, "mean_token_accuracy": 0.7858507335186005, "step": 2061 }, { "epoch": 1.4038134150493702, "grad_norm": 1.7595940828323364, "learning_rate": 8.611549354776036e-06, "loss": 0.7861, "mean_token_accuracy": 0.75189208984375, "step": 2062 }, { "epoch": 1.404494382022472, "grad_norm": 1.7913323640823364, "learning_rate": 8.609991100838167e-06, "loss": 0.7393, "mean_token_accuracy": 0.7614403665065765, "step": 2063 }, { "epoch": 1.4051753489955736, "grad_norm": 1.7942177057266235, "learning_rate": 8.608432114132226e-06, "loss": 0.7049, "mean_token_accuracy": 0.7702132761478424, "step": 2064 }, { "epoch": 1.4058563159686754, "grad_norm": 1.7778544425964355, "learning_rate": 8.606872394974659e-06, "loss": 0.7744, "mean_token_accuracy": 0.7525109052658081, "step": 2065 }, { "epoch": 1.4065372829417773, "grad_norm": 1.649656891822815, "learning_rate": 8.605311943682065e-06, "loss": 0.6808, "mean_token_accuracy": 0.7861151993274689, "step": 2066 }, { "epoch": 1.407218249914879, "grad_norm": 1.6739596128463745, "learning_rate": 8.603750760571187e-06, "loss": 0.7917, "mean_token_accuracy": 0.7160140573978424, "step": 2067 }, { "epoch": 1.407899216887981, "grad_norm": 1.8632322549819946, "learning_rate": 8.60218884595892e-06, "loss": 0.7312, "mean_token_accuracy": 0.7590509653091431, "step": 2068 }, { "epoch": 1.4085801838610827, "grad_norm": 1.9184736013412476, "learning_rate": 8.60062620016231e-06, "loss": 0.7184, "mean_token_accuracy": 0.7641745209693909, "step": 2069 }, { "epoch": 1.4092611508341846, "grad_norm": 1.6323137283325195, "learning_rate": 8.599062823498545e-06, "loss": 0.6991, "mean_token_accuracy": 0.7675134539604187, "step": 2070 }, { "epoch": 1.4099421178072864, "grad_norm": 1.87397301197052, "learning_rate": 8.597498716284965e-06, "loss": 0.7887, "mean_token_accuracy": 0.7184387445449829, "step": 2071 }, { "epoch": 1.4106230847803882, "grad_norm": 1.7490286827087402, "learning_rate": 8.595933878839055e-06, "loss": 0.6811, "mean_token_accuracy": 0.7788030207157135, "step": 2072 }, { "epoch": 1.4113040517534898, "grad_norm": 1.9447717666625977, "learning_rate": 8.594368311478456e-06, "loss": 0.5996, "mean_token_accuracy": 0.8047491014003754, "step": 2073 }, { "epoch": 1.4119850187265919, "grad_norm": 1.7044315338134766, "learning_rate": 8.592802014520949e-06, "loss": 0.6233, "mean_token_accuracy": 0.7853774130344391, "step": 2074 }, { "epoch": 1.4126659856996935, "grad_norm": 1.707370638847351, "learning_rate": 8.591234988284468e-06, "loss": 0.7866, "mean_token_accuracy": 0.733195573091507, "step": 2075 }, { "epoch": 1.4133469526727953, "grad_norm": 1.7911770343780518, "learning_rate": 8.589667233087089e-06, "loss": 0.7446, "mean_token_accuracy": 0.7636502683162689, "step": 2076 }, { "epoch": 1.4140279196458971, "grad_norm": 1.8378678560256958, "learning_rate": 8.588098749247045e-06, "loss": 0.5643, "mean_token_accuracy": 0.8102468550205231, "step": 2077 }, { "epoch": 1.414708886618999, "grad_norm": 1.8809592723846436, "learning_rate": 8.58652953708271e-06, "loss": 0.6443, "mean_token_accuracy": 0.7726792693138123, "step": 2078 }, { "epoch": 1.4153898535921008, "grad_norm": 1.556628704071045, "learning_rate": 8.58495959691261e-06, "loss": 0.7421, "mean_token_accuracy": 0.74406498670578, "step": 2079 }, { "epoch": 1.4160708205652026, "grad_norm": 1.8158057928085327, "learning_rate": 8.583388929055414e-06, "loss": 0.7948, "mean_token_accuracy": 0.7390445470809937, "step": 2080 }, { "epoch": 1.4167517875383044, "grad_norm": 1.7369353771209717, "learning_rate": 8.581817533829941e-06, "loss": 0.734, "mean_token_accuracy": 0.7446632981300354, "step": 2081 }, { "epoch": 1.4174327545114063, "grad_norm": 1.8620017766952515, "learning_rate": 8.580245411555162e-06, "loss": 0.7042, "mean_token_accuracy": 0.7579741775989532, "step": 2082 }, { "epoch": 1.418113721484508, "grad_norm": 1.8635506629943848, "learning_rate": 8.578672562550188e-06, "loss": 0.7144, "mean_token_accuracy": 0.7594528496265411, "step": 2083 }, { "epoch": 1.4187946884576097, "grad_norm": 1.7525684833526611, "learning_rate": 8.577098987134283e-06, "loss": 0.6403, "mean_token_accuracy": 0.7830910682678223, "step": 2084 }, { "epoch": 1.4194756554307117, "grad_norm": 1.887157678604126, "learning_rate": 8.575524685626853e-06, "loss": 0.699, "mean_token_accuracy": 0.7792257070541382, "step": 2085 }, { "epoch": 1.4201566224038134, "grad_norm": 1.7702162265777588, "learning_rate": 8.57394965834746e-06, "loss": 0.6823, "mean_token_accuracy": 0.7700012028217316, "step": 2086 }, { "epoch": 1.4208375893769152, "grad_norm": 1.9809762239456177, "learning_rate": 8.572373905615806e-06, "loss": 0.7087, "mean_token_accuracy": 0.762517899274826, "step": 2087 }, { "epoch": 1.421518556350017, "grad_norm": 1.78669011592865, "learning_rate": 8.570797427751743e-06, "loss": 0.7317, "mean_token_accuracy": 0.7404779195785522, "step": 2088 }, { "epoch": 1.4221995233231188, "grad_norm": 1.9646856784820557, "learning_rate": 8.569220225075269e-06, "loss": 0.6034, "mean_token_accuracy": 0.7962596416473389, "step": 2089 }, { "epoch": 1.4228804902962207, "grad_norm": 1.7777684926986694, "learning_rate": 8.567642297906531e-06, "loss": 0.7009, "mean_token_accuracy": 0.7621127367019653, "step": 2090 }, { "epoch": 1.4235614572693225, "grad_norm": 1.801500916481018, "learning_rate": 8.56606364656582e-06, "loss": 0.8846, "mean_token_accuracy": 0.7209506034851074, "step": 2091 }, { "epoch": 1.4242424242424243, "grad_norm": 1.7446929216384888, "learning_rate": 8.564484271373578e-06, "loss": 0.7357, "mean_token_accuracy": 0.7544176876544952, "step": 2092 }, { "epoch": 1.424923391215526, "grad_norm": 1.7984509468078613, "learning_rate": 8.562904172650392e-06, "loss": 0.7788, "mean_token_accuracy": 0.746805727481842, "step": 2093 }, { "epoch": 1.425604358188628, "grad_norm": 1.8719604015350342, "learning_rate": 8.561323350716994e-06, "loss": 0.6525, "mean_token_accuracy": 0.7893910706043243, "step": 2094 }, { "epoch": 1.4262853251617296, "grad_norm": 1.7265715599060059, "learning_rate": 8.559741805894266e-06, "loss": 0.8613, "mean_token_accuracy": 0.7210221588611603, "step": 2095 }, { "epoch": 1.4269662921348314, "grad_norm": 1.7584172487258911, "learning_rate": 8.558159538503234e-06, "loss": 0.7244, "mean_token_accuracy": 0.7795511782169342, "step": 2096 }, { "epoch": 1.4276472591079332, "grad_norm": 1.670937418937683, "learning_rate": 8.556576548865073e-06, "loss": 0.6657, "mean_token_accuracy": 0.7812475860118866, "step": 2097 }, { "epoch": 1.428328226081035, "grad_norm": 1.9070731401443481, "learning_rate": 8.554992837301101e-06, "loss": 0.7641, "mean_token_accuracy": 0.7298775911331177, "step": 2098 }, { "epoch": 1.4290091930541369, "grad_norm": 1.6671972274780273, "learning_rate": 8.553408404132789e-06, "loss": 0.8156, "mean_token_accuracy": 0.7357039749622345, "step": 2099 }, { "epoch": 1.4296901600272387, "grad_norm": 1.7446328401565552, "learning_rate": 8.551823249681748e-06, "loss": 0.6115, "mean_token_accuracy": 0.7884605526924133, "step": 2100 }, { "epoch": 1.4303711270003405, "grad_norm": 1.842963457107544, "learning_rate": 8.55023737426974e-06, "loss": 0.6997, "mean_token_accuracy": 0.7396327555179596, "step": 2101 }, { "epoch": 1.4310520939734424, "grad_norm": 1.7304290533065796, "learning_rate": 8.548650778218667e-06, "loss": 0.6895, "mean_token_accuracy": 0.7729621827602386, "step": 2102 }, { "epoch": 1.4317330609465442, "grad_norm": 1.7331875562667847, "learning_rate": 8.547063461850588e-06, "loss": 0.7883, "mean_token_accuracy": 0.7444688975811005, "step": 2103 }, { "epoch": 1.4324140279196458, "grad_norm": 1.7665516138076782, "learning_rate": 8.545475425487697e-06, "loss": 0.8106, "mean_token_accuracy": 0.7283543944358826, "step": 2104 }, { "epoch": 1.4330949948927478, "grad_norm": 1.7071738243103027, "learning_rate": 8.543886669452339e-06, "loss": 0.7712, "mean_token_accuracy": 0.7586405277252197, "step": 2105 }, { "epoch": 1.4337759618658494, "grad_norm": 1.6815739870071411, "learning_rate": 8.542297194067008e-06, "loss": 0.758, "mean_token_accuracy": 0.7504871189594269, "step": 2106 }, { "epoch": 1.4344569288389513, "grad_norm": 1.8850075006484985, "learning_rate": 8.540706999654338e-06, "loss": 0.5995, "mean_token_accuracy": 0.8044551610946655, "step": 2107 }, { "epoch": 1.435137895812053, "grad_norm": 1.6928837299346924, "learning_rate": 8.539116086537114e-06, "loss": 0.8095, "mean_token_accuracy": 0.7365705072879791, "step": 2108 }, { "epoch": 1.435818862785155, "grad_norm": 1.8135095834732056, "learning_rate": 8.537524455038264e-06, "loss": 0.702, "mean_token_accuracy": 0.7458747923374176, "step": 2109 }, { "epoch": 1.4364998297582567, "grad_norm": 1.7343076467514038, "learning_rate": 8.535932105480864e-06, "loss": 0.682, "mean_token_accuracy": 0.774036169052124, "step": 2110 }, { "epoch": 1.4371807967313586, "grad_norm": 1.756227970123291, "learning_rate": 8.53433903818813e-06, "loss": 0.6359, "mean_token_accuracy": 0.773484081029892, "step": 2111 }, { "epoch": 1.4378617637044604, "grad_norm": 1.7451810836791992, "learning_rate": 8.532745253483434e-06, "loss": 0.7409, "mean_token_accuracy": 0.7620583772659302, "step": 2112 }, { "epoch": 1.4385427306775622, "grad_norm": 1.7277295589447021, "learning_rate": 8.531150751690284e-06, "loss": 0.778, "mean_token_accuracy": 0.7461383044719696, "step": 2113 }, { "epoch": 1.439223697650664, "grad_norm": 1.886805534362793, "learning_rate": 8.529555533132341e-06, "loss": 0.6323, "mean_token_accuracy": 0.7803187072277069, "step": 2114 }, { "epoch": 1.4399046646237657, "grad_norm": 2.0506374835968018, "learning_rate": 8.527959598133403e-06, "loss": 0.7603, "mean_token_accuracy": 0.7512661218643188, "step": 2115 }, { "epoch": 1.4405856315968675, "grad_norm": 1.6196858882904053, "learning_rate": 8.52636294701742e-06, "loss": 0.9148, "mean_token_accuracy": 0.7215964794158936, "step": 2116 }, { "epoch": 1.4412665985699693, "grad_norm": 1.7818580865859985, "learning_rate": 8.524765580108487e-06, "loss": 0.8237, "mean_token_accuracy": 0.7397749423980713, "step": 2117 }, { "epoch": 1.4419475655430711, "grad_norm": 1.8941627740859985, "learning_rate": 8.523167497730842e-06, "loss": 0.7111, "mean_token_accuracy": 0.7402827739715576, "step": 2118 }, { "epoch": 1.442628532516173, "grad_norm": 1.8644366264343262, "learning_rate": 8.521568700208868e-06, "loss": 0.6218, "mean_token_accuracy": 0.7854858636856079, "step": 2119 }, { "epoch": 1.4433094994892748, "grad_norm": 1.9443472623825073, "learning_rate": 8.519969187867098e-06, "loss": 0.7137, "mean_token_accuracy": 0.7609954178333282, "step": 2120 }, { "epoch": 1.4439904664623766, "grad_norm": 1.7649277448654175, "learning_rate": 8.518368961030201e-06, "loss": 0.7789, "mean_token_accuracy": 0.7434145510196686, "step": 2121 }, { "epoch": 1.4446714334354784, "grad_norm": 1.7433308362960815, "learning_rate": 8.516768020023e-06, "loss": 0.8358, "mean_token_accuracy": 0.7061368525028229, "step": 2122 }, { "epoch": 1.4453524004085803, "grad_norm": 1.6087785959243774, "learning_rate": 8.51516636517046e-06, "loss": 0.7964, "mean_token_accuracy": 0.7445855736732483, "step": 2123 }, { "epoch": 1.4460333673816819, "grad_norm": 1.6837581396102905, "learning_rate": 8.513563996797686e-06, "loss": 0.8088, "mean_token_accuracy": 0.7312234342098236, "step": 2124 }, { "epoch": 1.446714334354784, "grad_norm": 1.7260791063308716, "learning_rate": 8.511960915229936e-06, "loss": 0.6296, "mean_token_accuracy": 0.786022961139679, "step": 2125 }, { "epoch": 1.4473953013278855, "grad_norm": 1.7545000314712524, "learning_rate": 8.510357120792609e-06, "loss": 0.6663, "mean_token_accuracy": 0.7779738008975983, "step": 2126 }, { "epoch": 1.4480762683009873, "grad_norm": 1.699160099029541, "learning_rate": 8.508752613811245e-06, "loss": 0.7059, "mean_token_accuracy": 0.762269526720047, "step": 2127 }, { "epoch": 1.4487572352740892, "grad_norm": 1.7395869493484497, "learning_rate": 8.507147394611536e-06, "loss": 0.7117, "mean_token_accuracy": 0.7661511898040771, "step": 2128 }, { "epoch": 1.449438202247191, "grad_norm": 1.7986513376235962, "learning_rate": 8.505541463519313e-06, "loss": 0.7956, "mean_token_accuracy": 0.7196171581745148, "step": 2129 }, { "epoch": 1.4501191692202928, "grad_norm": 1.854256272315979, "learning_rate": 8.503934820860554e-06, "loss": 0.7566, "mean_token_accuracy": 0.752440333366394, "step": 2130 }, { "epoch": 1.4508001361933947, "grad_norm": 1.7324211597442627, "learning_rate": 8.502327466961378e-06, "loss": 0.7398, "mean_token_accuracy": 0.7384723126888275, "step": 2131 }, { "epoch": 1.4514811031664965, "grad_norm": 1.7356780767440796, "learning_rate": 8.500719402148055e-06, "loss": 0.76, "mean_token_accuracy": 0.7311006486415863, "step": 2132 }, { "epoch": 1.4521620701395983, "grad_norm": 1.6736754179000854, "learning_rate": 8.499110626746993e-06, "loss": 0.6767, "mean_token_accuracy": 0.7798322141170502, "step": 2133 }, { "epoch": 1.4528430371127001, "grad_norm": 1.7189851999282837, "learning_rate": 8.497501141084746e-06, "loss": 0.6939, "mean_token_accuracy": 0.7753101885318756, "step": 2134 }, { "epoch": 1.4535240040858017, "grad_norm": 1.4952164888381958, "learning_rate": 8.495890945488017e-06, "loss": 0.9388, "mean_token_accuracy": 0.7069167196750641, "step": 2135 }, { "epoch": 1.4542049710589036, "grad_norm": 1.708530068397522, "learning_rate": 8.494280040283643e-06, "loss": 0.7093, "mean_token_accuracy": 0.771348387002945, "step": 2136 }, { "epoch": 1.4548859380320054, "grad_norm": 1.8355077505111694, "learning_rate": 8.492668425798616e-06, "loss": 0.9131, "mean_token_accuracy": 0.6931357085704803, "step": 2137 }, { "epoch": 1.4555669050051072, "grad_norm": 1.7606531381607056, "learning_rate": 8.491056102360063e-06, "loss": 0.7683, "mean_token_accuracy": 0.7549967169761658, "step": 2138 }, { "epoch": 1.456247871978209, "grad_norm": 1.640960454940796, "learning_rate": 8.48944307029526e-06, "loss": 0.7178, "mean_token_accuracy": 0.7687535583972931, "step": 2139 }, { "epoch": 1.4569288389513109, "grad_norm": 1.8809359073638916, "learning_rate": 8.48782932993163e-06, "loss": 0.6334, "mean_token_accuracy": 0.7918348610401154, "step": 2140 }, { "epoch": 1.4576098059244127, "grad_norm": 1.5849809646606445, "learning_rate": 8.486214881596728e-06, "loss": 0.7711, "mean_token_accuracy": 0.7436593770980835, "step": 2141 }, { "epoch": 1.4582907728975145, "grad_norm": 1.8422356843948364, "learning_rate": 8.484599725618266e-06, "loss": 0.6489, "mean_token_accuracy": 0.7919272184371948, "step": 2142 }, { "epoch": 1.4589717398706163, "grad_norm": 1.6687662601470947, "learning_rate": 8.482983862324091e-06, "loss": 0.7751, "mean_token_accuracy": 0.7549503445625305, "step": 2143 }, { "epoch": 1.459652706843718, "grad_norm": 1.9573934078216553, "learning_rate": 8.481367292042196e-06, "loss": 0.7339, "mean_token_accuracy": 0.7228973507881165, "step": 2144 }, { "epoch": 1.46033367381682, "grad_norm": 1.6691193580627441, "learning_rate": 8.479750015100718e-06, "loss": 0.8921, "mean_token_accuracy": 0.7193824946880341, "step": 2145 }, { "epoch": 1.4610146407899216, "grad_norm": 1.6814053058624268, "learning_rate": 8.47813203182794e-06, "loss": 0.6745, "mean_token_accuracy": 0.7849554121494293, "step": 2146 }, { "epoch": 1.4616956077630234, "grad_norm": 1.945192575454712, "learning_rate": 8.476513342552282e-06, "loss": 0.7649, "mean_token_accuracy": 0.7570013403892517, "step": 2147 }, { "epoch": 1.4623765747361253, "grad_norm": 1.827139139175415, "learning_rate": 8.474893947602313e-06, "loss": 0.6804, "mean_token_accuracy": 0.7750130593776703, "step": 2148 }, { "epoch": 1.463057541709227, "grad_norm": 1.6467219591140747, "learning_rate": 8.473273847306742e-06, "loss": 0.7953, "mean_token_accuracy": 0.7303609549999237, "step": 2149 }, { "epoch": 1.463738508682329, "grad_norm": 1.6995737552642822, "learning_rate": 8.471653041994425e-06, "loss": 0.6965, "mean_token_accuracy": 0.7708194255828857, "step": 2150 }, { "epoch": 1.4644194756554307, "grad_norm": 1.659906029701233, "learning_rate": 8.470031531994355e-06, "loss": 0.8548, "mean_token_accuracy": 0.7287424504756927, "step": 2151 }, { "epoch": 1.4651004426285326, "grad_norm": 1.8688666820526123, "learning_rate": 8.468409317635674e-06, "loss": 0.6956, "mean_token_accuracy": 0.7596157491207123, "step": 2152 }, { "epoch": 1.4657814096016344, "grad_norm": 1.64429771900177, "learning_rate": 8.466786399247663e-06, "loss": 0.6944, "mean_token_accuracy": 0.7610139846801758, "step": 2153 }, { "epoch": 1.4664623765747362, "grad_norm": 1.6871297359466553, "learning_rate": 8.46516277715975e-06, "loss": 0.7919, "mean_token_accuracy": 0.7314015030860901, "step": 2154 }, { "epoch": 1.4671433435478378, "grad_norm": 1.811564326286316, "learning_rate": 8.4635384517015e-06, "loss": 0.595, "mean_token_accuracy": 0.7991557419300079, "step": 2155 }, { "epoch": 1.4678243105209399, "grad_norm": 1.792759895324707, "learning_rate": 8.461913423202626e-06, "loss": 0.761, "mean_token_accuracy": 0.7581494748592377, "step": 2156 }, { "epoch": 1.4685052774940415, "grad_norm": 1.6965900659561157, "learning_rate": 8.46028769199298e-06, "loss": 0.7537, "mean_token_accuracy": 0.7523248791694641, "step": 2157 }, { "epoch": 1.4691862444671433, "grad_norm": 1.726775884628296, "learning_rate": 8.45866125840256e-06, "loss": 0.8562, "mean_token_accuracy": 0.7157973647117615, "step": 2158 }, { "epoch": 1.4698672114402451, "grad_norm": 1.658469557762146, "learning_rate": 8.457034122761505e-06, "loss": 0.7176, "mean_token_accuracy": 0.7510478794574738, "step": 2159 }, { "epoch": 1.470548178413347, "grad_norm": 1.7506623268127441, "learning_rate": 8.455406285400096e-06, "loss": 0.7343, "mean_token_accuracy": 0.7568727135658264, "step": 2160 }, { "epoch": 1.4712291453864488, "grad_norm": 1.7347825765609741, "learning_rate": 8.453777746648754e-06, "loss": 0.7677, "mean_token_accuracy": 0.7445345520973206, "step": 2161 }, { "epoch": 1.4719101123595506, "grad_norm": 1.7542445659637451, "learning_rate": 8.452148506838052e-06, "loss": 0.6521, "mean_token_accuracy": 0.783242255449295, "step": 2162 }, { "epoch": 1.4725910793326524, "grad_norm": 1.9593628644943237, "learning_rate": 8.45051856629869e-06, "loss": 0.717, "mean_token_accuracy": 0.7615822851657867, "step": 2163 }, { "epoch": 1.473272046305754, "grad_norm": 1.7731927633285522, "learning_rate": 8.448887925361527e-06, "loss": 0.7097, "mean_token_accuracy": 0.7585003674030304, "step": 2164 }, { "epoch": 1.473953013278856, "grad_norm": 1.8347268104553223, "learning_rate": 8.44725658435755e-06, "loss": 0.6724, "mean_token_accuracy": 0.7743549048900604, "step": 2165 }, { "epoch": 1.4746339802519577, "grad_norm": 1.7347285747528076, "learning_rate": 8.445624543617897e-06, "loss": 0.7673, "mean_token_accuracy": 0.7556937038898468, "step": 2166 }, { "epoch": 1.4753149472250595, "grad_norm": 1.7192436456680298, "learning_rate": 8.443991803473845e-06, "loss": 0.8035, "mean_token_accuracy": 0.7458344101905823, "step": 2167 }, { "epoch": 1.4759959141981613, "grad_norm": 2.034714698791504, "learning_rate": 8.442358364256811e-06, "loss": 0.6563, "mean_token_accuracy": 0.7478813529014587, "step": 2168 }, { "epoch": 1.4766768811712632, "grad_norm": 1.9738928079605103, "learning_rate": 8.440724226298358e-06, "loss": 0.7341, "mean_token_accuracy": 0.7414292693138123, "step": 2169 }, { "epoch": 1.477357848144365, "grad_norm": 1.8754783868789673, "learning_rate": 8.439089389930188e-06, "loss": 0.683, "mean_token_accuracy": 0.772093802690506, "step": 2170 }, { "epoch": 1.4780388151174668, "grad_norm": 1.7309985160827637, "learning_rate": 8.437453855484147e-06, "loss": 0.7662, "mean_token_accuracy": 0.7465266287326813, "step": 2171 }, { "epoch": 1.4787197820905686, "grad_norm": 1.7459160089492798, "learning_rate": 8.43581762329222e-06, "loss": 0.7298, "mean_token_accuracy": 0.7491482496261597, "step": 2172 }, { "epoch": 1.4794007490636705, "grad_norm": 1.6320127248764038, "learning_rate": 8.434180693686533e-06, "loss": 0.8045, "mean_token_accuracy": 0.7280436456203461, "step": 2173 }, { "epoch": 1.4800817160367723, "grad_norm": 1.9391459226608276, "learning_rate": 8.43254306699936e-06, "loss": 0.6516, "mean_token_accuracy": 0.7850238382816315, "step": 2174 }, { "epoch": 1.480762683009874, "grad_norm": 1.741099238395691, "learning_rate": 8.430904743563106e-06, "loss": 0.5626, "mean_token_accuracy": 0.8017044067382812, "step": 2175 }, { "epoch": 1.481443649982976, "grad_norm": 1.8042328357696533, "learning_rate": 8.429265723710329e-06, "loss": 0.8162, "mean_token_accuracy": 0.7432850003242493, "step": 2176 }, { "epoch": 1.4821246169560776, "grad_norm": 1.6079285144805908, "learning_rate": 8.42762600777372e-06, "loss": 0.8327, "mean_token_accuracy": 0.7309678494930267, "step": 2177 }, { "epoch": 1.4828055839291794, "grad_norm": 1.8898696899414062, "learning_rate": 8.425985596086112e-06, "loss": 0.7008, "mean_token_accuracy": 0.7556427419185638, "step": 2178 }, { "epoch": 1.4834865509022812, "grad_norm": 1.817289113998413, "learning_rate": 8.424344488980486e-06, "loss": 0.7472, "mean_token_accuracy": 0.7579267024993896, "step": 2179 }, { "epoch": 1.484167517875383, "grad_norm": 1.7519279718399048, "learning_rate": 8.422702686789957e-06, "loss": 0.7811, "mean_token_accuracy": 0.7225235104560852, "step": 2180 }, { "epoch": 1.4848484848484849, "grad_norm": 1.822940707206726, "learning_rate": 8.421060189847783e-06, "loss": 0.637, "mean_token_accuracy": 0.7943740487098694, "step": 2181 }, { "epoch": 1.4855294518215867, "grad_norm": 1.554886817932129, "learning_rate": 8.419416998487364e-06, "loss": 0.9083, "mean_token_accuracy": 0.7150353491306305, "step": 2182 }, { "epoch": 1.4862104187946885, "grad_norm": 1.6257176399230957, "learning_rate": 8.417773113042241e-06, "loss": 0.7507, "mean_token_accuracy": 0.7422707974910736, "step": 2183 }, { "epoch": 1.4868913857677903, "grad_norm": 1.5645148754119873, "learning_rate": 8.416128533846095e-06, "loss": 0.8109, "mean_token_accuracy": 0.7208797931671143, "step": 2184 }, { "epoch": 1.4875723527408922, "grad_norm": 1.8479151725769043, "learning_rate": 8.414483261232748e-06, "loss": 0.6402, "mean_token_accuracy": 0.7854574918746948, "step": 2185 }, { "epoch": 1.4882533197139938, "grad_norm": 1.6347057819366455, "learning_rate": 8.412837295536163e-06, "loss": 0.8504, "mean_token_accuracy": 0.7333767116069794, "step": 2186 }, { "epoch": 1.4889342866870956, "grad_norm": 1.9066247940063477, "learning_rate": 8.411190637090443e-06, "loss": 0.675, "mean_token_accuracy": 0.7869012355804443, "step": 2187 }, { "epoch": 1.4896152536601974, "grad_norm": 1.506258487701416, "learning_rate": 8.409543286229835e-06, "loss": 0.8377, "mean_token_accuracy": 0.7288956046104431, "step": 2188 }, { "epoch": 1.4902962206332993, "grad_norm": 1.5926228761672974, "learning_rate": 8.407895243288719e-06, "loss": 0.8714, "mean_token_accuracy": 0.7138026356697083, "step": 2189 }, { "epoch": 1.490977187606401, "grad_norm": 1.6725025177001953, "learning_rate": 8.406246508601624e-06, "loss": 0.7122, "mean_token_accuracy": 0.7376072704792023, "step": 2190 }, { "epoch": 1.491658154579503, "grad_norm": 1.8163589239120483, "learning_rate": 8.404597082503216e-06, "loss": 0.74, "mean_token_accuracy": 0.7468857169151306, "step": 2191 }, { "epoch": 1.4923391215526047, "grad_norm": 1.7711644172668457, "learning_rate": 8.402946965328298e-06, "loss": 0.7683, "mean_token_accuracy": 0.7468812167644501, "step": 2192 }, { "epoch": 1.4930200885257066, "grad_norm": 1.609979510307312, "learning_rate": 8.40129615741182e-06, "loss": 0.7899, "mean_token_accuracy": 0.7409053146839142, "step": 2193 }, { "epoch": 1.4937010554988084, "grad_norm": 1.79615318775177, "learning_rate": 8.399644659088864e-06, "loss": 0.7243, "mean_token_accuracy": 0.7709987163543701, "step": 2194 }, { "epoch": 1.49438202247191, "grad_norm": 1.8636842966079712, "learning_rate": 8.39799247069466e-06, "loss": 0.7104, "mean_token_accuracy": 0.7762072384357452, "step": 2195 }, { "epoch": 1.495062989445012, "grad_norm": 1.8655732870101929, "learning_rate": 8.396339592564575e-06, "loss": 0.7472, "mean_token_accuracy": 0.7578190863132477, "step": 2196 }, { "epoch": 1.4957439564181136, "grad_norm": 1.6914176940917969, "learning_rate": 8.394686025034114e-06, "loss": 0.8271, "mean_token_accuracy": 0.7245065569877625, "step": 2197 }, { "epoch": 1.4964249233912155, "grad_norm": 1.694422960281372, "learning_rate": 8.393031768438924e-06, "loss": 0.8408, "mean_token_accuracy": 0.7307563126087189, "step": 2198 }, { "epoch": 1.4971058903643173, "grad_norm": 1.8326208591461182, "learning_rate": 8.391376823114792e-06, "loss": 0.7331, "mean_token_accuracy": 0.7584218978881836, "step": 2199 }, { "epoch": 1.4977868573374191, "grad_norm": 1.9103803634643555, "learning_rate": 8.389721189397646e-06, "loss": 0.8016, "mean_token_accuracy": 0.7445414662361145, "step": 2200 }, { "epoch": 1.498467824310521, "grad_norm": 1.8601614236831665, "learning_rate": 8.388064867623547e-06, "loss": 0.7438, "mean_token_accuracy": 0.7518108785152435, "step": 2201 }, { "epoch": 1.4991487912836228, "grad_norm": 1.656261920928955, "learning_rate": 8.386407858128707e-06, "loss": 0.6895, "mean_token_accuracy": 0.7601664960384369, "step": 2202 }, { "epoch": 1.4998297582567246, "grad_norm": 1.769494652748108, "learning_rate": 8.384750161249467e-06, "loss": 0.5811, "mean_token_accuracy": 0.8120369017124176, "step": 2203 }, { "epoch": 1.5005107252298262, "grad_norm": 1.7661151885986328, "learning_rate": 8.383091777322312e-06, "loss": 0.7442, "mean_token_accuracy": 0.7470752596855164, "step": 2204 }, { "epoch": 1.5011916922029283, "grad_norm": 1.7823621034622192, "learning_rate": 8.381432706683869e-06, "loss": 0.6262, "mean_token_accuracy": 0.7857277691364288, "step": 2205 }, { "epoch": 1.5018726591760299, "grad_norm": 1.7732332944869995, "learning_rate": 8.3797729496709e-06, "loss": 0.7426, "mean_token_accuracy": 0.7598006725311279, "step": 2206 }, { "epoch": 1.502553626149132, "grad_norm": 1.6123323440551758, "learning_rate": 8.378112506620309e-06, "loss": 0.9373, "mean_token_accuracy": 0.7138825953006744, "step": 2207 }, { "epoch": 1.5032345931222335, "grad_norm": 1.8485331535339355, "learning_rate": 8.376451377869136e-06, "loss": 0.6853, "mean_token_accuracy": 0.7679067254066467, "step": 2208 }, { "epoch": 1.5039155600953353, "grad_norm": 1.7564663887023926, "learning_rate": 8.374789563754563e-06, "loss": 0.7009, "mean_token_accuracy": 0.7597989141941071, "step": 2209 }, { "epoch": 1.5045965270684372, "grad_norm": 1.860978126525879, "learning_rate": 8.373127064613915e-06, "loss": 0.6546, "mean_token_accuracy": 0.783953994512558, "step": 2210 }, { "epoch": 1.505277494041539, "grad_norm": 1.7324405908584595, "learning_rate": 8.371463880784647e-06, "loss": 0.8033, "mean_token_accuracy": 0.7287924885749817, "step": 2211 }, { "epoch": 1.5059584610146408, "grad_norm": 1.7677316665649414, "learning_rate": 8.369800012604358e-06, "loss": 0.7504, "mean_token_accuracy": 0.7565902173519135, "step": 2212 }, { "epoch": 1.5066394279877426, "grad_norm": 1.6955368518829346, "learning_rate": 8.368135460410787e-06, "loss": 0.798, "mean_token_accuracy": 0.7573666870594025, "step": 2213 }, { "epoch": 1.5073203949608445, "grad_norm": 1.7328943014144897, "learning_rate": 8.366470224541809e-06, "loss": 0.7217, "mean_token_accuracy": 0.7661310732364655, "step": 2214 }, { "epoch": 1.508001361933946, "grad_norm": 1.8387324810028076, "learning_rate": 8.364804305335441e-06, "loss": 0.652, "mean_token_accuracy": 0.7832226157188416, "step": 2215 }, { "epoch": 1.5086823289070481, "grad_norm": 1.8789352178573608, "learning_rate": 8.363137703129835e-06, "loss": 0.7159, "mean_token_accuracy": 0.7683544158935547, "step": 2216 }, { "epoch": 1.5093632958801497, "grad_norm": 1.75888192653656, "learning_rate": 8.361470418263284e-06, "loss": 0.7653, "mean_token_accuracy": 0.7418259084224701, "step": 2217 }, { "epoch": 1.5100442628532518, "grad_norm": 1.6710660457611084, "learning_rate": 8.35980245107422e-06, "loss": 0.735, "mean_token_accuracy": 0.7551637589931488, "step": 2218 }, { "epoch": 1.5107252298263534, "grad_norm": 1.6681649684906006, "learning_rate": 8.358133801901211e-06, "loss": 0.7783, "mean_token_accuracy": 0.7433710992336273, "step": 2219 }, { "epoch": 1.5114061967994552, "grad_norm": 1.7678825855255127, "learning_rate": 8.356464471082968e-06, "loss": 0.8725, "mean_token_accuracy": 0.7231588959693909, "step": 2220 }, { "epoch": 1.512087163772557, "grad_norm": 1.8784730434417725, "learning_rate": 8.354794458958333e-06, "loss": 0.7226, "mean_token_accuracy": 0.7513501048088074, "step": 2221 }, { "epoch": 1.5127681307456589, "grad_norm": 1.8654206991195679, "learning_rate": 8.353123765866294e-06, "loss": 0.6436, "mean_token_accuracy": 0.7927818298339844, "step": 2222 }, { "epoch": 1.5134490977187607, "grad_norm": 1.6603683233261108, "learning_rate": 8.351452392145973e-06, "loss": 0.7871, "mean_token_accuracy": 0.7517729997634888, "step": 2223 }, { "epoch": 1.5141300646918623, "grad_norm": 1.8200299739837646, "learning_rate": 8.34978033813663e-06, "loss": 0.7106, "mean_token_accuracy": 0.766595184803009, "step": 2224 }, { "epoch": 1.5148110316649643, "grad_norm": 1.668992042541504, "learning_rate": 8.348107604177664e-06, "loss": 0.7929, "mean_token_accuracy": 0.7067726254463196, "step": 2225 }, { "epoch": 1.515491998638066, "grad_norm": 1.8981492519378662, "learning_rate": 8.346434190608615e-06, "loss": 0.7314, "mean_token_accuracy": 0.7538259625434875, "step": 2226 }, { "epoch": 1.516172965611168, "grad_norm": 1.8472440242767334, "learning_rate": 8.344760097769156e-06, "loss": 0.6389, "mean_token_accuracy": 0.7792957425117493, "step": 2227 }, { "epoch": 1.5168539325842696, "grad_norm": 1.8182529211044312, "learning_rate": 8.343085325999099e-06, "loss": 0.6196, "mean_token_accuracy": 0.7954197525978088, "step": 2228 }, { "epoch": 1.5175348995573714, "grad_norm": 1.8503931760787964, "learning_rate": 8.341409875638396e-06, "loss": 0.7183, "mean_token_accuracy": 0.7734889686107635, "step": 2229 }, { "epoch": 1.5182158665304732, "grad_norm": 1.8751201629638672, "learning_rate": 8.339733747027136e-06, "loss": 0.7006, "mean_token_accuracy": 0.7721170783042908, "step": 2230 }, { "epoch": 1.518896833503575, "grad_norm": 1.750134825706482, "learning_rate": 8.338056940505544e-06, "loss": 0.7632, "mean_token_accuracy": 0.7385984659194946, "step": 2231 }, { "epoch": 1.519577800476677, "grad_norm": 1.540748119354248, "learning_rate": 8.336379456413986e-06, "loss": 0.7454, "mean_token_accuracy": 0.75299271941185, "step": 2232 }, { "epoch": 1.5202587674497787, "grad_norm": 1.8689874410629272, "learning_rate": 8.33470129509296e-06, "loss": 0.6884, "mean_token_accuracy": 0.7627294659614563, "step": 2233 }, { "epoch": 1.5209397344228806, "grad_norm": 1.6611818075180054, "learning_rate": 8.333022456883107e-06, "loss": 0.7701, "mean_token_accuracy": 0.740577757358551, "step": 2234 }, { "epoch": 1.5216207013959822, "grad_norm": 1.6297810077667236, "learning_rate": 8.331342942125202e-06, "loss": 0.7244, "mean_token_accuracy": 0.7489356696605682, "step": 2235 }, { "epoch": 1.5223016683690842, "grad_norm": 1.827042579650879, "learning_rate": 8.32966275116016e-06, "loss": 0.729, "mean_token_accuracy": 0.7526697516441345, "step": 2236 }, { "epoch": 1.5229826353421858, "grad_norm": 1.5906692743301392, "learning_rate": 8.327981884329031e-06, "loss": 0.8596, "mean_token_accuracy": 0.7265205979347229, "step": 2237 }, { "epoch": 1.5236636023152879, "grad_norm": 1.852982997894287, "learning_rate": 8.326300341973003e-06, "loss": 0.7494, "mean_token_accuracy": 0.7626123428344727, "step": 2238 }, { "epoch": 1.5243445692883895, "grad_norm": 1.9390043020248413, "learning_rate": 8.324618124433401e-06, "loss": 0.6787, "mean_token_accuracy": 0.7812559306621552, "step": 2239 }, { "epoch": 1.5250255362614913, "grad_norm": 1.7607860565185547, "learning_rate": 8.322935232051686e-06, "loss": 0.7214, "mean_token_accuracy": 0.7604998648166656, "step": 2240 }, { "epoch": 1.5257065032345931, "grad_norm": 1.522955298423767, "learning_rate": 8.321251665169461e-06, "loss": 0.7856, "mean_token_accuracy": 0.7436890006065369, "step": 2241 }, { "epoch": 1.526387470207695, "grad_norm": 1.634684443473816, "learning_rate": 8.319567424128459e-06, "loss": 0.7806, "mean_token_accuracy": 0.7544761002063751, "step": 2242 }, { "epoch": 1.5270684371807968, "grad_norm": 1.8202438354492188, "learning_rate": 8.317882509270551e-06, "loss": 0.7054, "mean_token_accuracy": 0.7738664448261261, "step": 2243 }, { "epoch": 1.5277494041538984, "grad_norm": 1.5945740938186646, "learning_rate": 8.31619692093775e-06, "loss": 0.7267, "mean_token_accuracy": 0.7664953470230103, "step": 2244 }, { "epoch": 1.5284303711270004, "grad_norm": 1.7989070415496826, "learning_rate": 8.314510659472201e-06, "loss": 0.6193, "mean_token_accuracy": 0.7820746600627899, "step": 2245 }, { "epoch": 1.529111338100102, "grad_norm": 1.7055144309997559, "learning_rate": 8.312823725216188e-06, "loss": 0.8168, "mean_token_accuracy": 0.7380210757255554, "step": 2246 }, { "epoch": 1.529792305073204, "grad_norm": 1.635902762413025, "learning_rate": 8.311136118512131e-06, "loss": 0.8736, "mean_token_accuracy": 0.7256532609462738, "step": 2247 }, { "epoch": 1.5304732720463057, "grad_norm": 1.6573154926300049, "learning_rate": 8.309447839702583e-06, "loss": 0.8484, "mean_token_accuracy": 0.7228921353816986, "step": 2248 }, { "epoch": 1.5311542390194075, "grad_norm": 1.6255394220352173, "learning_rate": 8.30775888913024e-06, "loss": 0.7462, "mean_token_accuracy": 0.7468843162059784, "step": 2249 }, { "epoch": 1.5318352059925093, "grad_norm": 1.9844809770584106, "learning_rate": 8.306069267137927e-06, "loss": 0.6739, "mean_token_accuracy": 0.7835918068885803, "step": 2250 }, { "epoch": 1.5325161729656112, "grad_norm": 1.7412755489349365, "learning_rate": 8.304378974068612e-06, "loss": 0.6207, "mean_token_accuracy": 0.8028515875339508, "step": 2251 }, { "epoch": 1.533197139938713, "grad_norm": 1.8579139709472656, "learning_rate": 8.302688010265397e-06, "loss": 0.67, "mean_token_accuracy": 0.7857018113136292, "step": 2252 }, { "epoch": 1.5338781069118148, "grad_norm": 1.961970329284668, "learning_rate": 8.300996376071517e-06, "loss": 0.5814, "mean_token_accuracy": 0.8072145581245422, "step": 2253 }, { "epoch": 1.5345590738849166, "grad_norm": 1.658990740776062, "learning_rate": 8.299304071830347e-06, "loss": 0.8135, "mean_token_accuracy": 0.7188812792301178, "step": 2254 }, { "epoch": 1.5352400408580182, "grad_norm": 1.6610716581344604, "learning_rate": 8.2976110978854e-06, "loss": 0.7601, "mean_token_accuracy": 0.7506247162818909, "step": 2255 }, { "epoch": 1.5359210078311203, "grad_norm": 1.6155744791030884, "learning_rate": 8.295917454580312e-06, "loss": 0.8348, "mean_token_accuracy": 0.7211669981479645, "step": 2256 }, { "epoch": 1.536601974804222, "grad_norm": 1.9351743459701538, "learning_rate": 8.294223142258874e-06, "loss": 0.7118, "mean_token_accuracy": 0.7735992670059204, "step": 2257 }, { "epoch": 1.537282941777324, "grad_norm": 1.8249211311340332, "learning_rate": 8.292528161264997e-06, "loss": 0.7098, "mean_token_accuracy": 0.759293794631958, "step": 2258 }, { "epoch": 1.5379639087504255, "grad_norm": 1.7551145553588867, "learning_rate": 8.290832511942739e-06, "loss": 0.6779, "mean_token_accuracy": 0.7753246426582336, "step": 2259 }, { "epoch": 1.5386448757235274, "grad_norm": 1.6152162551879883, "learning_rate": 8.289136194636284e-06, "loss": 0.7918, "mean_token_accuracy": 0.7306999862194061, "step": 2260 }, { "epoch": 1.5393258426966292, "grad_norm": 1.8665447235107422, "learning_rate": 8.287439209689958e-06, "loss": 0.6503, "mean_token_accuracy": 0.7935374081134796, "step": 2261 }, { "epoch": 1.540006809669731, "grad_norm": 1.8835124969482422, "learning_rate": 8.285741557448222e-06, "loss": 0.7219, "mean_token_accuracy": 0.7354235649108887, "step": 2262 }, { "epoch": 1.5406877766428329, "grad_norm": 1.8242502212524414, "learning_rate": 8.284043238255669e-06, "loss": 0.6595, "mean_token_accuracy": 0.792201429605484, "step": 2263 }, { "epoch": 1.5413687436159347, "grad_norm": 1.8348064422607422, "learning_rate": 8.28234425245703e-06, "loss": 0.7441, "mean_token_accuracy": 0.7591201066970825, "step": 2264 }, { "epoch": 1.5420497105890365, "grad_norm": 1.698111891746521, "learning_rate": 8.280644600397172e-06, "loss": 0.7608, "mean_token_accuracy": 0.7477371096611023, "step": 2265 }, { "epoch": 1.542730677562138, "grad_norm": 1.4736472368240356, "learning_rate": 8.278944282421095e-06, "loss": 0.9163, "mean_token_accuracy": 0.7084740400314331, "step": 2266 }, { "epoch": 1.5434116445352402, "grad_norm": 1.7640451192855835, "learning_rate": 8.277243298873936e-06, "loss": 0.7876, "mean_token_accuracy": 0.7511483430862427, "step": 2267 }, { "epoch": 1.5440926115083418, "grad_norm": 1.7349106073379517, "learning_rate": 8.275541650100966e-06, "loss": 0.6198, "mean_token_accuracy": 0.7900622189044952, "step": 2268 }, { "epoch": 1.5447735784814438, "grad_norm": 1.9823329448699951, "learning_rate": 8.27383933644759e-06, "loss": 0.7346, "mean_token_accuracy": 0.7649463415145874, "step": 2269 }, { "epoch": 1.5454545454545454, "grad_norm": 1.6590690612792969, "learning_rate": 8.272136358259351e-06, "loss": 0.7359, "mean_token_accuracy": 0.7525522708892822, "step": 2270 }, { "epoch": 1.5461355124276472, "grad_norm": 1.8032909631729126, "learning_rate": 8.270432715881925e-06, "loss": 0.6323, "mean_token_accuracy": 0.7930571138858795, "step": 2271 }, { "epoch": 1.546816479400749, "grad_norm": 1.7393696308135986, "learning_rate": 8.268728409661123e-06, "loss": 0.7284, "mean_token_accuracy": 0.7668803632259369, "step": 2272 }, { "epoch": 1.547497446373851, "grad_norm": 1.8103909492492676, "learning_rate": 8.26702343994289e-06, "loss": 0.713, "mean_token_accuracy": 0.7696127891540527, "step": 2273 }, { "epoch": 1.5481784133469527, "grad_norm": 1.7208921909332275, "learning_rate": 8.265317807073308e-06, "loss": 0.7639, "mean_token_accuracy": 0.7307837605476379, "step": 2274 }, { "epoch": 1.5488593803200543, "grad_norm": 1.9442247152328491, "learning_rate": 8.26361151139859e-06, "loss": 0.7221, "mean_token_accuracy": 0.772310882806778, "step": 2275 }, { "epoch": 1.5495403472931564, "grad_norm": 1.6945854425430298, "learning_rate": 8.261904553265088e-06, "loss": 0.7742, "mean_token_accuracy": 0.736450731754303, "step": 2276 }, { "epoch": 1.550221314266258, "grad_norm": 1.906537413597107, "learning_rate": 8.260196933019285e-06, "loss": 0.7449, "mean_token_accuracy": 0.7541445791721344, "step": 2277 }, { "epoch": 1.55090228123936, "grad_norm": 1.7629040479660034, "learning_rate": 8.258488651007796e-06, "loss": 0.6691, "mean_token_accuracy": 0.7798046171665192, "step": 2278 }, { "epoch": 1.5515832482124616, "grad_norm": 1.7445306777954102, "learning_rate": 8.25677970757738e-06, "loss": 0.7339, "mean_token_accuracy": 0.7540619671344757, "step": 2279 }, { "epoch": 1.5522642151855635, "grad_norm": 1.6822863817214966, "learning_rate": 8.25507010307492e-06, "loss": 0.7974, "mean_token_accuracy": 0.7287287414073944, "step": 2280 }, { "epoch": 1.5529451821586653, "grad_norm": 1.7629002332687378, "learning_rate": 8.253359837847438e-06, "loss": 0.6322, "mean_token_accuracy": 0.7934295535087585, "step": 2281 }, { "epoch": 1.553626149131767, "grad_norm": 1.7784773111343384, "learning_rate": 8.251648912242092e-06, "loss": 0.7089, "mean_token_accuracy": 0.778150737285614, "step": 2282 }, { "epoch": 1.554307116104869, "grad_norm": 1.7926242351531982, "learning_rate": 8.249937326606167e-06, "loss": 0.691, "mean_token_accuracy": 0.77667036652565, "step": 2283 }, { "epoch": 1.5549880830779708, "grad_norm": 1.6726614236831665, "learning_rate": 8.248225081287089e-06, "loss": 0.884, "mean_token_accuracy": 0.7200429737567902, "step": 2284 }, { "epoch": 1.5556690500510726, "grad_norm": 1.9913660287857056, "learning_rate": 8.246512176632415e-06, "loss": 0.5851, "mean_token_accuracy": 0.8079727590084076, "step": 2285 }, { "epoch": 1.5563500170241742, "grad_norm": 1.7475476264953613, "learning_rate": 8.244798612989837e-06, "loss": 0.8259, "mean_token_accuracy": 0.7332585453987122, "step": 2286 }, { "epoch": 1.5570309839972762, "grad_norm": 1.8875137567520142, "learning_rate": 8.243084390707176e-06, "loss": 0.7263, "mean_token_accuracy": 0.7703423798084259, "step": 2287 }, { "epoch": 1.5577119509703778, "grad_norm": 1.9003608226776123, "learning_rate": 8.241369510132395e-06, "loss": 0.655, "mean_token_accuracy": 0.7791987061500549, "step": 2288 }, { "epoch": 1.55839291794348, "grad_norm": 2.0433385372161865, "learning_rate": 8.239653971613584e-06, "loss": 0.6946, "mean_token_accuracy": 0.781920313835144, "step": 2289 }, { "epoch": 1.5590738849165815, "grad_norm": 1.6927937269210815, "learning_rate": 8.23793777549897e-06, "loss": 0.7742, "mean_token_accuracy": 0.7562968134880066, "step": 2290 }, { "epoch": 1.5597548518896833, "grad_norm": 1.7070441246032715, "learning_rate": 8.236220922136911e-06, "loss": 0.8495, "mean_token_accuracy": 0.7135491073131561, "step": 2291 }, { "epoch": 1.5604358188627852, "grad_norm": 1.7835160493850708, "learning_rate": 8.2345034118759e-06, "loss": 0.7506, "mean_token_accuracy": 0.7440353631973267, "step": 2292 }, { "epoch": 1.561116785835887, "grad_norm": 1.8022342920303345, "learning_rate": 8.232785245064564e-06, "loss": 0.758, "mean_token_accuracy": 0.7560567557811737, "step": 2293 }, { "epoch": 1.5617977528089888, "grad_norm": 1.7458531856536865, "learning_rate": 8.23106642205166e-06, "loss": 0.7643, "mean_token_accuracy": 0.7464056313037872, "step": 2294 }, { "epoch": 1.5624787197820904, "grad_norm": 1.8909989595413208, "learning_rate": 8.229346943186083e-06, "loss": 0.7263, "mean_token_accuracy": 0.7662869393825531, "step": 2295 }, { "epoch": 1.5631596867551925, "grad_norm": 1.845470905303955, "learning_rate": 8.227626808816857e-06, "loss": 0.6664, "mean_token_accuracy": 0.7758574187755585, "step": 2296 }, { "epoch": 1.563840653728294, "grad_norm": 1.5778120756149292, "learning_rate": 8.225906019293142e-06, "loss": 0.7119, "mean_token_accuracy": 0.7611395120620728, "step": 2297 }, { "epoch": 1.564521620701396, "grad_norm": 1.9137423038482666, "learning_rate": 8.224184574964226e-06, "loss": 0.7137, "mean_token_accuracy": 0.7662442922592163, "step": 2298 }, { "epoch": 1.5652025876744977, "grad_norm": 1.896957516670227, "learning_rate": 8.222462476179538e-06, "loss": 0.7204, "mean_token_accuracy": 0.7630518972873688, "step": 2299 }, { "epoch": 1.5658835546475995, "grad_norm": 1.7231659889221191, "learning_rate": 8.220739723288634e-06, "loss": 0.7817, "mean_token_accuracy": 0.7688569128513336, "step": 2300 }, { "epoch": 1.5665645216207014, "grad_norm": 1.8730698823928833, "learning_rate": 8.219016316641205e-06, "loss": 0.6827, "mean_token_accuracy": 0.7815950214862823, "step": 2301 }, { "epoch": 1.5672454885938032, "grad_norm": 1.6514015197753906, "learning_rate": 8.217292256587067e-06, "loss": 0.7893, "mean_token_accuracy": 0.7427369654178619, "step": 2302 }, { "epoch": 1.567926455566905, "grad_norm": 1.7668280601501465, "learning_rate": 8.215567543476186e-06, "loss": 0.5874, "mean_token_accuracy": 0.8021031618118286, "step": 2303 }, { "epoch": 1.5686074225400068, "grad_norm": 1.8461464643478394, "learning_rate": 8.213842177658644e-06, "loss": 0.678, "mean_token_accuracy": 0.7778797149658203, "step": 2304 }, { "epoch": 1.5692883895131087, "grad_norm": 1.791015386581421, "learning_rate": 8.212116159484663e-06, "loss": 0.6588, "mean_token_accuracy": 0.7756021916866302, "step": 2305 }, { "epoch": 1.5699693564862103, "grad_norm": 1.8075019121170044, "learning_rate": 8.210389489304596e-06, "loss": 0.7373, "mean_token_accuracy": 0.7587003111839294, "step": 2306 }, { "epoch": 1.5706503234593123, "grad_norm": 1.6519310474395752, "learning_rate": 8.208662167468926e-06, "loss": 0.7883, "mean_token_accuracy": 0.7383992075920105, "step": 2307 }, { "epoch": 1.571331290432414, "grad_norm": 1.8674629926681519, "learning_rate": 8.206934194328273e-06, "loss": 0.6716, "mean_token_accuracy": 0.7625816762447357, "step": 2308 }, { "epoch": 1.572012257405516, "grad_norm": 1.9102132320404053, "learning_rate": 8.205205570233386e-06, "loss": 0.7024, "mean_token_accuracy": 0.7597867250442505, "step": 2309 }, { "epoch": 1.5726932243786176, "grad_norm": 2.0594823360443115, "learning_rate": 8.203476295535148e-06, "loss": 0.6987, "mean_token_accuracy": 0.7765325605869293, "step": 2310 }, { "epoch": 1.5733741913517194, "grad_norm": 1.8388992547988892, "learning_rate": 8.20174637058457e-06, "loss": 0.6537, "mean_token_accuracy": 0.7888996005058289, "step": 2311 }, { "epoch": 1.5740551583248212, "grad_norm": 1.7725868225097656, "learning_rate": 8.200015795732804e-06, "loss": 0.595, "mean_token_accuracy": 0.8053150177001953, "step": 2312 }, { "epoch": 1.574736125297923, "grad_norm": 1.6697919368743896, "learning_rate": 8.19828457133112e-06, "loss": 0.7323, "mean_token_accuracy": 0.7556951642036438, "step": 2313 }, { "epoch": 1.5754170922710249, "grad_norm": 1.709134817123413, "learning_rate": 8.196552697730933e-06, "loss": 0.7278, "mean_token_accuracy": 0.7604875266551971, "step": 2314 }, { "epoch": 1.5760980592441267, "grad_norm": 1.7272546291351318, "learning_rate": 8.194820175283783e-06, "loss": 0.7982, "mean_token_accuracy": 0.7346405982971191, "step": 2315 }, { "epoch": 1.5767790262172285, "grad_norm": 1.645015001296997, "learning_rate": 8.193087004341344e-06, "loss": 0.7687, "mean_token_accuracy": 0.7432602047920227, "step": 2316 }, { "epoch": 1.5774599931903301, "grad_norm": 1.8821797370910645, "learning_rate": 8.191353185255418e-06, "loss": 0.6502, "mean_token_accuracy": 0.7834992706775665, "step": 2317 }, { "epoch": 1.5781409601634322, "grad_norm": 1.9275140762329102, "learning_rate": 8.189618718377946e-06, "loss": 0.6393, "mean_token_accuracy": 0.7833586037158966, "step": 2318 }, { "epoch": 1.5788219271365338, "grad_norm": 1.9373055696487427, "learning_rate": 8.187883604060993e-06, "loss": 0.7172, "mean_token_accuracy": 0.7698738873004913, "step": 2319 }, { "epoch": 1.5795028941096358, "grad_norm": 1.7663888931274414, "learning_rate": 8.186147842656758e-06, "loss": 0.7763, "mean_token_accuracy": 0.7394621074199677, "step": 2320 }, { "epoch": 1.5801838610827375, "grad_norm": 1.8746466636657715, "learning_rate": 8.184411434517572e-06, "loss": 0.6754, "mean_token_accuracy": 0.7675864696502686, "step": 2321 }, { "epoch": 1.5808648280558393, "grad_norm": 1.9361399412155151, "learning_rate": 8.182674379995898e-06, "loss": 0.7679, "mean_token_accuracy": 0.7536993324756622, "step": 2322 }, { "epoch": 1.581545795028941, "grad_norm": 1.77513587474823, "learning_rate": 8.180936679444328e-06, "loss": 0.6956, "mean_token_accuracy": 0.7708942890167236, "step": 2323 }, { "epoch": 1.582226762002043, "grad_norm": 1.522628664970398, "learning_rate": 8.179198333215588e-06, "loss": 0.8131, "mean_token_accuracy": 0.7285827696323395, "step": 2324 }, { "epoch": 1.5829077289751448, "grad_norm": 1.6966851949691772, "learning_rate": 8.17745934166253e-06, "loss": 0.7168, "mean_token_accuracy": 0.7737019062042236, "step": 2325 }, { "epoch": 1.5835886959482464, "grad_norm": 1.712699055671692, "learning_rate": 8.175719705138143e-06, "loss": 0.7353, "mean_token_accuracy": 0.7767726480960846, "step": 2326 }, { "epoch": 1.5842696629213484, "grad_norm": 1.784293293952942, "learning_rate": 8.173979423995545e-06, "loss": 0.8004, "mean_token_accuracy": 0.7451833784580231, "step": 2327 }, { "epoch": 1.58495062989445, "grad_norm": 1.9125157594680786, "learning_rate": 8.17223849858798e-06, "loss": 0.7031, "mean_token_accuracy": 0.7707779109477997, "step": 2328 }, { "epoch": 1.585631596867552, "grad_norm": 1.6736714839935303, "learning_rate": 8.170496929268831e-06, "loss": 0.7138, "mean_token_accuracy": 0.7596569955348969, "step": 2329 }, { "epoch": 1.5863125638406537, "grad_norm": 1.6329773664474487, "learning_rate": 8.168754716391608e-06, "loss": 0.7332, "mean_token_accuracy": 0.7693117260932922, "step": 2330 }, { "epoch": 1.5869935308137555, "grad_norm": 1.6519320011138916, "learning_rate": 8.167011860309948e-06, "loss": 0.7761, "mean_token_accuracy": 0.7536723017692566, "step": 2331 }, { "epoch": 1.5876744977868573, "grad_norm": 1.8311197757720947, "learning_rate": 8.165268361377624e-06, "loss": 0.7209, "mean_token_accuracy": 0.7654381692409515, "step": 2332 }, { "epoch": 1.5883554647599591, "grad_norm": 1.9953371286392212, "learning_rate": 8.163524219948533e-06, "loss": 0.6426, "mean_token_accuracy": 0.792680948972702, "step": 2333 }, { "epoch": 1.589036431733061, "grad_norm": 1.5504153966903687, "learning_rate": 8.161779436376715e-06, "loss": 0.7858, "mean_token_accuracy": 0.7317825555801392, "step": 2334 }, { "epoch": 1.5897173987061628, "grad_norm": 1.7315857410430908, "learning_rate": 8.160034011016325e-06, "loss": 0.6659, "mean_token_accuracy": 0.7760992646217346, "step": 2335 }, { "epoch": 1.5903983656792646, "grad_norm": 1.8012579679489136, "learning_rate": 8.158287944221654e-06, "loss": 0.7591, "mean_token_accuracy": 0.7416942119598389, "step": 2336 }, { "epoch": 1.5910793326523662, "grad_norm": 1.8059582710266113, "learning_rate": 8.156541236347132e-06, "loss": 0.6531, "mean_token_accuracy": 0.7814556956291199, "step": 2337 }, { "epoch": 1.5917602996254683, "grad_norm": 1.753398060798645, "learning_rate": 8.154793887747305e-06, "loss": 0.7465, "mean_token_accuracy": 0.7457754015922546, "step": 2338 }, { "epoch": 1.5924412665985699, "grad_norm": 1.813267469406128, "learning_rate": 8.15304589877686e-06, "loss": 0.7852, "mean_token_accuracy": 0.7182435989379883, "step": 2339 }, { "epoch": 1.593122233571672, "grad_norm": 1.7605842351913452, "learning_rate": 8.151297269790607e-06, "loss": 0.6901, "mean_token_accuracy": 0.7624569535255432, "step": 2340 }, { "epoch": 1.5938032005447735, "grad_norm": 1.854027509689331, "learning_rate": 8.149548001143488e-06, "loss": 0.6892, "mean_token_accuracy": 0.7803259789943695, "step": 2341 }, { "epoch": 1.5944841675178754, "grad_norm": 1.7883923053741455, "learning_rate": 8.147798093190578e-06, "loss": 0.8023, "mean_token_accuracy": 0.7296684384346008, "step": 2342 }, { "epoch": 1.5951651344909772, "grad_norm": 1.9788144826889038, "learning_rate": 8.146047546287077e-06, "loss": 0.6711, "mean_token_accuracy": 0.7922974526882172, "step": 2343 }, { "epoch": 1.595846101464079, "grad_norm": 1.851570963859558, "learning_rate": 8.14429636078832e-06, "loss": 0.7618, "mean_token_accuracy": 0.7580606639385223, "step": 2344 }, { "epoch": 1.5965270684371808, "grad_norm": 1.8217006921768188, "learning_rate": 8.142544537049764e-06, "loss": 0.7788, "mean_token_accuracy": 0.7620437741279602, "step": 2345 }, { "epoch": 1.5972080354102824, "grad_norm": 1.7002791166305542, "learning_rate": 8.140792075427003e-06, "loss": 0.7455, "mean_token_accuracy": 0.7592360079288483, "step": 2346 }, { "epoch": 1.5978890023833845, "grad_norm": 1.8520665168762207, "learning_rate": 8.139038976275755e-06, "loss": 0.668, "mean_token_accuracy": 0.7812033593654633, "step": 2347 }, { "epoch": 1.598569969356486, "grad_norm": 1.643820881843567, "learning_rate": 8.137285239951876e-06, "loss": 0.71, "mean_token_accuracy": 0.7581562399864197, "step": 2348 }, { "epoch": 1.5992509363295881, "grad_norm": 1.6083451509475708, "learning_rate": 8.135530866811338e-06, "loss": 0.7344, "mean_token_accuracy": 0.7639294266700745, "step": 2349 }, { "epoch": 1.5999319033026898, "grad_norm": 1.520490288734436, "learning_rate": 8.133775857210251e-06, "loss": 0.731, "mean_token_accuracy": 0.7719920873641968, "step": 2350 }, { "epoch": 1.6006128702757916, "grad_norm": 1.9784667491912842, "learning_rate": 8.132020211504856e-06, "loss": 0.6893, "mean_token_accuracy": 0.7563549876213074, "step": 2351 }, { "epoch": 1.6012938372488934, "grad_norm": 1.8483506441116333, "learning_rate": 8.130263930051518e-06, "loss": 0.7778, "mean_token_accuracy": 0.7365587055683136, "step": 2352 }, { "epoch": 1.6019748042219952, "grad_norm": 1.5994340181350708, "learning_rate": 8.128507013206734e-06, "loss": 0.7728, "mean_token_accuracy": 0.7343439161777496, "step": 2353 }, { "epoch": 1.602655771195097, "grad_norm": 1.819961667060852, "learning_rate": 8.126749461327129e-06, "loss": 0.6613, "mean_token_accuracy": 0.779820054769516, "step": 2354 }, { "epoch": 1.6033367381681989, "grad_norm": 1.9261342287063599, "learning_rate": 8.124991274769451e-06, "loss": 0.7597, "mean_token_accuracy": 0.7336136102676392, "step": 2355 }, { "epoch": 1.6040177051413007, "grad_norm": 1.971501350402832, "learning_rate": 8.12323245389059e-06, "loss": 0.609, "mean_token_accuracy": 0.8017107546329498, "step": 2356 }, { "epoch": 1.6046986721144023, "grad_norm": 1.666882038116455, "learning_rate": 8.121472999047555e-06, "loss": 0.6864, "mean_token_accuracy": 0.7827315032482147, "step": 2357 }, { "epoch": 1.6053796390875044, "grad_norm": 1.656469464302063, "learning_rate": 8.119712910597483e-06, "loss": 0.7423, "mean_token_accuracy": 0.7418036758899689, "step": 2358 }, { "epoch": 1.606060606060606, "grad_norm": 1.6701706647872925, "learning_rate": 8.117952188897649e-06, "loss": 0.6645, "mean_token_accuracy": 0.7801912426948547, "step": 2359 }, { "epoch": 1.606741573033708, "grad_norm": 1.761390209197998, "learning_rate": 8.116190834305443e-06, "loss": 0.7988, "mean_token_accuracy": 0.7102548480033875, "step": 2360 }, { "epoch": 1.6074225400068096, "grad_norm": 1.8377206325531006, "learning_rate": 8.114428847178398e-06, "loss": 0.7345, "mean_token_accuracy": 0.7665740549564362, "step": 2361 }, { "epoch": 1.6081035069799114, "grad_norm": 1.849509835243225, "learning_rate": 8.11266622787416e-06, "loss": 0.7773, "mean_token_accuracy": 0.740784227848053, "step": 2362 }, { "epoch": 1.6087844739530133, "grad_norm": 1.8018114566802979, "learning_rate": 8.110902976750516e-06, "loss": 0.6953, "mean_token_accuracy": 0.7691622674465179, "step": 2363 }, { "epoch": 1.609465440926115, "grad_norm": 1.8071980476379395, "learning_rate": 8.109139094165376e-06, "loss": 0.605, "mean_token_accuracy": 0.8048100471496582, "step": 2364 }, { "epoch": 1.610146407899217, "grad_norm": 1.8242454528808594, "learning_rate": 8.107374580476781e-06, "loss": 0.708, "mean_token_accuracy": 0.759245365858078, "step": 2365 }, { "epoch": 1.6108273748723185, "grad_norm": 1.888822078704834, "learning_rate": 8.105609436042895e-06, "loss": 0.7407, "mean_token_accuracy": 0.7560961246490479, "step": 2366 }, { "epoch": 1.6115083418454206, "grad_norm": 1.7960444688796997, "learning_rate": 8.103843661222013e-06, "loss": 0.6603, "mean_token_accuracy": 0.7852883636951447, "step": 2367 }, { "epoch": 1.6121893088185222, "grad_norm": 1.866562008857727, "learning_rate": 8.102077256372559e-06, "loss": 0.757, "mean_token_accuracy": 0.7441695034503937, "step": 2368 }, { "epoch": 1.6128702757916242, "grad_norm": 1.849812626838684, "learning_rate": 8.100310221853082e-06, "loss": 0.6919, "mean_token_accuracy": 0.7849198579788208, "step": 2369 }, { "epoch": 1.6135512427647258, "grad_norm": 1.7893357276916504, "learning_rate": 8.098542558022263e-06, "loss": 0.7159, "mean_token_accuracy": 0.7784062027931213, "step": 2370 }, { "epoch": 1.6142322097378277, "grad_norm": 1.5639159679412842, "learning_rate": 8.096774265238908e-06, "loss": 0.7296, "mean_token_accuracy": 0.7540026009082794, "step": 2371 }, { "epoch": 1.6149131767109295, "grad_norm": 1.7777276039123535, "learning_rate": 8.095005343861951e-06, "loss": 0.616, "mean_token_accuracy": 0.7913300395011902, "step": 2372 }, { "epoch": 1.6155941436840313, "grad_norm": 1.8252952098846436, "learning_rate": 8.09323579425045e-06, "loss": 0.7541, "mean_token_accuracy": 0.761983335018158, "step": 2373 }, { "epoch": 1.6162751106571331, "grad_norm": 1.827549695968628, "learning_rate": 8.091465616763598e-06, "loss": 0.6778, "mean_token_accuracy": 0.7751969993114471, "step": 2374 }, { "epoch": 1.616956077630235, "grad_norm": 1.7053189277648926, "learning_rate": 8.089694811760712e-06, "loss": 0.7624, "mean_token_accuracy": 0.7451250553131104, "step": 2375 }, { "epoch": 1.6176370446033368, "grad_norm": 1.7777485847473145, "learning_rate": 8.087923379601233e-06, "loss": 0.6984, "mean_token_accuracy": 0.7631044089794159, "step": 2376 }, { "epoch": 1.6183180115764384, "grad_norm": 1.8096811771392822, "learning_rate": 8.086151320644732e-06, "loss": 0.5967, "mean_token_accuracy": 0.7857804894447327, "step": 2377 }, { "epoch": 1.6189989785495404, "grad_norm": 1.841963291168213, "learning_rate": 8.08437863525091e-06, "loss": 0.6711, "mean_token_accuracy": 0.7688048481941223, "step": 2378 }, { "epoch": 1.619679945522642, "grad_norm": 1.7061266899108887, "learning_rate": 8.08260532377959e-06, "loss": 0.7747, "mean_token_accuracy": 0.7504962980747223, "step": 2379 }, { "epoch": 1.620360912495744, "grad_norm": 1.6035230159759521, "learning_rate": 8.08083138659073e-06, "loss": 0.8645, "mean_token_accuracy": 0.7293758392333984, "step": 2380 }, { "epoch": 1.6210418794688457, "grad_norm": 1.7015553712844849, "learning_rate": 8.079056824044405e-06, "loss": 0.6785, "mean_token_accuracy": 0.7474694848060608, "step": 2381 }, { "epoch": 1.6217228464419475, "grad_norm": 1.835941195487976, "learning_rate": 8.077281636500822e-06, "loss": 0.7017, "mean_token_accuracy": 0.7667097747325897, "step": 2382 }, { "epoch": 1.6224038134150494, "grad_norm": 1.72034752368927, "learning_rate": 8.075505824320316e-06, "loss": 0.7185, "mean_token_accuracy": 0.7621194124221802, "step": 2383 }, { "epoch": 1.6230847803881512, "grad_norm": 1.7060221433639526, "learning_rate": 8.073729387863345e-06, "loss": 0.6883, "mean_token_accuracy": 0.7670305669307709, "step": 2384 }, { "epoch": 1.623765747361253, "grad_norm": 1.6381306648254395, "learning_rate": 8.071952327490499e-06, "loss": 0.7337, "mean_token_accuracy": 0.7584928274154663, "step": 2385 }, { "epoch": 1.6244467143343548, "grad_norm": 1.6454914808273315, "learning_rate": 8.070174643562489e-06, "loss": 0.7001, "mean_token_accuracy": 0.7578051388263702, "step": 2386 }, { "epoch": 1.6251276813074567, "grad_norm": 1.9485481977462769, "learning_rate": 8.068396336440158e-06, "loss": 0.7227, "mean_token_accuracy": 0.7624926269054413, "step": 2387 }, { "epoch": 1.6258086482805583, "grad_norm": 1.8482465744018555, "learning_rate": 8.066617406484469e-06, "loss": 0.6902, "mean_token_accuracy": 0.7801486253738403, "step": 2388 }, { "epoch": 1.6264896152536603, "grad_norm": 1.675877571105957, "learning_rate": 8.064837854056518e-06, "loss": 0.7382, "mean_token_accuracy": 0.7619036436080933, "step": 2389 }, { "epoch": 1.627170582226762, "grad_norm": 1.723188877105713, "learning_rate": 8.063057679517525e-06, "loss": 0.8204, "mean_token_accuracy": 0.7382096946239471, "step": 2390 }, { "epoch": 1.627851549199864, "grad_norm": 1.6659787893295288, "learning_rate": 8.061276883228835e-06, "loss": 0.7646, "mean_token_accuracy": 0.7536091208457947, "step": 2391 }, { "epoch": 1.6285325161729656, "grad_norm": 1.6672472953796387, "learning_rate": 8.059495465551916e-06, "loss": 0.8266, "mean_token_accuracy": 0.7273957431316376, "step": 2392 }, { "epoch": 1.6292134831460674, "grad_norm": 1.6072560548782349, "learning_rate": 8.057713426848372e-06, "loss": 0.7766, "mean_token_accuracy": 0.7384826838970184, "step": 2393 }, { "epoch": 1.6298944501191692, "grad_norm": 1.7041107416152954, "learning_rate": 8.055930767479925e-06, "loss": 0.6712, "mean_token_accuracy": 0.7792491614818573, "step": 2394 }, { "epoch": 1.630575417092271, "grad_norm": 1.6925641298294067, "learning_rate": 8.054147487808426e-06, "loss": 0.7745, "mean_token_accuracy": 0.7350006103515625, "step": 2395 }, { "epoch": 1.6312563840653729, "grad_norm": 1.5970796346664429, "learning_rate": 8.052363588195848e-06, "loss": 0.7899, "mean_token_accuracy": 0.7481289207935333, "step": 2396 }, { "epoch": 1.6319373510384745, "grad_norm": 1.7659963369369507, "learning_rate": 8.050579069004298e-06, "loss": 0.8244, "mean_token_accuracy": 0.7421850264072418, "step": 2397 }, { "epoch": 1.6326183180115765, "grad_norm": 1.6381423473358154, "learning_rate": 8.048793930595998e-06, "loss": 0.8271, "mean_token_accuracy": 0.7343368828296661, "step": 2398 }, { "epoch": 1.6332992849846781, "grad_norm": 1.791287899017334, "learning_rate": 8.047008173333303e-06, "loss": 0.6873, "mean_token_accuracy": 0.7685923278331757, "step": 2399 }, { "epoch": 1.6339802519577802, "grad_norm": 1.7048544883728027, "learning_rate": 8.045221797578698e-06, "loss": 0.61, "mean_token_accuracy": 0.7892748713493347, "step": 2400 }, { "epoch": 1.6346612189308818, "grad_norm": 1.7261425256729126, "learning_rate": 8.043434803694779e-06, "loss": 0.7613, "mean_token_accuracy": 0.7342685759067535, "step": 2401 }, { "epoch": 1.6353421859039836, "grad_norm": 1.7991477251052856, "learning_rate": 8.04164719204428e-06, "loss": 0.7525, "mean_token_accuracy": 0.7498789727687836, "step": 2402 }, { "epoch": 1.6360231528770854, "grad_norm": 1.7178336381912231, "learning_rate": 8.039858962990054e-06, "loss": 0.7472, "mean_token_accuracy": 0.7568758428096771, "step": 2403 }, { "epoch": 1.6367041198501873, "grad_norm": 1.9056274890899658, "learning_rate": 8.038070116895087e-06, "loss": 0.6556, "mean_token_accuracy": 0.7906689047813416, "step": 2404 }, { "epoch": 1.637385086823289, "grad_norm": 1.6110057830810547, "learning_rate": 8.036280654122481e-06, "loss": 0.8131, "mean_token_accuracy": 0.734329104423523, "step": 2405 }, { "epoch": 1.638066053796391, "grad_norm": 1.6081024408340454, "learning_rate": 8.034490575035467e-06, "loss": 0.7266, "mean_token_accuracy": 0.7683785557746887, "step": 2406 }, { "epoch": 1.6387470207694927, "grad_norm": 1.7304707765579224, "learning_rate": 8.032699879997402e-06, "loss": 0.6895, "mean_token_accuracy": 0.7411260604858398, "step": 2407 }, { "epoch": 1.6394279877425944, "grad_norm": 1.674010992050171, "learning_rate": 8.030908569371768e-06, "loss": 0.8004, "mean_token_accuracy": 0.7187275290489197, "step": 2408 }, { "epoch": 1.6401089547156964, "grad_norm": 1.670987606048584, "learning_rate": 8.02911664352217e-06, "loss": 0.8, "mean_token_accuracy": 0.7426292896270752, "step": 2409 }, { "epoch": 1.640789921688798, "grad_norm": 1.8527638912200928, "learning_rate": 8.02732410281234e-06, "loss": 0.5823, "mean_token_accuracy": 0.7824245393276215, "step": 2410 }, { "epoch": 1.6414708886619, "grad_norm": 1.7334682941436768, "learning_rate": 8.025530947606133e-06, "loss": 0.6533, "mean_token_accuracy": 0.7900812327861786, "step": 2411 }, { "epoch": 1.6421518556350017, "grad_norm": 1.7414960861206055, "learning_rate": 8.02373717826753e-06, "loss": 0.7547, "mean_token_accuracy": 0.7357233762741089, "step": 2412 }, { "epoch": 1.6428328226081035, "grad_norm": 1.7989250421524048, "learning_rate": 8.021942795160638e-06, "loss": 0.6837, "mean_token_accuracy": 0.7304797470569611, "step": 2413 }, { "epoch": 1.6435137895812053, "grad_norm": 1.8813081979751587, "learning_rate": 8.020147798649685e-06, "loss": 0.6857, "mean_token_accuracy": 0.754576176404953, "step": 2414 }, { "epoch": 1.6441947565543071, "grad_norm": 2.010899543762207, "learning_rate": 8.018352189099023e-06, "loss": 0.6591, "mean_token_accuracy": 0.7732565999031067, "step": 2415 }, { "epoch": 1.644875723527409, "grad_norm": 1.529868721961975, "learning_rate": 8.016555966873138e-06, "loss": 0.7075, "mean_token_accuracy": 0.763297438621521, "step": 2416 }, { "epoch": 1.6455566905005106, "grad_norm": 1.6175391674041748, "learning_rate": 8.014759132336627e-06, "loss": 0.7091, "mean_token_accuracy": 0.7735458612442017, "step": 2417 }, { "epoch": 1.6462376574736126, "grad_norm": 1.7428381443023682, "learning_rate": 8.012961685854217e-06, "loss": 0.738, "mean_token_accuracy": 0.7595548629760742, "step": 2418 }, { "epoch": 1.6469186244467142, "grad_norm": 1.7821840047836304, "learning_rate": 8.011163627790765e-06, "loss": 0.6923, "mean_token_accuracy": 0.7630263566970825, "step": 2419 }, { "epoch": 1.6475995914198163, "grad_norm": 1.7201615571975708, "learning_rate": 8.009364958511243e-06, "loss": 0.7292, "mean_token_accuracy": 0.7648959755897522, "step": 2420 }, { "epoch": 1.6482805583929179, "grad_norm": 1.7648961544036865, "learning_rate": 8.007565678380751e-06, "loss": 0.7161, "mean_token_accuracy": 0.7604701817035675, "step": 2421 }, { "epoch": 1.6489615253660197, "grad_norm": 1.8698267936706543, "learning_rate": 8.005765787764515e-06, "loss": 0.7084, "mean_token_accuracy": 0.7738290131092072, "step": 2422 }, { "epoch": 1.6496424923391215, "grad_norm": 1.714150309562683, "learning_rate": 8.00396528702788e-06, "loss": 0.7403, "mean_token_accuracy": 0.7567915618419647, "step": 2423 }, { "epoch": 1.6503234593122234, "grad_norm": 1.6173641681671143, "learning_rate": 8.00216417653632e-06, "loss": 0.7622, "mean_token_accuracy": 0.7392474710941315, "step": 2424 }, { "epoch": 1.6510044262853252, "grad_norm": 1.7369482517242432, "learning_rate": 8.00036245665543e-06, "loss": 0.6754, "mean_token_accuracy": 0.7806086242198944, "step": 2425 }, { "epoch": 1.651685393258427, "grad_norm": 1.626709222793579, "learning_rate": 7.998560127750928e-06, "loss": 0.8448, "mean_token_accuracy": 0.7344914972782135, "step": 2426 }, { "epoch": 1.6523663602315288, "grad_norm": 1.5934797525405884, "learning_rate": 7.996757190188658e-06, "loss": 0.9103, "mean_token_accuracy": 0.708854615688324, "step": 2427 }, { "epoch": 1.6530473272046304, "grad_norm": 1.7810418605804443, "learning_rate": 7.994953644334584e-06, "loss": 0.832, "mean_token_accuracy": 0.7290987372398376, "step": 2428 }, { "epoch": 1.6537282941777325, "grad_norm": 1.7560110092163086, "learning_rate": 7.9931494905548e-06, "loss": 0.7694, "mean_token_accuracy": 0.756297379732132, "step": 2429 }, { "epoch": 1.654409261150834, "grad_norm": 1.6240524053573608, "learning_rate": 7.991344729215516e-06, "loss": 0.794, "mean_token_accuracy": 0.746269017457962, "step": 2430 }, { "epoch": 1.6550902281239361, "grad_norm": 1.7051143646240234, "learning_rate": 7.98953936068307e-06, "loss": 0.8303, "mean_token_accuracy": 0.739756166934967, "step": 2431 }, { "epoch": 1.6557711950970377, "grad_norm": 1.720461130142212, "learning_rate": 7.987733385323919e-06, "loss": 0.7366, "mean_token_accuracy": 0.771687924861908, "step": 2432 }, { "epoch": 1.6564521620701396, "grad_norm": 1.5307334661483765, "learning_rate": 7.98592680350465e-06, "loss": 0.8228, "mean_token_accuracy": 0.7516302168369293, "step": 2433 }, { "epoch": 1.6571331290432414, "grad_norm": 1.7821983098983765, "learning_rate": 7.984119615591963e-06, "loss": 0.677, "mean_token_accuracy": 0.7888535857200623, "step": 2434 }, { "epoch": 1.6578140960163432, "grad_norm": 1.692435622215271, "learning_rate": 7.982311821952695e-06, "loss": 0.7404, "mean_token_accuracy": 0.7526972889900208, "step": 2435 }, { "epoch": 1.658495062989445, "grad_norm": 1.7832391262054443, "learning_rate": 7.980503422953792e-06, "loss": 0.7251, "mean_token_accuracy": 0.7564901411533356, "step": 2436 }, { "epoch": 1.6591760299625467, "grad_norm": 1.816854476928711, "learning_rate": 7.978694418962332e-06, "loss": 0.731, "mean_token_accuracy": 0.7686124444007874, "step": 2437 }, { "epoch": 1.6598569969356487, "grad_norm": 1.7102551460266113, "learning_rate": 7.97688481034551e-06, "loss": 0.7622, "mean_token_accuracy": 0.7506580948829651, "step": 2438 }, { "epoch": 1.6605379639087503, "grad_norm": 1.8357001543045044, "learning_rate": 7.975074597470649e-06, "loss": 0.7217, "mean_token_accuracy": 0.769867330789566, "step": 2439 }, { "epoch": 1.6612189308818524, "grad_norm": 1.8884345293045044, "learning_rate": 7.973263780705191e-06, "loss": 0.6043, "mean_token_accuracy": 0.8079686164855957, "step": 2440 }, { "epoch": 1.661899897854954, "grad_norm": 1.859971523284912, "learning_rate": 7.9714523604167e-06, "loss": 0.5958, "mean_token_accuracy": 0.7929188013076782, "step": 2441 }, { "epoch": 1.662580864828056, "grad_norm": 1.7054277658462524, "learning_rate": 7.969640336972868e-06, "loss": 0.7272, "mean_token_accuracy": 0.7578799426555634, "step": 2442 }, { "epoch": 1.6632618318011576, "grad_norm": 1.865739345550537, "learning_rate": 7.967827710741503e-06, "loss": 0.6409, "mean_token_accuracy": 0.7925314605236053, "step": 2443 }, { "epoch": 1.6639427987742594, "grad_norm": 1.8981904983520508, "learning_rate": 7.966014482090538e-06, "loss": 0.6639, "mean_token_accuracy": 0.7893042266368866, "step": 2444 }, { "epoch": 1.6646237657473613, "grad_norm": 1.823394536972046, "learning_rate": 7.964200651388027e-06, "loss": 0.7006, "mean_token_accuracy": 0.7689620852470398, "step": 2445 }, { "epoch": 1.665304732720463, "grad_norm": 1.7435104846954346, "learning_rate": 7.962386219002151e-06, "loss": 0.7853, "mean_token_accuracy": 0.7446867525577545, "step": 2446 }, { "epoch": 1.665985699693565, "grad_norm": 1.7315001487731934, "learning_rate": 7.960571185301207e-06, "loss": 0.7197, "mean_token_accuracy": 0.7587207555770874, "step": 2447 }, { "epoch": 1.6666666666666665, "grad_norm": 1.7599656581878662, "learning_rate": 7.958755550653616e-06, "loss": 0.7651, "mean_token_accuracy": 0.7528558373451233, "step": 2448 }, { "epoch": 1.6673476336397686, "grad_norm": 1.7909214496612549, "learning_rate": 7.956939315427926e-06, "loss": 0.6834, "mean_token_accuracy": 0.7762831449508667, "step": 2449 }, { "epoch": 1.6680286006128702, "grad_norm": 1.7350128889083862, "learning_rate": 7.955122479992795e-06, "loss": 0.7627, "mean_token_accuracy": 0.7293658554553986, "step": 2450 }, { "epoch": 1.6687095675859722, "grad_norm": 1.9627317190170288, "learning_rate": 7.953305044717018e-06, "loss": 0.6892, "mean_token_accuracy": 0.7654350996017456, "step": 2451 }, { "epoch": 1.6693905345590738, "grad_norm": 1.6458271741867065, "learning_rate": 7.9514870099695e-06, "loss": 0.8159, "mean_token_accuracy": 0.7246590256690979, "step": 2452 }, { "epoch": 1.6700715015321757, "grad_norm": 1.6304723024368286, "learning_rate": 7.94966837611927e-06, "loss": 0.8066, "mean_token_accuracy": 0.7441689372062683, "step": 2453 }, { "epoch": 1.6707524685052775, "grad_norm": 1.7677278518676758, "learning_rate": 7.947849143535484e-06, "loss": 0.7435, "mean_token_accuracy": 0.749816358089447, "step": 2454 }, { "epoch": 1.6714334354783793, "grad_norm": 1.7814199924468994, "learning_rate": 7.946029312587415e-06, "loss": 0.7067, "mean_token_accuracy": 0.7577739655971527, "step": 2455 }, { "epoch": 1.6721144024514811, "grad_norm": 1.7859365940093994, "learning_rate": 7.944208883644458e-06, "loss": 0.8261, "mean_token_accuracy": 0.7174777090549469, "step": 2456 }, { "epoch": 1.672795369424583, "grad_norm": 1.9370005130767822, "learning_rate": 7.94238785707613e-06, "loss": 0.7243, "mean_token_accuracy": 0.7584713399410248, "step": 2457 }, { "epoch": 1.6734763363976848, "grad_norm": 1.7051547765731812, "learning_rate": 7.940566233252069e-06, "loss": 0.723, "mean_token_accuracy": 0.7594318389892578, "step": 2458 }, { "epoch": 1.6741573033707864, "grad_norm": 1.7738547325134277, "learning_rate": 7.938744012542035e-06, "loss": 0.6726, "mean_token_accuracy": 0.7778187692165375, "step": 2459 }, { "epoch": 1.6748382703438884, "grad_norm": 1.6527248620986938, "learning_rate": 7.936921195315907e-06, "loss": 0.6935, "mean_token_accuracy": 0.7809694707393646, "step": 2460 }, { "epoch": 1.67551923731699, "grad_norm": 1.8083899021148682, "learning_rate": 7.935097781943686e-06, "loss": 0.6506, "mean_token_accuracy": 0.7922780811786652, "step": 2461 }, { "epoch": 1.676200204290092, "grad_norm": 1.7161763906478882, "learning_rate": 7.933273772795497e-06, "loss": 0.638, "mean_token_accuracy": 0.7922519147396088, "step": 2462 }, { "epoch": 1.6768811712631937, "grad_norm": 1.753266453742981, "learning_rate": 7.931449168241583e-06, "loss": 0.7627, "mean_token_accuracy": 0.7546161711215973, "step": 2463 }, { "epoch": 1.6775621382362955, "grad_norm": 1.588321328163147, "learning_rate": 7.929623968652307e-06, "loss": 0.752, "mean_token_accuracy": 0.7422908544540405, "step": 2464 }, { "epoch": 1.6782431052093973, "grad_norm": 1.8622816801071167, "learning_rate": 7.927798174398153e-06, "loss": 0.7641, "mean_token_accuracy": 0.7585095465183258, "step": 2465 }, { "epoch": 1.6789240721824992, "grad_norm": 1.7327864170074463, "learning_rate": 7.92597178584973e-06, "loss": 0.7549, "mean_token_accuracy": 0.7592517137527466, "step": 2466 }, { "epoch": 1.679605039155601, "grad_norm": 1.7119196653366089, "learning_rate": 7.924144803377765e-06, "loss": 0.8479, "mean_token_accuracy": 0.7550695240497589, "step": 2467 }, { "epoch": 1.6802860061287026, "grad_norm": 1.7357077598571777, "learning_rate": 7.9223172273531e-06, "loss": 0.68, "mean_token_accuracy": 0.7726376950740814, "step": 2468 }, { "epoch": 1.6809669731018047, "grad_norm": 1.7404158115386963, "learning_rate": 7.920489058146706e-06, "loss": 0.7513, "mean_token_accuracy": 0.7538954317569733, "step": 2469 }, { "epoch": 1.6816479400749063, "grad_norm": 1.689340353012085, "learning_rate": 7.918660296129672e-06, "loss": 0.7504, "mean_token_accuracy": 0.7601602077484131, "step": 2470 }, { "epoch": 1.6823289070480083, "grad_norm": 1.6428338289260864, "learning_rate": 7.916830941673204e-06, "loss": 0.766, "mean_token_accuracy": 0.747194916009903, "step": 2471 }, { "epoch": 1.68300987402111, "grad_norm": 1.6374891996383667, "learning_rate": 7.915000995148631e-06, "loss": 0.6915, "mean_token_accuracy": 0.7710846066474915, "step": 2472 }, { "epoch": 1.6836908409942117, "grad_norm": 1.7694485187530518, "learning_rate": 7.913170456927402e-06, "loss": 0.7174, "mean_token_accuracy": 0.7675787806510925, "step": 2473 }, { "epoch": 1.6843718079673136, "grad_norm": 1.6160403490066528, "learning_rate": 7.911339327381085e-06, "loss": 0.8906, "mean_token_accuracy": 0.7199163436889648, "step": 2474 }, { "epoch": 1.6850527749404154, "grad_norm": 1.7782236337661743, "learning_rate": 7.909507606881368e-06, "loss": 0.6469, "mean_token_accuracy": 0.7934743165969849, "step": 2475 }, { "epoch": 1.6857337419135172, "grad_norm": 1.8295482397079468, "learning_rate": 7.907675295800062e-06, "loss": 0.7103, "mean_token_accuracy": 0.7646538019180298, "step": 2476 }, { "epoch": 1.686414708886619, "grad_norm": 1.6190756559371948, "learning_rate": 7.905842394509098e-06, "loss": 0.708, "mean_token_accuracy": 0.7663131058216095, "step": 2477 }, { "epoch": 1.6870956758597209, "grad_norm": 2.096724271774292, "learning_rate": 7.904008903380518e-06, "loss": 0.6856, "mean_token_accuracy": 0.7794474363327026, "step": 2478 }, { "epoch": 1.6877766428328225, "grad_norm": 1.9123386144638062, "learning_rate": 7.902174822786493e-06, "loss": 0.6368, "mean_token_accuracy": 0.7986210286617279, "step": 2479 }, { "epoch": 1.6884576098059245, "grad_norm": 1.7131659984588623, "learning_rate": 7.900340153099314e-06, "loss": 0.7732, "mean_token_accuracy": 0.749535322189331, "step": 2480 }, { "epoch": 1.6891385767790261, "grad_norm": 1.817177176475525, "learning_rate": 7.898504894691383e-06, "loss": 0.8156, "mean_token_accuracy": 0.7382869720458984, "step": 2481 }, { "epoch": 1.6898195437521282, "grad_norm": 1.7188152074813843, "learning_rate": 7.89666904793523e-06, "loss": 0.6877, "mean_token_accuracy": 0.7779442965984344, "step": 2482 }, { "epoch": 1.6905005107252298, "grad_norm": 1.758375883102417, "learning_rate": 7.8948326132035e-06, "loss": 0.6756, "mean_token_accuracy": 0.7564009428024292, "step": 2483 }, { "epoch": 1.6911814776983316, "grad_norm": 1.7556986808776855, "learning_rate": 7.892995590868961e-06, "loss": 0.6754, "mean_token_accuracy": 0.7643812596797943, "step": 2484 }, { "epoch": 1.6918624446714334, "grad_norm": 1.9575873613357544, "learning_rate": 7.891157981304496e-06, "loss": 0.7236, "mean_token_accuracy": 0.7583251297473907, "step": 2485 }, { "epoch": 1.6925434116445353, "grad_norm": 1.7310420274734497, "learning_rate": 7.889319784883109e-06, "loss": 0.7571, "mean_token_accuracy": 0.7341907918453217, "step": 2486 }, { "epoch": 1.693224378617637, "grad_norm": 1.7422093152999878, "learning_rate": 7.887481001977925e-06, "loss": 0.7427, "mean_token_accuracy": 0.7328177392482758, "step": 2487 }, { "epoch": 1.6939053455907387, "grad_norm": 1.772135615348816, "learning_rate": 7.885641632962185e-06, "loss": 0.7851, "mean_token_accuracy": 0.738412469625473, "step": 2488 }, { "epoch": 1.6945863125638407, "grad_norm": 1.5989478826522827, "learning_rate": 7.883801678209249e-06, "loss": 0.8933, "mean_token_accuracy": 0.7210519909858704, "step": 2489 }, { "epoch": 1.6952672795369423, "grad_norm": 2.017794370651245, "learning_rate": 7.8819611380926e-06, "loss": 0.5748, "mean_token_accuracy": 0.8082555830478668, "step": 2490 }, { "epoch": 1.6959482465100444, "grad_norm": 1.6021653413772583, "learning_rate": 7.880120012985834e-06, "loss": 0.7014, "mean_token_accuracy": 0.7616929411888123, "step": 2491 }, { "epoch": 1.696629213483146, "grad_norm": 1.833275318145752, "learning_rate": 7.878278303262673e-06, "loss": 0.6997, "mean_token_accuracy": 0.7785347402095795, "step": 2492 }, { "epoch": 1.6973101804562478, "grad_norm": 1.7506200075149536, "learning_rate": 7.876436009296949e-06, "loss": 0.6691, "mean_token_accuracy": 0.773836225271225, "step": 2493 }, { "epoch": 1.6979911474293496, "grad_norm": 1.732724905014038, "learning_rate": 7.874593131462619e-06, "loss": 0.7322, "mean_token_accuracy": 0.7380525171756744, "step": 2494 }, { "epoch": 1.6986721144024515, "grad_norm": 1.6968764066696167, "learning_rate": 7.872749670133754e-06, "loss": 0.8373, "mean_token_accuracy": 0.7261971533298492, "step": 2495 }, { "epoch": 1.6993530813755533, "grad_norm": 1.582054853439331, "learning_rate": 7.870905625684549e-06, "loss": 0.7533, "mean_token_accuracy": 0.7662078142166138, "step": 2496 }, { "epoch": 1.7000340483486551, "grad_norm": 1.7236131429672241, "learning_rate": 7.869060998489316e-06, "loss": 0.7002, "mean_token_accuracy": 0.7560252547264099, "step": 2497 }, { "epoch": 1.700715015321757, "grad_norm": 1.8318089246749878, "learning_rate": 7.867215788922478e-06, "loss": 0.7701, "mean_token_accuracy": 0.7506299614906311, "step": 2498 }, { "epoch": 1.7013959822948586, "grad_norm": 1.845382571220398, "learning_rate": 7.865369997358588e-06, "loss": 0.8112, "mean_token_accuracy": 0.7434545159339905, "step": 2499 }, { "epoch": 1.7020769492679606, "grad_norm": 1.7900789976119995, "learning_rate": 7.863523624172304e-06, "loss": 0.7805, "mean_token_accuracy": 0.7391313314437866, "step": 2500 }, { "epoch": 1.7027579162410622, "grad_norm": 1.827054738998413, "learning_rate": 7.861676669738417e-06, "loss": 0.763, "mean_token_accuracy": 0.7512713968753815, "step": 2501 }, { "epoch": 1.7034388832141643, "grad_norm": 1.7649227380752563, "learning_rate": 7.85982913443182e-06, "loss": 0.7651, "mean_token_accuracy": 0.7341921925544739, "step": 2502 }, { "epoch": 1.7041198501872659, "grad_norm": 1.7552275657653809, "learning_rate": 7.857981018627538e-06, "loss": 0.7903, "mean_token_accuracy": 0.7418408095836639, "step": 2503 }, { "epoch": 1.7048008171603677, "grad_norm": 1.7351008653640747, "learning_rate": 7.856132322700705e-06, "loss": 0.7721, "mean_token_accuracy": 0.7527831792831421, "step": 2504 }, { "epoch": 1.7054817841334695, "grad_norm": 1.6622439622879028, "learning_rate": 7.854283047026577e-06, "loss": 0.6917, "mean_token_accuracy": 0.7772172093391418, "step": 2505 }, { "epoch": 1.7061627511065713, "grad_norm": 1.8154983520507812, "learning_rate": 7.852433191980522e-06, "loss": 0.6252, "mean_token_accuracy": 0.8007673919200897, "step": 2506 }, { "epoch": 1.7068437180796732, "grad_norm": 1.720146656036377, "learning_rate": 7.850582757938036e-06, "loss": 0.7307, "mean_token_accuracy": 0.7647093534469604, "step": 2507 }, { "epoch": 1.707524685052775, "grad_norm": 1.9880543947219849, "learning_rate": 7.848731745274722e-06, "loss": 0.6957, "mean_token_accuracy": 0.7750038802623749, "step": 2508 }, { "epoch": 1.7082056520258768, "grad_norm": 1.7405198812484741, "learning_rate": 7.846880154366306e-06, "loss": 0.7091, "mean_token_accuracy": 0.7616172432899475, "step": 2509 }, { "epoch": 1.7088866189989784, "grad_norm": 1.6133008003234863, "learning_rate": 7.845027985588629e-06, "loss": 0.8291, "mean_token_accuracy": 0.726063072681427, "step": 2510 }, { "epoch": 1.7095675859720805, "grad_norm": 1.8183860778808594, "learning_rate": 7.843175239317651e-06, "loss": 0.7337, "mean_token_accuracy": 0.7671979665756226, "step": 2511 }, { "epoch": 1.710248552945182, "grad_norm": 1.7418339252471924, "learning_rate": 7.841321915929452e-06, "loss": 0.7515, "mean_token_accuracy": 0.7607157528400421, "step": 2512 }, { "epoch": 1.7109295199182841, "grad_norm": 1.731079339981079, "learning_rate": 7.83946801580022e-06, "loss": 0.6996, "mean_token_accuracy": 0.7712866067886353, "step": 2513 }, { "epoch": 1.7116104868913857, "grad_norm": 1.7672785520553589, "learning_rate": 7.83761353930627e-06, "loss": 0.8184, "mean_token_accuracy": 0.7256055772304535, "step": 2514 }, { "epoch": 1.7122914538644876, "grad_norm": 1.834544062614441, "learning_rate": 7.835758486824028e-06, "loss": 0.6982, "mean_token_accuracy": 0.774132639169693, "step": 2515 }, { "epoch": 1.7129724208375894, "grad_norm": 1.8344874382019043, "learning_rate": 7.83390285873004e-06, "loss": 0.6029, "mean_token_accuracy": 0.8014883995056152, "step": 2516 }, { "epoch": 1.7136533878106912, "grad_norm": 1.7232756614685059, "learning_rate": 7.832046655400967e-06, "loss": 0.8012, "mean_token_accuracy": 0.7470330893993378, "step": 2517 }, { "epoch": 1.714334354783793, "grad_norm": 1.6402987241744995, "learning_rate": 7.830189877213588e-06, "loss": 0.77, "mean_token_accuracy": 0.7327710092067719, "step": 2518 }, { "epoch": 1.7150153217568946, "grad_norm": 1.6786630153656006, "learning_rate": 7.828332524544795e-06, "loss": 0.8599, "mean_token_accuracy": 0.7198387384414673, "step": 2519 }, { "epoch": 1.7156962887299967, "grad_norm": 1.7437751293182373, "learning_rate": 7.826474597771604e-06, "loss": 0.7976, "mean_token_accuracy": 0.7391820847988129, "step": 2520 }, { "epoch": 1.7163772557030983, "grad_norm": 1.7271144390106201, "learning_rate": 7.824616097271141e-06, "loss": 0.8322, "mean_token_accuracy": 0.7491404116153717, "step": 2521 }, { "epoch": 1.7170582226762003, "grad_norm": 1.6630187034606934, "learning_rate": 7.822757023420649e-06, "loss": 0.7303, "mean_token_accuracy": 0.7541258633136749, "step": 2522 }, { "epoch": 1.717739189649302, "grad_norm": 1.7989858388900757, "learning_rate": 7.820897376597493e-06, "loss": 0.6458, "mean_token_accuracy": 0.7734544575214386, "step": 2523 }, { "epoch": 1.7184201566224038, "grad_norm": 1.7600653171539307, "learning_rate": 7.819037157179147e-06, "loss": 0.6188, "mean_token_accuracy": 0.7760956287384033, "step": 2524 }, { "epoch": 1.7191011235955056, "grad_norm": 1.6920216083526611, "learning_rate": 7.817176365543206e-06, "loss": 0.6736, "mean_token_accuracy": 0.7707127928733826, "step": 2525 }, { "epoch": 1.7197820905686074, "grad_norm": 1.9614648818969727, "learning_rate": 7.81531500206738e-06, "loss": 0.7206, "mean_token_accuracy": 0.7598630785942078, "step": 2526 }, { "epoch": 1.7204630575417093, "grad_norm": 1.886417269706726, "learning_rate": 7.813453067129493e-06, "loss": 0.7122, "mean_token_accuracy": 0.7527415454387665, "step": 2527 }, { "epoch": 1.721144024514811, "grad_norm": 1.7760595083236694, "learning_rate": 7.81159056110749e-06, "loss": 0.6717, "mean_token_accuracy": 0.7822202444076538, "step": 2528 }, { "epoch": 1.721824991487913, "grad_norm": 1.6945176124572754, "learning_rate": 7.809727484379424e-06, "loss": 0.7477, "mean_token_accuracy": 0.7403103709220886, "step": 2529 }, { "epoch": 1.7225059584610145, "grad_norm": 1.7060363292694092, "learning_rate": 7.807863837323472e-06, "loss": 0.7415, "mean_token_accuracy": 0.7551876306533813, "step": 2530 }, { "epoch": 1.7231869254341166, "grad_norm": 1.8542426824569702, "learning_rate": 7.805999620317925e-06, "loss": 0.6516, "mean_token_accuracy": 0.7782950103282928, "step": 2531 }, { "epoch": 1.7238678924072182, "grad_norm": 1.890220046043396, "learning_rate": 7.804134833741184e-06, "loss": 0.6641, "mean_token_accuracy": 0.7873983085155487, "step": 2532 }, { "epoch": 1.7245488593803202, "grad_norm": 1.8519560098648071, "learning_rate": 7.802269477971771e-06, "loss": 0.6752, "mean_token_accuracy": 0.7896825969219208, "step": 2533 }, { "epoch": 1.7252298263534218, "grad_norm": 1.9827033281326294, "learning_rate": 7.800403553388323e-06, "loss": 0.7132, "mean_token_accuracy": 0.7630014717578888, "step": 2534 }, { "epoch": 1.7259107933265236, "grad_norm": 1.6830345392227173, "learning_rate": 7.798537060369591e-06, "loss": 0.753, "mean_token_accuracy": 0.7321875691413879, "step": 2535 }, { "epoch": 1.7265917602996255, "grad_norm": 1.7052271366119385, "learning_rate": 7.796669999294441e-06, "loss": 0.6244, "mean_token_accuracy": 0.7933385074138641, "step": 2536 }, { "epoch": 1.7272727272727273, "grad_norm": 1.723610520362854, "learning_rate": 7.794802370541857e-06, "loss": 0.6946, "mean_token_accuracy": 0.7736456096172333, "step": 2537 }, { "epoch": 1.7279536942458291, "grad_norm": 1.7155705690383911, "learning_rate": 7.792934174490936e-06, "loss": 0.713, "mean_token_accuracy": 0.7670412361621857, "step": 2538 }, { "epoch": 1.7286346612189307, "grad_norm": 1.7221612930297852, "learning_rate": 7.791065411520891e-06, "loss": 0.7483, "mean_token_accuracy": 0.7583925724029541, "step": 2539 }, { "epoch": 1.7293156281920328, "grad_norm": 2.0448710918426514, "learning_rate": 7.789196082011049e-06, "loss": 0.6814, "mean_token_accuracy": 0.7791231870651245, "step": 2540 }, { "epoch": 1.7299965951651344, "grad_norm": 1.8312963247299194, "learning_rate": 7.787326186340852e-06, "loss": 0.794, "mean_token_accuracy": 0.7547344863414764, "step": 2541 }, { "epoch": 1.7306775621382364, "grad_norm": 1.7240263223648071, "learning_rate": 7.785455724889858e-06, "loss": 0.758, "mean_token_accuracy": 0.7466892898082733, "step": 2542 }, { "epoch": 1.731358529111338, "grad_norm": 1.8301376104354858, "learning_rate": 7.783584698037743e-06, "loss": 0.6974, "mean_token_accuracy": 0.7644175589084625, "step": 2543 }, { "epoch": 1.7320394960844399, "grad_norm": 1.76872980594635, "learning_rate": 7.78171310616429e-06, "loss": 0.734, "mean_token_accuracy": 0.760280430316925, "step": 2544 }, { "epoch": 1.7327204630575417, "grad_norm": 1.6686601638793945, "learning_rate": 7.779840949649402e-06, "loss": 0.7236, "mean_token_accuracy": 0.7548805773258209, "step": 2545 }, { "epoch": 1.7334014300306435, "grad_norm": 1.718361496925354, "learning_rate": 7.777968228873096e-06, "loss": 0.8898, "mean_token_accuracy": 0.7040141224861145, "step": 2546 }, { "epoch": 1.7340823970037453, "grad_norm": 1.8759686946868896, "learning_rate": 7.776094944215504e-06, "loss": 0.6488, "mean_token_accuracy": 0.7832852602005005, "step": 2547 }, { "epoch": 1.7347633639768472, "grad_norm": 1.7387855052947998, "learning_rate": 7.77422109605687e-06, "loss": 0.707, "mean_token_accuracy": 0.7456679344177246, "step": 2548 }, { "epoch": 1.735444330949949, "grad_norm": 1.822176218032837, "learning_rate": 7.772346684777558e-06, "loss": 0.7396, "mean_token_accuracy": 0.7543987929821014, "step": 2549 }, { "epoch": 1.7361252979230506, "grad_norm": 1.5636299848556519, "learning_rate": 7.770471710758037e-06, "loss": 0.9135, "mean_token_accuracy": 0.7047868072986603, "step": 2550 }, { "epoch": 1.7368062648961526, "grad_norm": 1.5984071493148804, "learning_rate": 7.768596174378896e-06, "loss": 0.8568, "mean_token_accuracy": 0.7241581380367279, "step": 2551 }, { "epoch": 1.7374872318692542, "grad_norm": 1.6529347896575928, "learning_rate": 7.76672007602084e-06, "loss": 0.8652, "mean_token_accuracy": 0.7242274284362793, "step": 2552 }, { "epoch": 1.7381681988423563, "grad_norm": 1.8437511920928955, "learning_rate": 7.764843416064688e-06, "loss": 0.6449, "mean_token_accuracy": 0.7926639318466187, "step": 2553 }, { "epoch": 1.738849165815458, "grad_norm": 1.79288649559021, "learning_rate": 7.762966194891363e-06, "loss": 0.8667, "mean_token_accuracy": 0.7250056564807892, "step": 2554 }, { "epoch": 1.7395301327885597, "grad_norm": 1.865576148033142, "learning_rate": 7.761088412881917e-06, "loss": 0.6483, "mean_token_accuracy": 0.7851472795009613, "step": 2555 }, { "epoch": 1.7402110997616616, "grad_norm": 1.8550467491149902, "learning_rate": 7.759210070417506e-06, "loss": 0.7419, "mean_token_accuracy": 0.7273185849189758, "step": 2556 }, { "epoch": 1.7408920667347634, "grad_norm": 1.7774324417114258, "learning_rate": 7.7573311678794e-06, "loss": 0.6287, "mean_token_accuracy": 0.7966981828212738, "step": 2557 }, { "epoch": 1.7415730337078652, "grad_norm": 1.6206032037734985, "learning_rate": 7.75545170564899e-06, "loss": 0.7719, "mean_token_accuracy": 0.7548860311508179, "step": 2558 }, { "epoch": 1.7422540006809668, "grad_norm": 1.7492214441299438, "learning_rate": 7.75357168410777e-06, "loss": 0.6943, "mean_token_accuracy": 0.7672472596168518, "step": 2559 }, { "epoch": 1.7429349676540689, "grad_norm": 1.7774120569229126, "learning_rate": 7.751691103637355e-06, "loss": 0.637, "mean_token_accuracy": 0.7816348671913147, "step": 2560 }, { "epoch": 1.7436159346271705, "grad_norm": 1.7840579748153687, "learning_rate": 7.749809964619475e-06, "loss": 0.7059, "mean_token_accuracy": 0.7693662643432617, "step": 2561 }, { "epoch": 1.7442969016002725, "grad_norm": 1.8130426406860352, "learning_rate": 7.747928267435965e-06, "loss": 0.7218, "mean_token_accuracy": 0.7618862986564636, "step": 2562 }, { "epoch": 1.7449778685733741, "grad_norm": 1.749221682548523, "learning_rate": 7.74604601246878e-06, "loss": 0.8444, "mean_token_accuracy": 0.7430354058742523, "step": 2563 }, { "epoch": 1.7456588355464762, "grad_norm": 1.835204005241394, "learning_rate": 7.744163200099987e-06, "loss": 0.5731, "mean_token_accuracy": 0.8052840530872345, "step": 2564 }, { "epoch": 1.7463398025195778, "grad_norm": 1.8027058839797974, "learning_rate": 7.742279830711765e-06, "loss": 0.6893, "mean_token_accuracy": 0.7741183340549469, "step": 2565 }, { "epoch": 1.7470207694926796, "grad_norm": 1.6205134391784668, "learning_rate": 7.740395904686407e-06, "loss": 0.828, "mean_token_accuracy": 0.7181025445461273, "step": 2566 }, { "epoch": 1.7477017364657814, "grad_norm": 1.6772617101669312, "learning_rate": 7.738511422406319e-06, "loss": 0.834, "mean_token_accuracy": 0.7364521622657776, "step": 2567 }, { "epoch": 1.7483827034388832, "grad_norm": 1.8959431648254395, "learning_rate": 7.736626384254017e-06, "loss": 0.7305, "mean_token_accuracy": 0.7706084847450256, "step": 2568 }, { "epoch": 1.749063670411985, "grad_norm": 1.7671624422073364, "learning_rate": 7.734740790612137e-06, "loss": 0.7833, "mean_token_accuracy": 0.7263516187667847, "step": 2569 }, { "epoch": 1.7497446373850867, "grad_norm": 1.766596794128418, "learning_rate": 7.732854641863416e-06, "loss": 0.6914, "mean_token_accuracy": 0.7525404393672943, "step": 2570 }, { "epoch": 1.7504256043581887, "grad_norm": 1.8030822277069092, "learning_rate": 7.730967938390718e-06, "loss": 0.6654, "mean_token_accuracy": 0.7787505686283112, "step": 2571 }, { "epoch": 1.7511065713312903, "grad_norm": 1.7863719463348389, "learning_rate": 7.72908068057701e-06, "loss": 0.6765, "mean_token_accuracy": 0.7779499292373657, "step": 2572 }, { "epoch": 1.7517875383043924, "grad_norm": 1.889750361442566, "learning_rate": 7.727192868805371e-06, "loss": 0.6169, "mean_token_accuracy": 0.7922307252883911, "step": 2573 }, { "epoch": 1.752468505277494, "grad_norm": 1.7228353023529053, "learning_rate": 7.725304503459e-06, "loss": 0.6619, "mean_token_accuracy": 0.7618470788002014, "step": 2574 }, { "epoch": 1.7531494722505958, "grad_norm": 1.6992989778518677, "learning_rate": 7.723415584921201e-06, "loss": 0.6925, "mean_token_accuracy": 0.7765218019485474, "step": 2575 }, { "epoch": 1.7538304392236976, "grad_norm": 1.8473403453826904, "learning_rate": 7.721526113575394e-06, "loss": 0.7379, "mean_token_accuracy": 0.765343576669693, "step": 2576 }, { "epoch": 1.7545114061967995, "grad_norm": 1.8120911121368408, "learning_rate": 7.719636089805106e-06, "loss": 0.6389, "mean_token_accuracy": 0.7846944332122803, "step": 2577 }, { "epoch": 1.7551923731699013, "grad_norm": 1.8384000062942505, "learning_rate": 7.717745513993989e-06, "loss": 0.9015, "mean_token_accuracy": 0.7040509581565857, "step": 2578 }, { "epoch": 1.7558733401430031, "grad_norm": 1.9651625156402588, "learning_rate": 7.71585438652579e-06, "loss": 0.6141, "mean_token_accuracy": 0.7971543669700623, "step": 2579 }, { "epoch": 1.756554307116105, "grad_norm": 1.649552583694458, "learning_rate": 7.713962707784382e-06, "loss": 0.7362, "mean_token_accuracy": 0.7652300596237183, "step": 2580 }, { "epoch": 1.7572352740892065, "grad_norm": 1.8709138631820679, "learning_rate": 7.71207047815374e-06, "loss": 0.639, "mean_token_accuracy": 0.792895495891571, "step": 2581 }, { "epoch": 1.7579162410623086, "grad_norm": 1.834048867225647, "learning_rate": 7.71017769801796e-06, "loss": 0.632, "mean_token_accuracy": 0.7900158762931824, "step": 2582 }, { "epoch": 1.7585972080354102, "grad_norm": 1.7146788835525513, "learning_rate": 7.70828436776124e-06, "loss": 0.758, "mean_token_accuracy": 0.7265433967113495, "step": 2583 }, { "epoch": 1.7592781750085122, "grad_norm": 1.711235523223877, "learning_rate": 7.706390487767896e-06, "loss": 0.7988, "mean_token_accuracy": 0.7468386292457581, "step": 2584 }, { "epoch": 1.7599591419816139, "grad_norm": 1.7523739337921143, "learning_rate": 7.704496058422356e-06, "loss": 0.6472, "mean_token_accuracy": 0.7774987518787384, "step": 2585 }, { "epoch": 1.7606401089547157, "grad_norm": 1.9061822891235352, "learning_rate": 7.702601080109155e-06, "loss": 0.7472, "mean_token_accuracy": 0.7338480055332184, "step": 2586 }, { "epoch": 1.7613210759278175, "grad_norm": 1.7913142442703247, "learning_rate": 7.700705553212945e-06, "loss": 0.8213, "mean_token_accuracy": 0.7401351928710938, "step": 2587 }, { "epoch": 1.7620020429009193, "grad_norm": 1.626386046409607, "learning_rate": 7.698809478118484e-06, "loss": 0.6866, "mean_token_accuracy": 0.762957751750946, "step": 2588 }, { "epoch": 1.7626830098740212, "grad_norm": 1.9355688095092773, "learning_rate": 7.696912855210644e-06, "loss": 0.6801, "mean_token_accuracy": 0.785697340965271, "step": 2589 }, { "epoch": 1.7633639768471228, "grad_norm": 1.784509301185608, "learning_rate": 7.69501568487441e-06, "loss": 0.6907, "mean_token_accuracy": 0.7755035161972046, "step": 2590 }, { "epoch": 1.7640449438202248, "grad_norm": 1.8420021533966064, "learning_rate": 7.69311796749487e-06, "loss": 0.6131, "mean_token_accuracy": 0.7944734692573547, "step": 2591 }, { "epoch": 1.7647259107933264, "grad_norm": 1.781553864479065, "learning_rate": 7.691219703457236e-06, "loss": 0.7751, "mean_token_accuracy": 0.7447803020477295, "step": 2592 }, { "epoch": 1.7654068777664285, "grad_norm": 1.7084553241729736, "learning_rate": 7.689320893146821e-06, "loss": 0.6866, "mean_token_accuracy": 0.7774416208267212, "step": 2593 }, { "epoch": 1.76608784473953, "grad_norm": 1.7533740997314453, "learning_rate": 7.687421536949053e-06, "loss": 0.7646, "mean_token_accuracy": 0.760994017124176, "step": 2594 }, { "epoch": 1.766768811712632, "grad_norm": 1.6832720041275024, "learning_rate": 7.68552163524947e-06, "loss": 0.7491, "mean_token_accuracy": 0.7444806098937988, "step": 2595 }, { "epoch": 1.7674497786857337, "grad_norm": 1.829506516456604, "learning_rate": 7.683621188433714e-06, "loss": 0.6738, "mean_token_accuracy": 0.7573264539241791, "step": 2596 }, { "epoch": 1.7681307456588355, "grad_norm": 1.7673567533493042, "learning_rate": 7.681720196887551e-06, "loss": 0.769, "mean_token_accuracy": 0.7317969799041748, "step": 2597 }, { "epoch": 1.7688117126319374, "grad_norm": 1.8528804779052734, "learning_rate": 7.679818660996851e-06, "loss": 0.7136, "mean_token_accuracy": 0.757018119096756, "step": 2598 }, { "epoch": 1.7694926796050392, "grad_norm": 2.0018274784088135, "learning_rate": 7.677916581147593e-06, "loss": 0.7743, "mean_token_accuracy": 0.7449090778827667, "step": 2599 }, { "epoch": 1.770173646578141, "grad_norm": 1.9784246683120728, "learning_rate": 7.676013957725864e-06, "loss": 0.6532, "mean_token_accuracy": 0.7851862907409668, "step": 2600 }, { "epoch": 1.7708546135512426, "grad_norm": 1.6909499168395996, "learning_rate": 7.674110791117869e-06, "loss": 0.8295, "mean_token_accuracy": 0.7330290079116821, "step": 2601 }, { "epoch": 1.7715355805243447, "grad_norm": 1.685018539428711, "learning_rate": 7.672207081709914e-06, "loss": 0.777, "mean_token_accuracy": 0.736683577299118, "step": 2602 }, { "epoch": 1.7722165474974463, "grad_norm": 1.9552536010742188, "learning_rate": 7.670302829888427e-06, "loss": 0.6906, "mean_token_accuracy": 0.7763275504112244, "step": 2603 }, { "epoch": 1.7728975144705483, "grad_norm": 1.6207478046417236, "learning_rate": 7.668398036039936e-06, "loss": 0.7364, "mean_token_accuracy": 0.7356691956520081, "step": 2604 }, { "epoch": 1.77357848144365, "grad_norm": 1.755366563796997, "learning_rate": 7.666492700551083e-06, "loss": 0.7675, "mean_token_accuracy": 0.7520604729652405, "step": 2605 }, { "epoch": 1.7742594484167518, "grad_norm": 1.7446707487106323, "learning_rate": 7.664586823808618e-06, "loss": 0.709, "mean_token_accuracy": 0.7681779265403748, "step": 2606 }, { "epoch": 1.7749404153898536, "grad_norm": 1.653554081916809, "learning_rate": 7.662680406199402e-06, "loss": 0.7103, "mean_token_accuracy": 0.7485825419425964, "step": 2607 }, { "epoch": 1.7756213823629554, "grad_norm": 1.6386430263519287, "learning_rate": 7.660773448110408e-06, "loss": 0.7817, "mean_token_accuracy": 0.7381926476955414, "step": 2608 }, { "epoch": 1.7763023493360572, "grad_norm": 1.7419747114181519, "learning_rate": 7.658865949928717e-06, "loss": 0.584, "mean_token_accuracy": 0.8131625652313232, "step": 2609 }, { "epoch": 1.7769833163091588, "grad_norm": 1.6809585094451904, "learning_rate": 7.656957912041519e-06, "loss": 0.794, "mean_token_accuracy": 0.7497469782829285, "step": 2610 }, { "epoch": 1.777664283282261, "grad_norm": 1.847926139831543, "learning_rate": 7.655049334836111e-06, "loss": 0.7017, "mean_token_accuracy": 0.7548607289791107, "step": 2611 }, { "epoch": 1.7783452502553625, "grad_norm": 1.791825771331787, "learning_rate": 7.653140218699906e-06, "loss": 0.6834, "mean_token_accuracy": 0.763765275478363, "step": 2612 }, { "epoch": 1.7790262172284645, "grad_norm": 1.8539193868637085, "learning_rate": 7.651230564020423e-06, "loss": 0.7787, "mean_token_accuracy": 0.7442516088485718, "step": 2613 }, { "epoch": 1.7797071842015662, "grad_norm": 1.813123345375061, "learning_rate": 7.649320371185287e-06, "loss": 0.7898, "mean_token_accuracy": 0.7199622392654419, "step": 2614 }, { "epoch": 1.780388151174668, "grad_norm": 1.8346434831619263, "learning_rate": 7.647409640582234e-06, "loss": 0.7361, "mean_token_accuracy": 0.7346886694431305, "step": 2615 }, { "epoch": 1.7810691181477698, "grad_norm": 1.7471404075622559, "learning_rate": 7.645498372599117e-06, "loss": 0.7111, "mean_token_accuracy": 0.7721620202064514, "step": 2616 }, { "epoch": 1.7817500851208716, "grad_norm": 1.970525860786438, "learning_rate": 7.643586567623889e-06, "loss": 0.758, "mean_token_accuracy": 0.7589958310127258, "step": 2617 }, { "epoch": 1.7824310520939735, "grad_norm": 1.7812138795852661, "learning_rate": 7.64167422604461e-06, "loss": 0.825, "mean_token_accuracy": 0.7062784433364868, "step": 2618 }, { "epoch": 1.7831120190670753, "grad_norm": 1.7942270040512085, "learning_rate": 7.639761348249459e-06, "loss": 0.6556, "mean_token_accuracy": 0.7698946595191956, "step": 2619 }, { "epoch": 1.783792986040177, "grad_norm": 1.8737635612487793, "learning_rate": 7.637847934626717e-06, "loss": 0.6729, "mean_token_accuracy": 0.7728573381900787, "step": 2620 }, { "epoch": 1.7844739530132787, "grad_norm": 1.8924401998519897, "learning_rate": 7.635933985564774e-06, "loss": 0.7056, "mean_token_accuracy": 0.7754846215248108, "step": 2621 }, { "epoch": 1.7851549199863808, "grad_norm": 1.8498435020446777, "learning_rate": 7.634019501452128e-06, "loss": 0.6428, "mean_token_accuracy": 0.7877029776573181, "step": 2622 }, { "epoch": 1.7858358869594824, "grad_norm": 1.7105543613433838, "learning_rate": 7.632104482677392e-06, "loss": 0.8421, "mean_token_accuracy": 0.7295881807804108, "step": 2623 }, { "epoch": 1.7865168539325844, "grad_norm": 1.7067312002182007, "learning_rate": 7.630188929629279e-06, "loss": 0.8283, "mean_token_accuracy": 0.7432414889335632, "step": 2624 }, { "epoch": 1.787197820905686, "grad_norm": 1.7202378511428833, "learning_rate": 7.628272842696618e-06, "loss": 0.7633, "mean_token_accuracy": 0.7398443818092346, "step": 2625 }, { "epoch": 1.7878787878787878, "grad_norm": 1.8566533327102661, "learning_rate": 7.6263562222683385e-06, "loss": 0.8024, "mean_token_accuracy": 0.7469236552715302, "step": 2626 }, { "epoch": 1.7885597548518897, "grad_norm": 1.5257207155227661, "learning_rate": 7.624439068733485e-06, "loss": 0.779, "mean_token_accuracy": 0.743416428565979, "step": 2627 }, { "epoch": 1.7892407218249915, "grad_norm": 1.823519229888916, "learning_rate": 7.622521382481208e-06, "loss": 0.7484, "mean_token_accuracy": 0.76353320479393, "step": 2628 }, { "epoch": 1.7899216887980933, "grad_norm": 1.6711076498031616, "learning_rate": 7.6206031639007646e-06, "loss": 0.6636, "mean_token_accuracy": 0.7835299372673035, "step": 2629 }, { "epoch": 1.7906026557711952, "grad_norm": 1.8915756940841675, "learning_rate": 7.618684413381523e-06, "loss": 0.6652, "mean_token_accuracy": 0.7818475365638733, "step": 2630 }, { "epoch": 1.791283622744297, "grad_norm": 1.9255069494247437, "learning_rate": 7.616765131312955e-06, "loss": 0.7773, "mean_token_accuracy": 0.7261184453964233, "step": 2631 }, { "epoch": 1.7919645897173986, "grad_norm": 1.894938349723816, "learning_rate": 7.614845318084647e-06, "loss": 0.6193, "mean_token_accuracy": 0.7900662422180176, "step": 2632 }, { "epoch": 1.7926455566905006, "grad_norm": 1.764191746711731, "learning_rate": 7.6129249740862844e-06, "loss": 0.8073, "mean_token_accuracy": 0.7316588163375854, "step": 2633 }, { "epoch": 1.7933265236636022, "grad_norm": 1.8971145153045654, "learning_rate": 7.611004099707668e-06, "loss": 0.6574, "mean_token_accuracy": 0.780353456735611, "step": 2634 }, { "epoch": 1.7940074906367043, "grad_norm": 1.9240455627441406, "learning_rate": 7.609082695338703e-06, "loss": 0.6416, "mean_token_accuracy": 0.791772186756134, "step": 2635 }, { "epoch": 1.7946884576098059, "grad_norm": 1.7023851871490479, "learning_rate": 7.607160761369401e-06, "loss": 0.6529, "mean_token_accuracy": 0.7875257730484009, "step": 2636 }, { "epoch": 1.7953694245829077, "grad_norm": 1.6456338167190552, "learning_rate": 7.605238298189883e-06, "loss": 0.7413, "mean_token_accuracy": 0.7443355023860931, "step": 2637 }, { "epoch": 1.7960503915560095, "grad_norm": 1.7462388277053833, "learning_rate": 7.603315306190377e-06, "loss": 0.7379, "mean_token_accuracy": 0.7628051340579987, "step": 2638 }, { "epoch": 1.7967313585291114, "grad_norm": 1.6902940273284912, "learning_rate": 7.601391785761222e-06, "loss": 0.7138, "mean_token_accuracy": 0.7614813148975372, "step": 2639 }, { "epoch": 1.7974123255022132, "grad_norm": 1.830885648727417, "learning_rate": 7.599467737292856e-06, "loss": 0.723, "mean_token_accuracy": 0.74856036901474, "step": 2640 }, { "epoch": 1.7980932924753148, "grad_norm": 1.560758352279663, "learning_rate": 7.5975431611758295e-06, "loss": 0.8993, "mean_token_accuracy": 0.7194406688213348, "step": 2641 }, { "epoch": 1.7987742594484168, "grad_norm": 1.7514225244522095, "learning_rate": 7.595618057800801e-06, "loss": 0.7497, "mean_token_accuracy": 0.7467973232269287, "step": 2642 }, { "epoch": 1.7994552264215185, "grad_norm": 1.837546706199646, "learning_rate": 7.593692427558535e-06, "loss": 0.6915, "mean_token_accuracy": 0.7727757692337036, "step": 2643 }, { "epoch": 1.8001361933946205, "grad_norm": 1.807426929473877, "learning_rate": 7.591766270839897e-06, "loss": 0.7707, "mean_token_accuracy": 0.7514422237873077, "step": 2644 }, { "epoch": 1.800817160367722, "grad_norm": 1.6326695680618286, "learning_rate": 7.5898395880358725e-06, "loss": 0.7242, "mean_token_accuracy": 0.7675060331821442, "step": 2645 }, { "epoch": 1.801498127340824, "grad_norm": 1.9096651077270508, "learning_rate": 7.587912379537541e-06, "loss": 0.7753, "mean_token_accuracy": 0.7496428191661835, "step": 2646 }, { "epoch": 1.8021790943139258, "grad_norm": 1.6652699708938599, "learning_rate": 7.585984645736095e-06, "loss": 0.6546, "mean_token_accuracy": 0.7652639746665955, "step": 2647 }, { "epoch": 1.8028600612870276, "grad_norm": 1.6854914426803589, "learning_rate": 7.584056387022832e-06, "loss": 0.6715, "mean_token_accuracy": 0.779331237077713, "step": 2648 }, { "epoch": 1.8035410282601294, "grad_norm": 1.5978360176086426, "learning_rate": 7.582127603789157e-06, "loss": 0.6845, "mean_token_accuracy": 0.7955123484134674, "step": 2649 }, { "epoch": 1.8042219952332312, "grad_norm": 2.0092008113861084, "learning_rate": 7.5801982964265815e-06, "loss": 0.716, "mean_token_accuracy": 0.7601533830165863, "step": 2650 }, { "epoch": 1.804902962206333, "grad_norm": 1.770376205444336, "learning_rate": 7.578268465326721e-06, "loss": 0.8366, "mean_token_accuracy": 0.7144425809383392, "step": 2651 }, { "epoch": 1.8055839291794347, "grad_norm": 1.64827561378479, "learning_rate": 7.576338110881301e-06, "loss": 0.718, "mean_token_accuracy": 0.777849555015564, "step": 2652 }, { "epoch": 1.8062648961525367, "grad_norm": 1.7078773975372314, "learning_rate": 7.574407233482148e-06, "loss": 0.696, "mean_token_accuracy": 0.7649381160736084, "step": 2653 }, { "epoch": 1.8069458631256383, "grad_norm": 1.6807456016540527, "learning_rate": 7.572475833521202e-06, "loss": 0.7424, "mean_token_accuracy": 0.7626325190067291, "step": 2654 }, { "epoch": 1.8076268300987404, "grad_norm": 1.7045859098434448, "learning_rate": 7.5705439113905046e-06, "loss": 0.6887, "mean_token_accuracy": 0.7627581655979156, "step": 2655 }, { "epoch": 1.808307797071842, "grad_norm": 1.7765161991119385, "learning_rate": 7.568611467482201e-06, "loss": 0.8122, "mean_token_accuracy": 0.7484098672866821, "step": 2656 }, { "epoch": 1.8089887640449438, "grad_norm": 1.7377586364746094, "learning_rate": 7.5666785021885445e-06, "loss": 0.785, "mean_token_accuracy": 0.7444965243339539, "step": 2657 }, { "epoch": 1.8096697310180456, "grad_norm": 1.8454848527908325, "learning_rate": 7.5647450159019e-06, "loss": 0.6747, "mean_token_accuracy": 0.7746550142765045, "step": 2658 }, { "epoch": 1.8103506979911475, "grad_norm": 1.726546049118042, "learning_rate": 7.5628110090147285e-06, "loss": 0.8009, "mean_token_accuracy": 0.732487291097641, "step": 2659 }, { "epoch": 1.8110316649642493, "grad_norm": 1.7520558834075928, "learning_rate": 7.560876481919604e-06, "loss": 0.715, "mean_token_accuracy": 0.7540681660175323, "step": 2660 }, { "epoch": 1.8117126319373509, "grad_norm": 1.806736707687378, "learning_rate": 7.5589414350092015e-06, "loss": 0.7177, "mean_token_accuracy": 0.7699414193630219, "step": 2661 }, { "epoch": 1.812393598910453, "grad_norm": 1.6984241008758545, "learning_rate": 7.557005868676304e-06, "loss": 0.6493, "mean_token_accuracy": 0.774213582277298, "step": 2662 }, { "epoch": 1.8130745658835545, "grad_norm": 1.6428695917129517, "learning_rate": 7.555069783313798e-06, "loss": 0.7363, "mean_token_accuracy": 0.7588293552398682, "step": 2663 }, { "epoch": 1.8137555328566566, "grad_norm": 1.4097001552581787, "learning_rate": 7.553133179314678e-06, "loss": 0.8212, "mean_token_accuracy": 0.7327198386192322, "step": 2664 }, { "epoch": 1.8144364998297582, "grad_norm": 1.6966286897659302, "learning_rate": 7.551196057072043e-06, "loss": 0.7509, "mean_token_accuracy": 0.753719300031662, "step": 2665 }, { "epoch": 1.81511746680286, "grad_norm": 1.9560604095458984, "learning_rate": 7.549258416979094e-06, "loss": 0.6717, "mean_token_accuracy": 0.7647784650325775, "step": 2666 }, { "epoch": 1.8157984337759618, "grad_norm": 1.6067489385604858, "learning_rate": 7.54732025942914e-06, "loss": 0.7515, "mean_token_accuracy": 0.7530572712421417, "step": 2667 }, { "epoch": 1.8164794007490637, "grad_norm": 1.863168478012085, "learning_rate": 7.545381584815599e-06, "loss": 0.7779, "mean_token_accuracy": 0.7324115037918091, "step": 2668 }, { "epoch": 1.8171603677221655, "grad_norm": 1.695818543434143, "learning_rate": 7.543442393531985e-06, "loss": 0.8163, "mean_token_accuracy": 0.7351820766925812, "step": 2669 }, { "epoch": 1.8178413346952673, "grad_norm": 1.8063067197799683, "learning_rate": 7.541502685971922e-06, "loss": 0.7618, "mean_token_accuracy": 0.7477411925792694, "step": 2670 }, { "epoch": 1.8185223016683691, "grad_norm": 1.8407526016235352, "learning_rate": 7.539562462529142e-06, "loss": 0.704, "mean_token_accuracy": 0.7723397314548492, "step": 2671 }, { "epoch": 1.8192032686414707, "grad_norm": 1.794973611831665, "learning_rate": 7.537621723597474e-06, "loss": 0.6584, "mean_token_accuracy": 0.7857692241668701, "step": 2672 }, { "epoch": 1.8198842356145728, "grad_norm": 1.707361102104187, "learning_rate": 7.535680469570858e-06, "loss": 0.6853, "mean_token_accuracy": 0.7496468722820282, "step": 2673 }, { "epoch": 1.8205652025876744, "grad_norm": 1.7188961505889893, "learning_rate": 7.533738700843334e-06, "loss": 0.8123, "mean_token_accuracy": 0.763315737247467, "step": 2674 }, { "epoch": 1.8212461695607765, "grad_norm": 1.8952326774597168, "learning_rate": 7.53179641780905e-06, "loss": 0.6518, "mean_token_accuracy": 0.7883713245391846, "step": 2675 }, { "epoch": 1.821927136533878, "grad_norm": 1.752812147140503, "learning_rate": 7.529853620862257e-06, "loss": 0.7544, "mean_token_accuracy": 0.7467054724693298, "step": 2676 }, { "epoch": 1.8226081035069799, "grad_norm": 1.7068490982055664, "learning_rate": 7.5279103103973125e-06, "loss": 0.8218, "mean_token_accuracy": 0.7426196336746216, "step": 2677 }, { "epoch": 1.8232890704800817, "grad_norm": 1.732961654663086, "learning_rate": 7.525966486808674e-06, "loss": 0.747, "mean_token_accuracy": 0.7542922794818878, "step": 2678 }, { "epoch": 1.8239700374531835, "grad_norm": 1.6412076950073242, "learning_rate": 7.524022150490904e-06, "loss": 0.7025, "mean_token_accuracy": 0.7626677751541138, "step": 2679 }, { "epoch": 1.8246510044262854, "grad_norm": 1.6739858388900757, "learning_rate": 7.522077301838673e-06, "loss": 0.8114, "mean_token_accuracy": 0.7280959188938141, "step": 2680 }, { "epoch": 1.825331971399387, "grad_norm": 1.643215298652649, "learning_rate": 7.520131941246753e-06, "loss": 0.7544, "mean_token_accuracy": 0.7589448094367981, "step": 2681 }, { "epoch": 1.826012938372489, "grad_norm": 1.836077094078064, "learning_rate": 7.5181860691100165e-06, "loss": 0.6018, "mean_token_accuracy": 0.8011603653430939, "step": 2682 }, { "epoch": 1.8266939053455906, "grad_norm": 1.463793158531189, "learning_rate": 7.516239685823446e-06, "loss": 0.8273, "mean_token_accuracy": 0.7313739061355591, "step": 2683 }, { "epoch": 1.8273748723186927, "grad_norm": 1.6463496685028076, "learning_rate": 7.514292791782125e-06, "loss": 0.7495, "mean_token_accuracy": 0.7398199141025543, "step": 2684 }, { "epoch": 1.8280558392917943, "grad_norm": 1.6125226020812988, "learning_rate": 7.512345387381239e-06, "loss": 0.8527, "mean_token_accuracy": 0.7416198253631592, "step": 2685 }, { "epoch": 1.828736806264896, "grad_norm": 1.7438815832138062, "learning_rate": 7.51039747301608e-06, "loss": 0.6696, "mean_token_accuracy": 0.7864900529384613, "step": 2686 }, { "epoch": 1.829417773237998, "grad_norm": 1.9097577333450317, "learning_rate": 7.5084490490820406e-06, "loss": 0.6137, "mean_token_accuracy": 0.7931230068206787, "step": 2687 }, { "epoch": 1.8300987402110998, "grad_norm": 1.768645167350769, "learning_rate": 7.506500115974621e-06, "loss": 0.7111, "mean_token_accuracy": 0.7646727561950684, "step": 2688 }, { "epoch": 1.8307797071842016, "grad_norm": 1.9361999034881592, "learning_rate": 7.504550674089419e-06, "loss": 0.7209, "mean_token_accuracy": 0.7633779346942902, "step": 2689 }, { "epoch": 1.8314606741573034, "grad_norm": 1.8572628498077393, "learning_rate": 7.502600723822141e-06, "loss": 0.7263, "mean_token_accuracy": 0.7452824413776398, "step": 2690 }, { "epoch": 1.8321416411304052, "grad_norm": 1.7448725700378418, "learning_rate": 7.500650265568594e-06, "loss": 0.7519, "mean_token_accuracy": 0.741788238286972, "step": 2691 }, { "epoch": 1.8328226081035068, "grad_norm": 1.70537531375885, "learning_rate": 7.49869929972469e-06, "loss": 0.7035, "mean_token_accuracy": 0.7725488841533661, "step": 2692 }, { "epoch": 1.8335035750766089, "grad_norm": 1.7545404434204102, "learning_rate": 7.496747826686439e-06, "loss": 0.7259, "mean_token_accuracy": 0.7588635087013245, "step": 2693 }, { "epoch": 1.8341845420497105, "grad_norm": 1.712618112564087, "learning_rate": 7.494795846849958e-06, "loss": 0.7228, "mean_token_accuracy": 0.7509573996067047, "step": 2694 }, { "epoch": 1.8348655090228125, "grad_norm": 1.8301552534103394, "learning_rate": 7.492843360611472e-06, "loss": 0.8448, "mean_token_accuracy": 0.728617250919342, "step": 2695 }, { "epoch": 1.8355464759959141, "grad_norm": 1.8334134817123413, "learning_rate": 7.490890368367296e-06, "loss": 0.7349, "mean_token_accuracy": 0.7483829259872437, "step": 2696 }, { "epoch": 1.836227442969016, "grad_norm": 1.7318798303604126, "learning_rate": 7.488936870513859e-06, "loss": 0.7299, "mean_token_accuracy": 0.7585636377334595, "step": 2697 }, { "epoch": 1.8369084099421178, "grad_norm": 1.9687246084213257, "learning_rate": 7.486982867447688e-06, "loss": 0.8187, "mean_token_accuracy": 0.7311733663082123, "step": 2698 }, { "epoch": 1.8375893769152196, "grad_norm": 1.8214898109436035, "learning_rate": 7.485028359565412e-06, "loss": 0.6524, "mean_token_accuracy": 0.7836682796478271, "step": 2699 }, { "epoch": 1.8382703438883214, "grad_norm": 1.761605978012085, "learning_rate": 7.483073347263763e-06, "loss": 0.7228, "mean_token_accuracy": 0.7358288466930389, "step": 2700 }, { "epoch": 1.8389513108614233, "grad_norm": 1.750671148300171, "learning_rate": 7.48111783093958e-06, "loss": 0.7784, "mean_token_accuracy": 0.7417736351490021, "step": 2701 }, { "epoch": 1.839632277834525, "grad_norm": 1.9579483270645142, "learning_rate": 7.479161810989795e-06, "loss": 0.6653, "mean_token_accuracy": 0.7685501277446747, "step": 2702 }, { "epoch": 1.8403132448076267, "grad_norm": 1.7612597942352295, "learning_rate": 7.4772052878114495e-06, "loss": 0.8171, "mean_token_accuracy": 0.7374162375926971, "step": 2703 }, { "epoch": 1.8409942117807288, "grad_norm": 1.678595781326294, "learning_rate": 7.475248261801687e-06, "loss": 0.7255, "mean_token_accuracy": 0.7516993582248688, "step": 2704 }, { "epoch": 1.8416751787538304, "grad_norm": 1.9612678289413452, "learning_rate": 7.473290733357747e-06, "loss": 0.699, "mean_token_accuracy": 0.7693169713020325, "step": 2705 }, { "epoch": 1.8423561457269324, "grad_norm": 1.8237744569778442, "learning_rate": 7.471332702876981e-06, "loss": 0.5897, "mean_token_accuracy": 0.8063098788261414, "step": 2706 }, { "epoch": 1.843037112700034, "grad_norm": 1.8599791526794434, "learning_rate": 7.469374170756833e-06, "loss": 0.6989, "mean_token_accuracy": 0.7398560047149658, "step": 2707 }, { "epoch": 1.8437180796731358, "grad_norm": 1.6344975233078003, "learning_rate": 7.467415137394851e-06, "loss": 0.7404, "mean_token_accuracy": 0.7715768814086914, "step": 2708 }, { "epoch": 1.8443990466462377, "grad_norm": 1.9867238998413086, "learning_rate": 7.465455603188689e-06, "loss": 0.7644, "mean_token_accuracy": 0.7516380548477173, "step": 2709 }, { "epoch": 1.8450800136193395, "grad_norm": 1.7790638208389282, "learning_rate": 7.463495568536099e-06, "loss": 0.7005, "mean_token_accuracy": 0.7785283625125885, "step": 2710 }, { "epoch": 1.8457609805924413, "grad_norm": 1.8442232608795166, "learning_rate": 7.461535033834936e-06, "loss": 0.6651, "mean_token_accuracy": 0.7837604880332947, "step": 2711 }, { "epoch": 1.846441947565543, "grad_norm": 1.739822268486023, "learning_rate": 7.459573999483153e-06, "loss": 0.7715, "mean_token_accuracy": 0.7638511061668396, "step": 2712 }, { "epoch": 1.847122914538645, "grad_norm": 1.7878270149230957, "learning_rate": 7.4576124658788096e-06, "loss": 0.7423, "mean_token_accuracy": 0.7404406368732452, "step": 2713 }, { "epoch": 1.8478038815117466, "grad_norm": 1.7259665727615356, "learning_rate": 7.455650433420065e-06, "loss": 0.6805, "mean_token_accuracy": 0.7711426913738251, "step": 2714 }, { "epoch": 1.8484848484848486, "grad_norm": 1.6691337823867798, "learning_rate": 7.453687902505178e-06, "loss": 0.7288, "mean_token_accuracy": 0.7537149488925934, "step": 2715 }, { "epoch": 1.8491658154579502, "grad_norm": 1.7361657619476318, "learning_rate": 7.451724873532511e-06, "loss": 0.7727, "mean_token_accuracy": 0.7331708073616028, "step": 2716 }, { "epoch": 1.849846782431052, "grad_norm": 1.7250487804412842, "learning_rate": 7.4497613469005235e-06, "loss": 0.7087, "mean_token_accuracy": 0.7671894431114197, "step": 2717 }, { "epoch": 1.8505277494041539, "grad_norm": 1.7663251161575317, "learning_rate": 7.4477973230077815e-06, "loss": 0.7317, "mean_token_accuracy": 0.7691561877727509, "step": 2718 }, { "epoch": 1.8512087163772557, "grad_norm": 1.8266428709030151, "learning_rate": 7.445832802252947e-06, "loss": 0.7377, "mean_token_accuracy": 0.7827037870883942, "step": 2719 }, { "epoch": 1.8518896833503575, "grad_norm": 1.7783282995224, "learning_rate": 7.443867785034786e-06, "loss": 0.6521, "mean_token_accuracy": 0.7768807709217072, "step": 2720 }, { "epoch": 1.8525706503234594, "grad_norm": 1.7084063291549683, "learning_rate": 7.441902271752166e-06, "loss": 0.6319, "mean_token_accuracy": 0.7948492765426636, "step": 2721 }, { "epoch": 1.8532516172965612, "grad_norm": 1.8421449661254883, "learning_rate": 7.43993626280405e-06, "loss": 0.5948, "mean_token_accuracy": 0.8043064177036285, "step": 2722 }, { "epoch": 1.8539325842696628, "grad_norm": 1.7737030982971191, "learning_rate": 7.437969758589508e-06, "loss": 0.7521, "mean_token_accuracy": 0.7472672462463379, "step": 2723 }, { "epoch": 1.8546135512427648, "grad_norm": 1.6601802110671997, "learning_rate": 7.436002759507706e-06, "loss": 0.8318, "mean_token_accuracy": 0.7350498735904694, "step": 2724 }, { "epoch": 1.8552945182158664, "grad_norm": 1.6394767761230469, "learning_rate": 7.434035265957913e-06, "loss": 0.8693, "mean_token_accuracy": 0.726248562335968, "step": 2725 }, { "epoch": 1.8559754851889685, "grad_norm": 1.798323392868042, "learning_rate": 7.432067278339496e-06, "loss": 0.6982, "mean_token_accuracy": 0.7680550515651703, "step": 2726 }, { "epoch": 1.85665645216207, "grad_norm": 1.6244382858276367, "learning_rate": 7.430098797051926e-06, "loss": 0.8157, "mean_token_accuracy": 0.7258849740028381, "step": 2727 }, { "epoch": 1.857337419135172, "grad_norm": 1.5735372304916382, "learning_rate": 7.42812982249477e-06, "loss": 0.7919, "mean_token_accuracy": 0.7204679250717163, "step": 2728 }, { "epoch": 1.8580183861082737, "grad_norm": 1.8519638776779175, "learning_rate": 7.426160355067696e-06, "loss": 0.7182, "mean_token_accuracy": 0.7510006725788116, "step": 2729 }, { "epoch": 1.8586993530813756, "grad_norm": 1.7504626512527466, "learning_rate": 7.424190395170476e-06, "loss": 0.6416, "mean_token_accuracy": 0.7810568809509277, "step": 2730 }, { "epoch": 1.8593803200544774, "grad_norm": 1.7656965255737305, "learning_rate": 7.422219943202977e-06, "loss": 0.6277, "mean_token_accuracy": 0.8054913878440857, "step": 2731 }, { "epoch": 1.860061287027579, "grad_norm": 1.7189867496490479, "learning_rate": 7.42024899956517e-06, "loss": 0.7158, "mean_token_accuracy": 0.7694578468799591, "step": 2732 }, { "epoch": 1.860742254000681, "grad_norm": 1.88117516040802, "learning_rate": 7.418277564657122e-06, "loss": 0.7967, "mean_token_accuracy": 0.7298433780670166, "step": 2733 }, { "epoch": 1.8614232209737827, "grad_norm": 1.7308566570281982, "learning_rate": 7.4163056388790025e-06, "loss": 0.6814, "mean_token_accuracy": 0.7517147362232208, "step": 2734 }, { "epoch": 1.8621041879468847, "grad_norm": 1.8354604244232178, "learning_rate": 7.414333222631077e-06, "loss": 0.6718, "mean_token_accuracy": 0.7853763103485107, "step": 2735 }, { "epoch": 1.8627851549199863, "grad_norm": 1.5006234645843506, "learning_rate": 7.412360316313717e-06, "loss": 0.8643, "mean_token_accuracy": 0.7297048270702362, "step": 2736 }, { "epoch": 1.8634661218930881, "grad_norm": 1.643819808959961, "learning_rate": 7.410386920327388e-06, "loss": 0.8373, "mean_token_accuracy": 0.7157947421073914, "step": 2737 }, { "epoch": 1.86414708886619, "grad_norm": 1.9521781206130981, "learning_rate": 7.408413035072654e-06, "loss": 0.6341, "mean_token_accuracy": 0.7798475027084351, "step": 2738 }, { "epoch": 1.8648280558392918, "grad_norm": 1.7322988510131836, "learning_rate": 7.406438660950184e-06, "loss": 0.8445, "mean_token_accuracy": 0.7111397683620453, "step": 2739 }, { "epoch": 1.8655090228123936, "grad_norm": 1.8181164264678955, "learning_rate": 7.404463798360744e-06, "loss": 0.5987, "mean_token_accuracy": 0.7926665544509888, "step": 2740 }, { "epoch": 1.8661899897854954, "grad_norm": 1.6533734798431396, "learning_rate": 7.402488447705194e-06, "loss": 0.7492, "mean_token_accuracy": 0.7511542439460754, "step": 2741 }, { "epoch": 1.8668709567585973, "grad_norm": 1.803444266319275, "learning_rate": 7.4005126093845005e-06, "loss": 0.7834, "mean_token_accuracy": 0.7489103376865387, "step": 2742 }, { "epoch": 1.8675519237316989, "grad_norm": 1.8537098169326782, "learning_rate": 7.3985362837997244e-06, "loss": 0.6384, "mean_token_accuracy": 0.7920272946357727, "step": 2743 }, { "epoch": 1.868232890704801, "grad_norm": 1.7166157960891724, "learning_rate": 7.396559471352029e-06, "loss": 0.7901, "mean_token_accuracy": 0.7522743940353394, "step": 2744 }, { "epoch": 1.8689138576779025, "grad_norm": 1.781941294670105, "learning_rate": 7.394582172442672e-06, "loss": 0.7785, "mean_token_accuracy": 0.7577338516712189, "step": 2745 }, { "epoch": 1.8695948246510046, "grad_norm": 1.550152063369751, "learning_rate": 7.392604387473011e-06, "loss": 0.7506, "mean_token_accuracy": 0.7518685758113861, "step": 2746 }, { "epoch": 1.8702757916241062, "grad_norm": 1.8206350803375244, "learning_rate": 7.390626116844508e-06, "loss": 0.8494, "mean_token_accuracy": 0.737255185842514, "step": 2747 }, { "epoch": 1.870956758597208, "grad_norm": 1.8622734546661377, "learning_rate": 7.3886473609587165e-06, "loss": 0.7227, "mean_token_accuracy": 0.7729296386241913, "step": 2748 }, { "epoch": 1.8716377255703098, "grad_norm": 1.7876381874084473, "learning_rate": 7.386668120217289e-06, "loss": 0.7016, "mean_token_accuracy": 0.7707555592060089, "step": 2749 }, { "epoch": 1.8723186925434117, "grad_norm": 1.816217064857483, "learning_rate": 7.38468839502198e-06, "loss": 0.8061, "mean_token_accuracy": 0.7162515819072723, "step": 2750 }, { "epoch": 1.8729996595165135, "grad_norm": 1.8420867919921875, "learning_rate": 7.3827081857746404e-06, "loss": 0.7236, "mean_token_accuracy": 0.7611306607723236, "step": 2751 }, { "epoch": 1.8736806264896153, "grad_norm": 1.686049461364746, "learning_rate": 7.380727492877222e-06, "loss": 0.7534, "mean_token_accuracy": 0.7494299113750458, "step": 2752 }, { "epoch": 1.8743615934627171, "grad_norm": 1.8858559131622314, "learning_rate": 7.3787463167317705e-06, "loss": 0.6725, "mean_token_accuracy": 0.7785737812519073, "step": 2753 }, { "epoch": 1.8750425604358187, "grad_norm": 1.6397924423217773, "learning_rate": 7.376764657740431e-06, "loss": 0.9091, "mean_token_accuracy": 0.7007673978805542, "step": 2754 }, { "epoch": 1.8757235274089208, "grad_norm": 1.6446930170059204, "learning_rate": 7.374782516305448e-06, "loss": 0.8298, "mean_token_accuracy": 0.7183621823787689, "step": 2755 }, { "epoch": 1.8764044943820224, "grad_norm": 1.7357335090637207, "learning_rate": 7.372799892829165e-06, "loss": 0.8032, "mean_token_accuracy": 0.75037682056427, "step": 2756 }, { "epoch": 1.8770854613551244, "grad_norm": 1.8044798374176025, "learning_rate": 7.370816787714018e-06, "loss": 0.7158, "mean_token_accuracy": 0.7689070701599121, "step": 2757 }, { "epoch": 1.877766428328226, "grad_norm": 1.5688132047653198, "learning_rate": 7.368833201362546e-06, "loss": 0.8145, "mean_token_accuracy": 0.7375842332839966, "step": 2758 }, { "epoch": 1.8784473953013279, "grad_norm": 1.6917669773101807, "learning_rate": 7.3668491341773866e-06, "loss": 0.811, "mean_token_accuracy": 0.7416626513004303, "step": 2759 }, { "epoch": 1.8791283622744297, "grad_norm": 1.7720741033554077, "learning_rate": 7.364864586561267e-06, "loss": 0.7782, "mean_token_accuracy": 0.7340041995048523, "step": 2760 }, { "epoch": 1.8798093292475315, "grad_norm": 1.53093421459198, "learning_rate": 7.3628795589170224e-06, "loss": 0.731, "mean_token_accuracy": 0.7596551775932312, "step": 2761 }, { "epoch": 1.8804902962206334, "grad_norm": 1.6538312435150146, "learning_rate": 7.360894051647578e-06, "loss": 0.7637, "mean_token_accuracy": 0.7484182417392731, "step": 2762 }, { "epoch": 1.881171263193735, "grad_norm": 1.9981262683868408, "learning_rate": 7.358908065155959e-06, "loss": 0.6908, "mean_token_accuracy": 0.7470174133777618, "step": 2763 }, { "epoch": 1.881852230166837, "grad_norm": 1.7837830781936646, "learning_rate": 7.356921599845286e-06, "loss": 0.7583, "mean_token_accuracy": 0.7457420825958252, "step": 2764 }, { "epoch": 1.8825331971399386, "grad_norm": 1.8589929342269897, "learning_rate": 7.35493465611878e-06, "loss": 0.7039, "mean_token_accuracy": 0.77277010679245, "step": 2765 }, { "epoch": 1.8832141641130407, "grad_norm": 1.8941067457199097, "learning_rate": 7.352947234379759e-06, "loss": 0.7603, "mean_token_accuracy": 0.7544480264186859, "step": 2766 }, { "epoch": 1.8838951310861423, "grad_norm": 1.6853479146957397, "learning_rate": 7.350959335031636e-06, "loss": 0.7334, "mean_token_accuracy": 0.7421896755695343, "step": 2767 }, { "epoch": 1.884576098059244, "grad_norm": 1.681980013847351, "learning_rate": 7.348970958477918e-06, "loss": 0.7474, "mean_token_accuracy": 0.7508864402770996, "step": 2768 }, { "epoch": 1.885257065032346, "grad_norm": 1.7229623794555664, "learning_rate": 7.346982105122215e-06, "loss": 0.8938, "mean_token_accuracy": 0.7076442241668701, "step": 2769 }, { "epoch": 1.8859380320054477, "grad_norm": 1.6940075159072876, "learning_rate": 7.344992775368231e-06, "loss": 0.7678, "mean_token_accuracy": 0.7509547173976898, "step": 2770 }, { "epoch": 1.8866189989785496, "grad_norm": 1.6187940835952759, "learning_rate": 7.343002969619767e-06, "loss": 0.8128, "mean_token_accuracy": 0.7484486401081085, "step": 2771 }, { "epoch": 1.8872999659516514, "grad_norm": 1.871293544769287, "learning_rate": 7.341012688280719e-06, "loss": 0.7597, "mean_token_accuracy": 0.7463994324207306, "step": 2772 }, { "epoch": 1.8879809329247532, "grad_norm": 1.7232846021652222, "learning_rate": 7.339021931755084e-06, "loss": 0.7677, "mean_token_accuracy": 0.7459184229373932, "step": 2773 }, { "epoch": 1.8886618998978548, "grad_norm": 1.6381198167800903, "learning_rate": 7.33703070044695e-06, "loss": 0.8637, "mean_token_accuracy": 0.7318844795227051, "step": 2774 }, { "epoch": 1.8893428668709569, "grad_norm": 1.6201672554016113, "learning_rate": 7.3350389947605025e-06, "loss": 0.8228, "mean_token_accuracy": 0.7266575992107391, "step": 2775 }, { "epoch": 1.8900238338440585, "grad_norm": 1.728873610496521, "learning_rate": 7.333046815100027e-06, "loss": 0.7499, "mean_token_accuracy": 0.7577397525310516, "step": 2776 }, { "epoch": 1.8907048008171605, "grad_norm": 1.5022398233413696, "learning_rate": 7.331054161869903e-06, "loss": 0.7957, "mean_token_accuracy": 0.7359139919281006, "step": 2777 }, { "epoch": 1.8913857677902621, "grad_norm": 1.6400156021118164, "learning_rate": 7.329061035474605e-06, "loss": 0.7812, "mean_token_accuracy": 0.7393780052661896, "step": 2778 }, { "epoch": 1.892066734763364, "grad_norm": 1.7221527099609375, "learning_rate": 7.327067436318704e-06, "loss": 0.7327, "mean_token_accuracy": 0.7630227506160736, "step": 2779 }, { "epoch": 1.8927477017364658, "grad_norm": 1.7724446058273315, "learning_rate": 7.325073364806867e-06, "loss": 0.6888, "mean_token_accuracy": 0.7761099934577942, "step": 2780 }, { "epoch": 1.8934286687095676, "grad_norm": 1.6968094110488892, "learning_rate": 7.323078821343859e-06, "loss": 0.757, "mean_token_accuracy": 0.7493806779384613, "step": 2781 }, { "epoch": 1.8941096356826694, "grad_norm": 1.7166043519973755, "learning_rate": 7.321083806334539e-06, "loss": 0.6926, "mean_token_accuracy": 0.7632210552692413, "step": 2782 }, { "epoch": 1.894790602655771, "grad_norm": 1.6902015209197998, "learning_rate": 7.319088320183861e-06, "loss": 0.7427, "mean_token_accuracy": 0.7405990064144135, "step": 2783 }, { "epoch": 1.895471569628873, "grad_norm": 1.7707126140594482, "learning_rate": 7.317092363296875e-06, "loss": 0.7542, "mean_token_accuracy": 0.7450629770755768, "step": 2784 }, { "epoch": 1.8961525366019747, "grad_norm": 1.798412799835205, "learning_rate": 7.3150959360787285e-06, "loss": 0.6542, "mean_token_accuracy": 0.7965307831764221, "step": 2785 }, { "epoch": 1.8968335035750767, "grad_norm": 1.7628246545791626, "learning_rate": 7.3130990389346615e-06, "loss": 0.8091, "mean_token_accuracy": 0.7317991852760315, "step": 2786 }, { "epoch": 1.8975144705481783, "grad_norm": 1.6973421573638916, "learning_rate": 7.3111016722700116e-06, "loss": 0.7954, "mean_token_accuracy": 0.7129428088665009, "step": 2787 }, { "epoch": 1.8981954375212802, "grad_norm": 1.8200230598449707, "learning_rate": 7.30910383649021e-06, "loss": 0.6071, "mean_token_accuracy": 0.8065387308597565, "step": 2788 }, { "epoch": 1.898876404494382, "grad_norm": 1.970239520072937, "learning_rate": 7.307105532000787e-06, "loss": 0.5568, "mean_token_accuracy": 0.8118141889572144, "step": 2789 }, { "epoch": 1.8995573714674838, "grad_norm": 1.6461114883422852, "learning_rate": 7.305106759207361e-06, "loss": 0.7048, "mean_token_accuracy": 0.7563951909542084, "step": 2790 }, { "epoch": 1.9002383384405857, "grad_norm": 1.948729395866394, "learning_rate": 7.303107518515652e-06, "loss": 0.7485, "mean_token_accuracy": 0.7591689825057983, "step": 2791 }, { "epoch": 1.9009193054136875, "grad_norm": 1.9192380905151367, "learning_rate": 7.301107810331473e-06, "loss": 0.8987, "mean_token_accuracy": 0.7209303379058838, "step": 2792 }, { "epoch": 1.9016002723867893, "grad_norm": 1.518784523010254, "learning_rate": 7.299107635060731e-06, "loss": 0.8068, "mean_token_accuracy": 0.7307584285736084, "step": 2793 }, { "epoch": 1.902281239359891, "grad_norm": 1.9142760038375854, "learning_rate": 7.297106993109426e-06, "loss": 0.7826, "mean_token_accuracy": 0.755306214094162, "step": 2794 }, { "epoch": 1.902962206332993, "grad_norm": 1.6888295412063599, "learning_rate": 7.295105884883659e-06, "loss": 0.6589, "mean_token_accuracy": 0.7927906215190887, "step": 2795 }, { "epoch": 1.9036431733060946, "grad_norm": 1.6537450551986694, "learning_rate": 7.293104310789618e-06, "loss": 0.7396, "mean_token_accuracy": 0.7374014258384705, "step": 2796 }, { "epoch": 1.9043241402791966, "grad_norm": 1.6607189178466797, "learning_rate": 7.291102271233592e-06, "loss": 0.72, "mean_token_accuracy": 0.7664028406143188, "step": 2797 }, { "epoch": 1.9050051072522982, "grad_norm": 1.9276103973388672, "learning_rate": 7.289099766621961e-06, "loss": 0.6374, "mean_token_accuracy": 0.7830633521080017, "step": 2798 }, { "epoch": 1.9056860742254, "grad_norm": 1.7547537088394165, "learning_rate": 7.287096797361197e-06, "loss": 0.7651, "mean_token_accuracy": 0.7570888698101044, "step": 2799 }, { "epoch": 1.9063670411985019, "grad_norm": 1.511141061782837, "learning_rate": 7.285093363857875e-06, "loss": 0.8781, "mean_token_accuracy": 0.7206160426139832, "step": 2800 }, { "epoch": 1.9070480081716037, "grad_norm": 1.7544573545455933, "learning_rate": 7.283089466518654e-06, "loss": 0.7878, "mean_token_accuracy": 0.7374825775623322, "step": 2801 }, { "epoch": 1.9077289751447055, "grad_norm": 1.759790301322937, "learning_rate": 7.281085105750292e-06, "loss": 0.6848, "mean_token_accuracy": 0.7687813639640808, "step": 2802 }, { "epoch": 1.9084099421178071, "grad_norm": 1.8674345016479492, "learning_rate": 7.279080281959645e-06, "loss": 0.7365, "mean_token_accuracy": 0.7560760080814362, "step": 2803 }, { "epoch": 1.9090909090909092, "grad_norm": 1.937725305557251, "learning_rate": 7.277074995553656e-06, "loss": 0.569, "mean_token_accuracy": 0.8118851184844971, "step": 2804 }, { "epoch": 1.9097718760640108, "grad_norm": 1.6992608308792114, "learning_rate": 7.2750692469393626e-06, "loss": 0.7832, "mean_token_accuracy": 0.7322153747081757, "step": 2805 }, { "epoch": 1.9104528430371128, "grad_norm": 1.55292809009552, "learning_rate": 7.273063036523902e-06, "loss": 0.8666, "mean_token_accuracy": 0.7142980396747589, "step": 2806 }, { "epoch": 1.9111338100102144, "grad_norm": 1.8124405145645142, "learning_rate": 7.2710563647145e-06, "loss": 0.6391, "mean_token_accuracy": 0.7832696735858917, "step": 2807 }, { "epoch": 1.9118147769833163, "grad_norm": 1.7638154029846191, "learning_rate": 7.269049231918478e-06, "loss": 0.675, "mean_token_accuracy": 0.7835802733898163, "step": 2808 }, { "epoch": 1.912495743956418, "grad_norm": 1.7570267915725708, "learning_rate": 7.2670416385432485e-06, "loss": 0.7206, "mean_token_accuracy": 0.7600840926170349, "step": 2809 }, { "epoch": 1.91317671092952, "grad_norm": 1.819337010383606, "learning_rate": 7.265033584996323e-06, "loss": 0.7079, "mean_token_accuracy": 0.7552609443664551, "step": 2810 }, { "epoch": 1.9138576779026217, "grad_norm": 1.6209585666656494, "learning_rate": 7.263025071685301e-06, "loss": 0.8167, "mean_token_accuracy": 0.737509161233902, "step": 2811 }, { "epoch": 1.9145386448757236, "grad_norm": 1.6585214138031006, "learning_rate": 7.261016099017877e-06, "loss": 0.6839, "mean_token_accuracy": 0.7709226012229919, "step": 2812 }, { "epoch": 1.9152196118488254, "grad_norm": 1.635117769241333, "learning_rate": 7.259006667401839e-06, "loss": 0.766, "mean_token_accuracy": 0.7677783668041229, "step": 2813 }, { "epoch": 1.915900578821927, "grad_norm": 1.7786891460418701, "learning_rate": 7.256996777245069e-06, "loss": 0.6898, "mean_token_accuracy": 0.7782201170921326, "step": 2814 }, { "epoch": 1.916581545795029, "grad_norm": 1.8355966806411743, "learning_rate": 7.2549864289555416e-06, "loss": 0.7227, "mean_token_accuracy": 0.7698667049407959, "step": 2815 }, { "epoch": 1.9172625127681306, "grad_norm": 1.7103700637817383, "learning_rate": 7.252975622941321e-06, "loss": 0.7356, "mean_token_accuracy": 0.7605248987674713, "step": 2816 }, { "epoch": 1.9179434797412327, "grad_norm": 1.8367509841918945, "learning_rate": 7.250964359610571e-06, "loss": 0.72, "mean_token_accuracy": 0.7546649277210236, "step": 2817 }, { "epoch": 1.9186244467143343, "grad_norm": 1.7242019176483154, "learning_rate": 7.248952639371543e-06, "loss": 0.8092, "mean_token_accuracy": 0.7331208288669586, "step": 2818 }, { "epoch": 1.9193054136874361, "grad_norm": 1.6852173805236816, "learning_rate": 7.246940462632583e-06, "loss": 0.7951, "mean_token_accuracy": 0.7464454770088196, "step": 2819 }, { "epoch": 1.919986380660538, "grad_norm": 1.6959221363067627, "learning_rate": 7.244927829802128e-06, "loss": 0.736, "mean_token_accuracy": 0.7366943955421448, "step": 2820 }, { "epoch": 1.9206673476336398, "grad_norm": 1.7434744834899902, "learning_rate": 7.242914741288711e-06, "loss": 0.7197, "mean_token_accuracy": 0.7652397751808167, "step": 2821 }, { "epoch": 1.9213483146067416, "grad_norm": 1.6880789995193481, "learning_rate": 7.240901197500956e-06, "loss": 0.7129, "mean_token_accuracy": 0.7742304801940918, "step": 2822 }, { "epoch": 1.9220292815798434, "grad_norm": 1.9374816417694092, "learning_rate": 7.238887198847575e-06, "loss": 0.6228, "mean_token_accuracy": 0.7942592203617096, "step": 2823 }, { "epoch": 1.9227102485529453, "grad_norm": 1.8086215257644653, "learning_rate": 7.23687274573738e-06, "loss": 0.6782, "mean_token_accuracy": 0.772092342376709, "step": 2824 }, { "epoch": 1.9233912155260469, "grad_norm": 1.9186817407608032, "learning_rate": 7.234857838579269e-06, "loss": 0.7368, "mean_token_accuracy": 0.7592378854751587, "step": 2825 }, { "epoch": 1.924072182499149, "grad_norm": 1.785252332687378, "learning_rate": 7.232842477782237e-06, "loss": 0.5789, "mean_token_accuracy": 0.8125032782554626, "step": 2826 }, { "epoch": 1.9247531494722505, "grad_norm": 1.766361117362976, "learning_rate": 7.2308266637553655e-06, "loss": 0.5822, "mean_token_accuracy": 0.7905328869819641, "step": 2827 }, { "epoch": 1.9254341164453526, "grad_norm": 1.7315737009048462, "learning_rate": 7.228810396907835e-06, "loss": 0.7995, "mean_token_accuracy": 0.7408699691295624, "step": 2828 }, { "epoch": 1.9261150834184542, "grad_norm": 1.8490349054336548, "learning_rate": 7.226793677648911e-06, "loss": 0.7094, "mean_token_accuracy": 0.7770526111125946, "step": 2829 }, { "epoch": 1.926796050391556, "grad_norm": 1.5557222366333008, "learning_rate": 7.224776506387956e-06, "loss": 0.7933, "mean_token_accuracy": 0.7298164069652557, "step": 2830 }, { "epoch": 1.9274770173646578, "grad_norm": 1.7630202770233154, "learning_rate": 7.22275888353442e-06, "loss": 0.7143, "mean_token_accuracy": 0.7744943499565125, "step": 2831 }, { "epoch": 1.9281579843377596, "grad_norm": 1.918547511100769, "learning_rate": 7.220740809497847e-06, "loss": 0.6276, "mean_token_accuracy": 0.7900619506835938, "step": 2832 }, { "epoch": 1.9288389513108615, "grad_norm": 1.708437442779541, "learning_rate": 7.218722284687875e-06, "loss": 0.6958, "mean_token_accuracy": 0.7678330838680267, "step": 2833 }, { "epoch": 1.929519918283963, "grad_norm": 1.7711790800094604, "learning_rate": 7.21670330951423e-06, "loss": 0.6494, "mean_token_accuracy": 0.790889173746109, "step": 2834 }, { "epoch": 1.9302008852570651, "grad_norm": 1.819953441619873, "learning_rate": 7.214683884386728e-06, "loss": 0.7286, "mean_token_accuracy": 0.7602211833000183, "step": 2835 }, { "epoch": 1.9308818522301667, "grad_norm": 1.7557930946350098, "learning_rate": 7.212664009715281e-06, "loss": 0.7979, "mean_token_accuracy": 0.7519899904727936, "step": 2836 }, { "epoch": 1.9315628192032688, "grad_norm": 1.8621302843093872, "learning_rate": 7.2106436859098904e-06, "loss": 0.6548, "mean_token_accuracy": 0.7837201356887817, "step": 2837 }, { "epoch": 1.9322437861763704, "grad_norm": 1.6826975345611572, "learning_rate": 7.208622913380645e-06, "loss": 0.7762, "mean_token_accuracy": 0.7516266107559204, "step": 2838 }, { "epoch": 1.9329247531494722, "grad_norm": 1.7264341115951538, "learning_rate": 7.2066016925377325e-06, "loss": 0.8373, "mean_token_accuracy": 0.7191969752311707, "step": 2839 }, { "epoch": 1.933605720122574, "grad_norm": 1.4937478303909302, "learning_rate": 7.204580023791423e-06, "loss": 0.8382, "mean_token_accuracy": 0.7319225072860718, "step": 2840 }, { "epoch": 1.9342866870956759, "grad_norm": 1.8764711618423462, "learning_rate": 7.2025579075520835e-06, "loss": 0.6952, "mean_token_accuracy": 0.7614614069461823, "step": 2841 }, { "epoch": 1.9349676540687777, "grad_norm": 1.805161952972412, "learning_rate": 7.20053534423017e-06, "loss": 0.7084, "mean_token_accuracy": 0.771033525466919, "step": 2842 }, { "epoch": 1.9356486210418795, "grad_norm": 1.640527367591858, "learning_rate": 7.198512334236228e-06, "loss": 0.8279, "mean_token_accuracy": 0.7159605920314789, "step": 2843 }, { "epoch": 1.9363295880149813, "grad_norm": 1.5640487670898438, "learning_rate": 7.196488877980897e-06, "loss": 0.7713, "mean_token_accuracy": 0.7387466132640839, "step": 2844 }, { "epoch": 1.937010554988083, "grad_norm": 1.7002869844436646, "learning_rate": 7.194464975874904e-06, "loss": 0.7733, "mean_token_accuracy": 0.7419323623180389, "step": 2845 }, { "epoch": 1.937691521961185, "grad_norm": 1.6880589723587036, "learning_rate": 7.192440628329066e-06, "loss": 0.7532, "mean_token_accuracy": 0.7396595180034637, "step": 2846 }, { "epoch": 1.9383724889342866, "grad_norm": 1.6738214492797852, "learning_rate": 7.190415835754293e-06, "loss": 0.7272, "mean_token_accuracy": 0.7435548007488251, "step": 2847 }, { "epoch": 1.9390534559073886, "grad_norm": 1.918070673942566, "learning_rate": 7.188390598561584e-06, "loss": 0.6855, "mean_token_accuracy": 0.7768622934818268, "step": 2848 }, { "epoch": 1.9397344228804902, "grad_norm": 1.7370476722717285, "learning_rate": 7.18636491716203e-06, "loss": 0.8113, "mean_token_accuracy": 0.7334631085395813, "step": 2849 }, { "epoch": 1.940415389853592, "grad_norm": 1.6956255435943604, "learning_rate": 7.184338791966807e-06, "loss": 0.6604, "mean_token_accuracy": 0.7709639668464661, "step": 2850 }, { "epoch": 1.941096356826694, "grad_norm": 1.6479026079177856, "learning_rate": 7.182312223387187e-06, "loss": 0.7254, "mean_token_accuracy": 0.7592873871326447, "step": 2851 }, { "epoch": 1.9417773237997957, "grad_norm": 1.7026594877243042, "learning_rate": 7.180285211834531e-06, "loss": 0.7462, "mean_token_accuracy": 0.7420480251312256, "step": 2852 }, { "epoch": 1.9424582907728976, "grad_norm": 1.835135817527771, "learning_rate": 7.178257757720286e-06, "loss": 0.7434, "mean_token_accuracy": 0.7503098845481873, "step": 2853 }, { "epoch": 1.9431392577459992, "grad_norm": 1.742002248764038, "learning_rate": 7.176229861455992e-06, "loss": 0.869, "mean_token_accuracy": 0.7162235975265503, "step": 2854 }, { "epoch": 1.9438202247191012, "grad_norm": 1.77491295337677, "learning_rate": 7.174201523453279e-06, "loss": 0.6008, "mean_token_accuracy": 0.789484977722168, "step": 2855 }, { "epoch": 1.9445011916922028, "grad_norm": 1.7835779190063477, "learning_rate": 7.172172744123867e-06, "loss": 0.6606, "mean_token_accuracy": 0.7936999499797821, "step": 2856 }, { "epoch": 1.9451821586653049, "grad_norm": 1.7767788171768188, "learning_rate": 7.1701435238795605e-06, "loss": 0.6903, "mean_token_accuracy": 0.7647218108177185, "step": 2857 }, { "epoch": 1.9458631256384065, "grad_norm": 1.8000895977020264, "learning_rate": 7.16811386313226e-06, "loss": 0.7736, "mean_token_accuracy": 0.7356529533863068, "step": 2858 }, { "epoch": 1.9465440926115083, "grad_norm": 1.6483913660049438, "learning_rate": 7.1660837622939535e-06, "loss": 0.7932, "mean_token_accuracy": 0.7466507852077484, "step": 2859 }, { "epoch": 1.9472250595846101, "grad_norm": 1.7530640363693237, "learning_rate": 7.164053221776717e-06, "loss": 0.6586, "mean_token_accuracy": 0.775009959936142, "step": 2860 }, { "epoch": 1.947906026557712, "grad_norm": 1.8024193048477173, "learning_rate": 7.162022241992716e-06, "loss": 0.8472, "mean_token_accuracy": 0.7410489022731781, "step": 2861 }, { "epoch": 1.9485869935308138, "grad_norm": 1.777213215827942, "learning_rate": 7.159990823354204e-06, "loss": 0.7225, "mean_token_accuracy": 0.7694746553897858, "step": 2862 }, { "epoch": 1.9492679605039156, "grad_norm": 1.7864105701446533, "learning_rate": 7.157958966273529e-06, "loss": 0.702, "mean_token_accuracy": 0.7700666785240173, "step": 2863 }, { "epoch": 1.9499489274770174, "grad_norm": 1.8980131149291992, "learning_rate": 7.155926671163123e-06, "loss": 0.6868, "mean_token_accuracy": 0.7696079313755035, "step": 2864 }, { "epoch": 1.950629894450119, "grad_norm": 1.731663465499878, "learning_rate": 7.153893938435505e-06, "loss": 0.6708, "mean_token_accuracy": 0.774931788444519, "step": 2865 }, { "epoch": 1.951310861423221, "grad_norm": 1.8306338787078857, "learning_rate": 7.151860768503287e-06, "loss": 0.7343, "mean_token_accuracy": 0.768362820148468, "step": 2866 }, { "epoch": 1.9519918283963227, "grad_norm": 1.6578402519226074, "learning_rate": 7.149827161779172e-06, "loss": 0.835, "mean_token_accuracy": 0.7271889746189117, "step": 2867 }, { "epoch": 1.9526727953694247, "grad_norm": 1.691877007484436, "learning_rate": 7.147793118675944e-06, "loss": 0.7378, "mean_token_accuracy": 0.7622014284133911, "step": 2868 }, { "epoch": 1.9533537623425263, "grad_norm": 1.678011417388916, "learning_rate": 7.145758639606483e-06, "loss": 0.712, "mean_token_accuracy": 0.7717524170875549, "step": 2869 }, { "epoch": 1.9540347293156282, "grad_norm": 1.9429724216461182, "learning_rate": 7.143723724983753e-06, "loss": 0.5972, "mean_token_accuracy": 0.8084903657436371, "step": 2870 }, { "epoch": 1.95471569628873, "grad_norm": 1.723443627357483, "learning_rate": 7.141688375220809e-06, "loss": 0.7509, "mean_token_accuracy": 0.749064028263092, "step": 2871 }, { "epoch": 1.9553966632618318, "grad_norm": 1.754875659942627, "learning_rate": 7.13965259073079e-06, "loss": 0.7871, "mean_token_accuracy": 0.7365760505199432, "step": 2872 }, { "epoch": 1.9560776302349336, "grad_norm": 1.6376547813415527, "learning_rate": 7.13761637192693e-06, "loss": 0.7313, "mean_token_accuracy": 0.74747434258461, "step": 2873 }, { "epoch": 1.9567585972080352, "grad_norm": 1.7930004596710205, "learning_rate": 7.135579719222545e-06, "loss": 0.7558, "mean_token_accuracy": 0.7569591701030731, "step": 2874 }, { "epoch": 1.9574395641811373, "grad_norm": 1.7422864437103271, "learning_rate": 7.133542633031044e-06, "loss": 0.614, "mean_token_accuracy": 0.7966923713684082, "step": 2875 }, { "epoch": 1.958120531154239, "grad_norm": 1.649262547492981, "learning_rate": 7.131505113765919e-06, "loss": 0.7504, "mean_token_accuracy": 0.7762450277805328, "step": 2876 }, { "epoch": 1.958801498127341, "grad_norm": 1.860689401626587, "learning_rate": 7.129467161840753e-06, "loss": 0.7628, "mean_token_accuracy": 0.7374359667301178, "step": 2877 }, { "epoch": 1.9594824651004425, "grad_norm": 1.7967554330825806, "learning_rate": 7.127428777669217e-06, "loss": 0.6658, "mean_token_accuracy": 0.7731353044509888, "step": 2878 }, { "epoch": 1.9601634320735446, "grad_norm": 1.6457343101501465, "learning_rate": 7.12538996166507e-06, "loss": 0.7971, "mean_token_accuracy": 0.7408202290534973, "step": 2879 }, { "epoch": 1.9608443990466462, "grad_norm": 1.694189429283142, "learning_rate": 7.1233507142421566e-06, "loss": 0.6987, "mean_token_accuracy": 0.7717253267765045, "step": 2880 }, { "epoch": 1.961525366019748, "grad_norm": 1.9490187168121338, "learning_rate": 7.121311035814409e-06, "loss": 0.713, "mean_token_accuracy": 0.7699535489082336, "step": 2881 }, { "epoch": 1.9622063329928499, "grad_norm": 1.8750550746917725, "learning_rate": 7.11927092679585e-06, "loss": 0.6854, "mean_token_accuracy": 0.7816856205463409, "step": 2882 }, { "epoch": 1.9628872999659517, "grad_norm": 1.5706425905227661, "learning_rate": 7.117230387600584e-06, "loss": 0.7811, "mean_token_accuracy": 0.7576667666435242, "step": 2883 }, { "epoch": 1.9635682669390535, "grad_norm": 1.8285611867904663, "learning_rate": 7.115189418642812e-06, "loss": 0.7055, "mean_token_accuracy": 0.7758765518665314, "step": 2884 }, { "epoch": 1.9642492339121551, "grad_norm": 1.563498854637146, "learning_rate": 7.113148020336813e-06, "loss": 0.7854, "mean_token_accuracy": 0.7307912409305573, "step": 2885 }, { "epoch": 1.9649302008852572, "grad_norm": 1.758863091468811, "learning_rate": 7.111106193096959e-06, "loss": 0.7124, "mean_token_accuracy": 0.763394683599472, "step": 2886 }, { "epoch": 1.9656111678583588, "grad_norm": 1.5508618354797363, "learning_rate": 7.109063937337705e-06, "loss": 0.7306, "mean_token_accuracy": 0.7653729319572449, "step": 2887 }, { "epoch": 1.9662921348314608, "grad_norm": 1.6823328733444214, "learning_rate": 7.107021253473594e-06, "loss": 0.7803, "mean_token_accuracy": 0.7353699505329132, "step": 2888 }, { "epoch": 1.9669731018045624, "grad_norm": 1.7849353551864624, "learning_rate": 7.104978141919261e-06, "loss": 0.7436, "mean_token_accuracy": 0.7302327454090118, "step": 2889 }, { "epoch": 1.9676540687776642, "grad_norm": 1.8850595951080322, "learning_rate": 7.10293460308942e-06, "loss": 0.7188, "mean_token_accuracy": 0.7712018489837646, "step": 2890 }, { "epoch": 1.968335035750766, "grad_norm": 1.9582655429840088, "learning_rate": 7.100890637398876e-06, "loss": 0.7775, "mean_token_accuracy": 0.7552027702331543, "step": 2891 }, { "epoch": 1.969016002723868, "grad_norm": 1.7970545291900635, "learning_rate": 7.0988462452625206e-06, "loss": 0.6777, "mean_token_accuracy": 0.7771441638469696, "step": 2892 }, { "epoch": 1.9696969696969697, "grad_norm": 1.8038957118988037, "learning_rate": 7.096801427095331e-06, "loss": 0.66, "mean_token_accuracy": 0.7748119831085205, "step": 2893 }, { "epoch": 1.9703779366700716, "grad_norm": 1.7118242979049683, "learning_rate": 7.094756183312372e-06, "loss": 0.6853, "mean_token_accuracy": 0.7689699828624725, "step": 2894 }, { "epoch": 1.9710589036431734, "grad_norm": 1.8875545263290405, "learning_rate": 7.092710514328793e-06, "loss": 0.6474, "mean_token_accuracy": 0.7804685533046722, "step": 2895 }, { "epoch": 1.971739870616275, "grad_norm": 1.8151986598968506, "learning_rate": 7.090664420559832e-06, "loss": 0.7493, "mean_token_accuracy": 0.7525484561920166, "step": 2896 }, { "epoch": 1.972420837589377, "grad_norm": 1.8582351207733154, "learning_rate": 7.08861790242081e-06, "loss": 0.7516, "mean_token_accuracy": 0.7630902230739594, "step": 2897 }, { "epoch": 1.9731018045624786, "grad_norm": 1.7192351818084717, "learning_rate": 7.086570960327137e-06, "loss": 0.8121, "mean_token_accuracy": 0.7356559634208679, "step": 2898 }, { "epoch": 1.9737827715355807, "grad_norm": 1.8100576400756836, "learning_rate": 7.084523594694309e-06, "loss": 0.661, "mean_token_accuracy": 0.7832964956760406, "step": 2899 }, { "epoch": 1.9744637385086823, "grad_norm": 1.9657095670700073, "learning_rate": 7.0824758059379075e-06, "loss": 0.5941, "mean_token_accuracy": 0.7959547638893127, "step": 2900 }, { "epoch": 1.9751447054817841, "grad_norm": 1.6955845355987549, "learning_rate": 7.080427594473598e-06, "loss": 0.697, "mean_token_accuracy": 0.7649266123771667, "step": 2901 }, { "epoch": 1.975825672454886, "grad_norm": 1.7059482336044312, "learning_rate": 7.078378960717132e-06, "loss": 0.73, "mean_token_accuracy": 0.7713019251823425, "step": 2902 }, { "epoch": 1.9765066394279878, "grad_norm": 1.858829140663147, "learning_rate": 7.076329905084352e-06, "loss": 0.6612, "mean_token_accuracy": 0.7839258313179016, "step": 2903 }, { "epoch": 1.9771876064010896, "grad_norm": 1.6988039016723633, "learning_rate": 7.074280427991179e-06, "loss": 0.7143, "mean_token_accuracy": 0.7415388822555542, "step": 2904 }, { "epoch": 1.9778685733741912, "grad_norm": 1.8073707818984985, "learning_rate": 7.0722305298536245e-06, "loss": 0.7309, "mean_token_accuracy": 0.7578823864459991, "step": 2905 }, { "epoch": 1.9785495403472932, "grad_norm": 1.707658052444458, "learning_rate": 7.070180211087781e-06, "loss": 0.7098, "mean_token_accuracy": 0.7676490247249603, "step": 2906 }, { "epoch": 1.9792305073203948, "grad_norm": 1.6710084676742554, "learning_rate": 7.0681294721098325e-06, "loss": 0.7605, "mean_token_accuracy": 0.754244327545166, "step": 2907 }, { "epoch": 1.979911474293497, "grad_norm": 1.8941258192062378, "learning_rate": 7.066078313336045e-06, "loss": 0.7186, "mean_token_accuracy": 0.7729725241661072, "step": 2908 }, { "epoch": 1.9805924412665985, "grad_norm": 1.8865514993667603, "learning_rate": 7.064026735182765e-06, "loss": 0.6143, "mean_token_accuracy": 0.791162520647049, "step": 2909 }, { "epoch": 1.9812734082397003, "grad_norm": 1.7880830764770508, "learning_rate": 7.061974738066433e-06, "loss": 0.7528, "mean_token_accuracy": 0.746629387140274, "step": 2910 }, { "epoch": 1.9819543752128022, "grad_norm": 1.7430874109268188, "learning_rate": 7.059922322403568e-06, "loss": 0.6884, "mean_token_accuracy": 0.7578637003898621, "step": 2911 }, { "epoch": 1.982635342185904, "grad_norm": 1.6889508962631226, "learning_rate": 7.0578694886107794e-06, "loss": 0.7435, "mean_token_accuracy": 0.7372950613498688, "step": 2912 }, { "epoch": 1.9833163091590058, "grad_norm": 1.8469343185424805, "learning_rate": 7.055816237104753e-06, "loss": 0.7479, "mean_token_accuracy": 0.7491508424282074, "step": 2913 }, { "epoch": 1.9839972761321076, "grad_norm": 1.8136179447174072, "learning_rate": 7.0537625683022695e-06, "loss": 0.7881, "mean_token_accuracy": 0.7506251931190491, "step": 2914 }, { "epoch": 1.9846782431052095, "grad_norm": 1.8074803352355957, "learning_rate": 7.051708482620187e-06, "loss": 0.7349, "mean_token_accuracy": 0.7635655105113983, "step": 2915 }, { "epoch": 1.985359210078311, "grad_norm": 1.8441497087478638, "learning_rate": 7.049653980475451e-06, "loss": 0.785, "mean_token_accuracy": 0.7322985231876373, "step": 2916 }, { "epoch": 1.9860401770514131, "grad_norm": 1.8160741329193115, "learning_rate": 7.047599062285092e-06, "loss": 0.671, "mean_token_accuracy": 0.7735156118869781, "step": 2917 }, { "epoch": 1.9867211440245147, "grad_norm": 1.74799382686615, "learning_rate": 7.045543728466223e-06, "loss": 0.7221, "mean_token_accuracy": 0.7466610372066498, "step": 2918 }, { "epoch": 1.9874021109976168, "grad_norm": 1.6796425580978394, "learning_rate": 7.043487979436044e-06, "loss": 0.7781, "mean_token_accuracy": 0.7318922281265259, "step": 2919 }, { "epoch": 1.9880830779707184, "grad_norm": 1.7700245380401611, "learning_rate": 7.041431815611836e-06, "loss": 0.7089, "mean_token_accuracy": 0.7510814666748047, "step": 2920 }, { "epoch": 1.9887640449438202, "grad_norm": 1.7510669231414795, "learning_rate": 7.039375237410968e-06, "loss": 0.7654, "mean_token_accuracy": 0.72867152094841, "step": 2921 }, { "epoch": 1.989445011916922, "grad_norm": 1.711633324623108, "learning_rate": 7.037318245250891e-06, "loss": 0.7733, "mean_token_accuracy": 0.7531162798404694, "step": 2922 }, { "epoch": 1.9901259788900239, "grad_norm": 1.7148668766021729, "learning_rate": 7.0352608395491375e-06, "loss": 0.6899, "mean_token_accuracy": 0.7798043489456177, "step": 2923 }, { "epoch": 1.9908069458631257, "grad_norm": 1.8222483396530151, "learning_rate": 7.033203020723328e-06, "loss": 0.7449, "mean_token_accuracy": 0.7621224224567413, "step": 2924 }, { "epoch": 1.9914879128362273, "grad_norm": 1.9955716133117676, "learning_rate": 7.031144789191167e-06, "loss": 0.6578, "mean_token_accuracy": 0.7686023712158203, "step": 2925 }, { "epoch": 1.9921688798093293, "grad_norm": 1.7438702583312988, "learning_rate": 7.02908614537044e-06, "loss": 0.6881, "mean_token_accuracy": 0.7730461061000824, "step": 2926 }, { "epoch": 1.992849846782431, "grad_norm": 1.7515597343444824, "learning_rate": 7.027027089679017e-06, "loss": 0.7737, "mean_token_accuracy": 0.7366509437561035, "step": 2927 }, { "epoch": 1.993530813755533, "grad_norm": 1.7274036407470703, "learning_rate": 7.024967622534853e-06, "loss": 0.7362, "mean_token_accuracy": 0.7542429864406586, "step": 2928 }, { "epoch": 1.9942117807286346, "grad_norm": 1.689366102218628, "learning_rate": 7.0229077443559845e-06, "loss": 0.6806, "mean_token_accuracy": 0.7855758666992188, "step": 2929 }, { "epoch": 1.9948927477017364, "grad_norm": 1.6642643213272095, "learning_rate": 7.020847455560533e-06, "loss": 0.7275, "mean_token_accuracy": 0.7497054040431976, "step": 2930 }, { "epoch": 1.9955737146748382, "grad_norm": 1.7287439107894897, "learning_rate": 7.018786756566704e-06, "loss": 0.7296, "mean_token_accuracy": 0.7636447846889496, "step": 2931 }, { "epoch": 1.99625468164794, "grad_norm": 1.7088626623153687, "learning_rate": 7.016725647792783e-06, "loss": 0.6526, "mean_token_accuracy": 0.7839633524417877, "step": 2932 }, { "epoch": 1.996935648621042, "grad_norm": 1.6774065494537354, "learning_rate": 7.014664129657141e-06, "loss": 0.8118, "mean_token_accuracy": 0.7139724791049957, "step": 2933 }, { "epoch": 1.9976166155941437, "grad_norm": 1.9092817306518555, "learning_rate": 7.0126022025782335e-06, "loss": 0.7671, "mean_token_accuracy": 0.7492094933986664, "step": 2934 }, { "epoch": 1.9982975825672455, "grad_norm": 1.694600224494934, "learning_rate": 7.010539866974595e-06, "loss": 0.7051, "mean_token_accuracy": 0.756309449672699, "step": 2935 }, { "epoch": 1.9989785495403471, "grad_norm": 1.6059385538101196, "learning_rate": 7.008477123264849e-06, "loss": 0.7862, "mean_token_accuracy": 0.7512997686862946, "step": 2936 }, { "epoch": 1.9996595165134492, "grad_norm": 1.8264501094818115, "learning_rate": 7.006413971867694e-06, "loss": 0.6826, "mean_token_accuracy": 0.7666681408882141, "step": 2937 }, { "epoch": 2.0, "grad_norm": 2.608997344970703, "learning_rate": 7.004350413201917e-06, "loss": 0.6382, "mean_token_accuracy": 0.8121079206466675, "step": 2938 }, { "epoch": 2.0006809669731016, "grad_norm": 1.7357453107833862, "learning_rate": 7.002286447686386e-06, "loss": 0.5691, "mean_token_accuracy": 0.8027819097042084, "step": 2939 }, { "epoch": 2.0013619339462037, "grad_norm": 1.7497780323028564, "learning_rate": 7.000222075740052e-06, "loss": 0.4495, "mean_token_accuracy": 0.845757395029068, "step": 2940 }, { "epoch": 2.0020429009193053, "grad_norm": 1.6850841045379639, "learning_rate": 6.998157297781949e-06, "loss": 0.4751, "mean_token_accuracy": 0.8335265219211578, "step": 2941 }, { "epoch": 2.0027238678924073, "grad_norm": 1.691750168800354, "learning_rate": 6.996092114231191e-06, "loss": 0.4898, "mean_token_accuracy": 0.8298549652099609, "step": 2942 }, { "epoch": 2.003404834865509, "grad_norm": 1.6246052980422974, "learning_rate": 6.994026525506975e-06, "loss": 0.5818, "mean_token_accuracy": 0.7914283275604248, "step": 2943 }, { "epoch": 2.004085801838611, "grad_norm": 1.8262221813201904, "learning_rate": 6.991960532028584e-06, "loss": 0.5011, "mean_token_accuracy": 0.8123665750026703, "step": 2944 }, { "epoch": 2.0047667688117126, "grad_norm": 2.153334379196167, "learning_rate": 6.989894134215378e-06, "loss": 0.5426, "mean_token_accuracy": 0.802422046661377, "step": 2945 }, { "epoch": 2.0054477357848146, "grad_norm": 2.2675154209136963, "learning_rate": 6.987827332486803e-06, "loss": 0.4494, "mean_token_accuracy": 0.846182256937027, "step": 2946 }, { "epoch": 2.006128702757916, "grad_norm": 1.9422893524169922, "learning_rate": 6.985760127262382e-06, "loss": 0.58, "mean_token_accuracy": 0.7985585629940033, "step": 2947 }, { "epoch": 2.0068096697310183, "grad_norm": 1.891280174255371, "learning_rate": 6.983692518961727e-06, "loss": 0.5508, "mean_token_accuracy": 0.801910400390625, "step": 2948 }, { "epoch": 2.00749063670412, "grad_norm": 1.7312198877334595, "learning_rate": 6.981624508004527e-06, "loss": 0.547, "mean_token_accuracy": 0.7900567352771759, "step": 2949 }, { "epoch": 2.0081716036772215, "grad_norm": 1.8253819942474365, "learning_rate": 6.979556094810553e-06, "loss": 0.4869, "mean_token_accuracy": 0.8329333662986755, "step": 2950 }, { "epoch": 2.0088525706503235, "grad_norm": 1.770350694656372, "learning_rate": 6.97748727979966e-06, "loss": 0.4693, "mean_token_accuracy": 0.7952465713024139, "step": 2951 }, { "epoch": 2.009533537623425, "grad_norm": 1.5132150650024414, "learning_rate": 6.97541806339178e-06, "loss": 0.6264, "mean_token_accuracy": 0.7847740650177002, "step": 2952 }, { "epoch": 2.010214504596527, "grad_norm": 1.7848076820373535, "learning_rate": 6.973348446006933e-06, "loss": 0.4649, "mean_token_accuracy": 0.8397520780563354, "step": 2953 }, { "epoch": 2.010895471569629, "grad_norm": 1.8043678998947144, "learning_rate": 6.971278428065214e-06, "loss": 0.6236, "mean_token_accuracy": 0.7781676948070526, "step": 2954 }, { "epoch": 2.011576438542731, "grad_norm": 1.8593765497207642, "learning_rate": 6.969208009986803e-06, "loss": 0.4141, "mean_token_accuracy": 0.8622273206710815, "step": 2955 }, { "epoch": 2.0122574055158324, "grad_norm": 1.886979341506958, "learning_rate": 6.967137192191963e-06, "loss": 0.349, "mean_token_accuracy": 0.8835228085517883, "step": 2956 }, { "epoch": 2.0129383724889345, "grad_norm": 1.7104976177215576, "learning_rate": 6.965065975101032e-06, "loss": 0.6461, "mean_token_accuracy": 0.7569501101970673, "step": 2957 }, { "epoch": 2.013619339462036, "grad_norm": 2.0906593799591064, "learning_rate": 6.962994359134433e-06, "loss": 0.5288, "mean_token_accuracy": 0.8084585070610046, "step": 2958 }, { "epoch": 2.0143003064351377, "grad_norm": 1.7722917795181274, "learning_rate": 6.960922344712671e-06, "loss": 0.533, "mean_token_accuracy": 0.7752386629581451, "step": 2959 }, { "epoch": 2.0149812734082397, "grad_norm": 2.0117146968841553, "learning_rate": 6.9588499322563304e-06, "loss": 0.4251, "mean_token_accuracy": 0.8501417338848114, "step": 2960 }, { "epoch": 2.0156622403813413, "grad_norm": 1.789744257926941, "learning_rate": 6.956777122186076e-06, "loss": 0.4107, "mean_token_accuracy": 0.8634348511695862, "step": 2961 }, { "epoch": 2.0163432073544434, "grad_norm": 1.8329695463180542, "learning_rate": 6.954703914922653e-06, "loss": 0.5389, "mean_token_accuracy": 0.7994756102561951, "step": 2962 }, { "epoch": 2.017024174327545, "grad_norm": 1.7534966468811035, "learning_rate": 6.952630310886888e-06, "loss": 0.6488, "mean_token_accuracy": 0.7769661247730255, "step": 2963 }, { "epoch": 2.017705141300647, "grad_norm": 1.6976571083068848, "learning_rate": 6.9505563104996886e-06, "loss": 0.4893, "mean_token_accuracy": 0.8260670900344849, "step": 2964 }, { "epoch": 2.0183861082737486, "grad_norm": 1.6345800161361694, "learning_rate": 6.9484819141820425e-06, "loss": 0.5324, "mean_token_accuracy": 0.790162593126297, "step": 2965 }, { "epoch": 2.0190670752468507, "grad_norm": 1.7558212280273438, "learning_rate": 6.946407122355019e-06, "loss": 0.411, "mean_token_accuracy": 0.859899491071701, "step": 2966 }, { "epoch": 2.0197480422199523, "grad_norm": 1.7621279954910278, "learning_rate": 6.944331935439762e-06, "loss": 0.3645, "mean_token_accuracy": 0.8662325143814087, "step": 2967 }, { "epoch": 2.0204290091930543, "grad_norm": 1.780436396598816, "learning_rate": 6.942256353857505e-06, "loss": 0.3938, "mean_token_accuracy": 0.856524646282196, "step": 2968 }, { "epoch": 2.021109976166156, "grad_norm": 1.6332710981369019, "learning_rate": 6.940180378029553e-06, "loss": 0.6441, "mean_token_accuracy": 0.7931933403015137, "step": 2969 }, { "epoch": 2.0217909431392576, "grad_norm": 1.733354926109314, "learning_rate": 6.9381040083772946e-06, "loss": 0.4733, "mean_token_accuracy": 0.8263077735900879, "step": 2970 }, { "epoch": 2.0224719101123596, "grad_norm": 1.7305712699890137, "learning_rate": 6.936027245322201e-06, "loss": 0.4228, "mean_token_accuracy": 0.8682672679424286, "step": 2971 }, { "epoch": 2.023152877085461, "grad_norm": 1.7489063739776611, "learning_rate": 6.933950089285819e-06, "loss": 0.5326, "mean_token_accuracy": 0.7930766344070435, "step": 2972 }, { "epoch": 2.0238338440585633, "grad_norm": 1.6486245393753052, "learning_rate": 6.931872540689775e-06, "loss": 0.598, "mean_token_accuracy": 0.7916372120380402, "step": 2973 }, { "epoch": 2.024514811031665, "grad_norm": 1.8617854118347168, "learning_rate": 6.929794599955778e-06, "loss": 0.4065, "mean_token_accuracy": 0.8563220202922821, "step": 2974 }, { "epoch": 2.025195778004767, "grad_norm": 1.701292872428894, "learning_rate": 6.927716267505617e-06, "loss": 0.5845, "mean_token_accuracy": 0.8109921514987946, "step": 2975 }, { "epoch": 2.0258767449778685, "grad_norm": 1.7701376676559448, "learning_rate": 6.925637543761157e-06, "loss": 0.5159, "mean_token_accuracy": 0.802451491355896, "step": 2976 }, { "epoch": 2.0265577119509706, "grad_norm": 1.7474597692489624, "learning_rate": 6.923558429144346e-06, "loss": 0.4605, "mean_token_accuracy": 0.829571932554245, "step": 2977 }, { "epoch": 2.027238678924072, "grad_norm": 1.738660454750061, "learning_rate": 6.921478924077206e-06, "loss": 0.5173, "mean_token_accuracy": 0.810337096452713, "step": 2978 }, { "epoch": 2.0279196458971738, "grad_norm": 1.736992597579956, "learning_rate": 6.9193990289818455e-06, "loss": 0.6453, "mean_token_accuracy": 0.7752413153648376, "step": 2979 }, { "epoch": 2.028600612870276, "grad_norm": 1.8090232610702515, "learning_rate": 6.917318744280448e-06, "loss": 0.4662, "mean_token_accuracy": 0.8456089198589325, "step": 2980 }, { "epoch": 2.0292815798433774, "grad_norm": 1.7668931484222412, "learning_rate": 6.915238070395275e-06, "loss": 0.4472, "mean_token_accuracy": 0.85069739818573, "step": 2981 }, { "epoch": 2.0299625468164795, "grad_norm": 1.59562087059021, "learning_rate": 6.913157007748671e-06, "loss": 0.6173, "mean_token_accuracy": 0.774705171585083, "step": 2982 }, { "epoch": 2.030643513789581, "grad_norm": 1.7538572549819946, "learning_rate": 6.911075556763055e-06, "loss": 0.5536, "mean_token_accuracy": 0.8072764277458191, "step": 2983 }, { "epoch": 2.031324480762683, "grad_norm": 1.7840160131454468, "learning_rate": 6.908993717860928e-06, "loss": 0.436, "mean_token_accuracy": 0.8385974764823914, "step": 2984 }, { "epoch": 2.0320054477357847, "grad_norm": 1.662663221359253, "learning_rate": 6.906911491464867e-06, "loss": 0.6152, "mean_token_accuracy": 0.7906351089477539, "step": 2985 }, { "epoch": 2.032686414708887, "grad_norm": 1.7998888492584229, "learning_rate": 6.904828877997531e-06, "loss": 0.3979, "mean_token_accuracy": 0.8620562255382538, "step": 2986 }, { "epoch": 2.0333673816819884, "grad_norm": 1.9155522584915161, "learning_rate": 6.9027458778816566e-06, "loss": 0.5185, "mean_token_accuracy": 0.8191118240356445, "step": 2987 }, { "epoch": 2.0340483486550904, "grad_norm": 1.7279006242752075, "learning_rate": 6.900662491540056e-06, "loss": 0.4772, "mean_token_accuracy": 0.8316398561000824, "step": 2988 }, { "epoch": 2.034729315628192, "grad_norm": 1.8462172746658325, "learning_rate": 6.898578719395622e-06, "loss": 0.4468, "mean_token_accuracy": 0.8428451716899872, "step": 2989 }, { "epoch": 2.0354102826012936, "grad_norm": 1.712181568145752, "learning_rate": 6.896494561871328e-06, "loss": 0.4452, "mean_token_accuracy": 0.8204884231090546, "step": 2990 }, { "epoch": 2.0360912495743957, "grad_norm": 1.6154361963272095, "learning_rate": 6.894410019390221e-06, "loss": 0.557, "mean_token_accuracy": 0.799311101436615, "step": 2991 }, { "epoch": 2.0367722165474973, "grad_norm": 1.9486002922058105, "learning_rate": 6.8923250923754304e-06, "loss": 0.4137, "mean_token_accuracy": 0.8562201857566833, "step": 2992 }, { "epoch": 2.0374531835205993, "grad_norm": 1.6204317808151245, "learning_rate": 6.890239781250158e-06, "loss": 0.6234, "mean_token_accuracy": 0.7858479619026184, "step": 2993 }, { "epoch": 2.038134150493701, "grad_norm": 1.8636976480484009, "learning_rate": 6.8881540864376925e-06, "loss": 0.4765, "mean_token_accuracy": 0.8334877490997314, "step": 2994 }, { "epoch": 2.038815117466803, "grad_norm": 1.8322168588638306, "learning_rate": 6.886068008361391e-06, "loss": 0.4791, "mean_token_accuracy": 0.8323057889938354, "step": 2995 }, { "epoch": 2.0394960844399046, "grad_norm": 1.788503646850586, "learning_rate": 6.883981547444693e-06, "loss": 0.4674, "mean_token_accuracy": 0.8388683497905731, "step": 2996 }, { "epoch": 2.0401770514130066, "grad_norm": 1.6894137859344482, "learning_rate": 6.8818947041111176e-06, "loss": 0.6101, "mean_token_accuracy": 0.7775295376777649, "step": 2997 }, { "epoch": 2.0408580183861083, "grad_norm": 1.7701241970062256, "learning_rate": 6.8798074787842585e-06, "loss": 0.5238, "mean_token_accuracy": 0.8234963119029999, "step": 2998 }, { "epoch": 2.0415389853592103, "grad_norm": 1.8974032402038574, "learning_rate": 6.8777198718877846e-06, "loss": 0.4, "mean_token_accuracy": 0.8464080691337585, "step": 2999 }, { "epoch": 2.042219952332312, "grad_norm": 1.826179027557373, "learning_rate": 6.8756318838454494e-06, "loss": 0.3791, "mean_token_accuracy": 0.867959201335907, "step": 3000 }, { "epoch": 2.0429009193054135, "grad_norm": 1.6847373247146606, "learning_rate": 6.87354351508108e-06, "loss": 0.7403, "mean_token_accuracy": 0.7338506281375885, "step": 3001 }, { "epoch": 2.0435818862785156, "grad_norm": 1.7722690105438232, "learning_rate": 6.871454766018577e-06, "loss": 0.459, "mean_token_accuracy": 0.8480782508850098, "step": 3002 }, { "epoch": 2.044262853251617, "grad_norm": 1.8840281963348389, "learning_rate": 6.869365637081922e-06, "loss": 0.4294, "mean_token_accuracy": 0.8458755612373352, "step": 3003 }, { "epoch": 2.044943820224719, "grad_norm": 1.8239628076553345, "learning_rate": 6.867276128695176e-06, "loss": 0.4, "mean_token_accuracy": 0.8611247837543488, "step": 3004 }, { "epoch": 2.045624787197821, "grad_norm": 1.7866660356521606, "learning_rate": 6.865186241282473e-06, "loss": 0.5725, "mean_token_accuracy": 0.8113425970077515, "step": 3005 }, { "epoch": 2.046305754170923, "grad_norm": 1.7480708360671997, "learning_rate": 6.863095975268026e-06, "loss": 0.5731, "mean_token_accuracy": 0.7972377836704254, "step": 3006 }, { "epoch": 2.0469867211440245, "grad_norm": 1.7840685844421387, "learning_rate": 6.861005331076123e-06, "loss": 0.5029, "mean_token_accuracy": 0.8210969567298889, "step": 3007 }, { "epoch": 2.0476676881171265, "grad_norm": 1.8055565357208252, "learning_rate": 6.858914309131131e-06, "loss": 0.4711, "mean_token_accuracy": 0.8366374969482422, "step": 3008 }, { "epoch": 2.048348655090228, "grad_norm": 1.8569306135177612, "learning_rate": 6.856822909857492e-06, "loss": 0.4163, "mean_token_accuracy": 0.8582255840301514, "step": 3009 }, { "epoch": 2.0490296220633297, "grad_norm": 1.8505269289016724, "learning_rate": 6.854731133679725e-06, "loss": 0.5611, "mean_token_accuracy": 0.7751711010932922, "step": 3010 }, { "epoch": 2.0497105890364318, "grad_norm": 1.75973379611969, "learning_rate": 6.852638981022426e-06, "loss": 0.4924, "mean_token_accuracy": 0.8024227619171143, "step": 3011 }, { "epoch": 2.0503915560095334, "grad_norm": 1.7985687255859375, "learning_rate": 6.850546452310268e-06, "loss": 0.421, "mean_token_accuracy": 0.8614809811115265, "step": 3012 }, { "epoch": 2.0510725229826354, "grad_norm": 1.6325494050979614, "learning_rate": 6.848453547967999e-06, "loss": 0.6684, "mean_token_accuracy": 0.7864308655261993, "step": 3013 }, { "epoch": 2.051753489955737, "grad_norm": 1.6636158227920532, "learning_rate": 6.846360268420443e-06, "loss": 0.3689, "mean_token_accuracy": 0.8623292148113251, "step": 3014 }, { "epoch": 2.052434456928839, "grad_norm": 1.6330523490905762, "learning_rate": 6.8442666140925e-06, "loss": 0.5229, "mean_token_accuracy": 0.8197160065174103, "step": 3015 }, { "epoch": 2.0531154239019407, "grad_norm": 1.6983107328414917, "learning_rate": 6.842172585409153e-06, "loss": 0.5594, "mean_token_accuracy": 0.8151257336139679, "step": 3016 }, { "epoch": 2.0537963908750427, "grad_norm": 1.715968132019043, "learning_rate": 6.840078182795447e-06, "loss": 0.5032, "mean_token_accuracy": 0.8318223059177399, "step": 3017 }, { "epoch": 2.0544773578481443, "grad_norm": 1.8240102529525757, "learning_rate": 6.837983406676514e-06, "loss": 0.3865, "mean_token_accuracy": 0.8672555983066559, "step": 3018 }, { "epoch": 2.0551583248212464, "grad_norm": 1.8571504354476929, "learning_rate": 6.835888257477559e-06, "loss": 0.4614, "mean_token_accuracy": 0.8426759541034698, "step": 3019 }, { "epoch": 2.055839291794348, "grad_norm": 1.8893728256225586, "learning_rate": 6.833792735623863e-06, "loss": 0.5201, "mean_token_accuracy": 0.7897457778453827, "step": 3020 }, { "epoch": 2.0565202587674496, "grad_norm": 1.9310246706008911, "learning_rate": 6.831696841540781e-06, "loss": 0.5499, "mean_token_accuracy": 0.7957426607608795, "step": 3021 }, { "epoch": 2.0572012257405516, "grad_norm": 1.8930084705352783, "learning_rate": 6.829600575653743e-06, "loss": 0.4237, "mean_token_accuracy": 0.8332934677600861, "step": 3022 }, { "epoch": 2.0578821927136532, "grad_norm": 1.7511870861053467, "learning_rate": 6.82750393838826e-06, "loss": 0.5086, "mean_token_accuracy": 0.824735164642334, "step": 3023 }, { "epoch": 2.0585631596867553, "grad_norm": 1.8577877283096313, "learning_rate": 6.825406930169913e-06, "loss": 0.4178, "mean_token_accuracy": 0.8532837331295013, "step": 3024 }, { "epoch": 2.059244126659857, "grad_norm": 1.7919191122055054, "learning_rate": 6.8233095514243576e-06, "loss": 0.4638, "mean_token_accuracy": 0.8246636390686035, "step": 3025 }, { "epoch": 2.059925093632959, "grad_norm": 1.731773018836975, "learning_rate": 6.821211802577328e-06, "loss": 0.5304, "mean_token_accuracy": 0.8089438378810883, "step": 3026 }, { "epoch": 2.0606060606060606, "grad_norm": 1.649696946144104, "learning_rate": 6.819113684054634e-06, "loss": 0.6003, "mean_token_accuracy": 0.7860898971557617, "step": 3027 }, { "epoch": 2.0612870275791626, "grad_norm": 1.8900185823440552, "learning_rate": 6.8170151962821575e-06, "loss": 0.4249, "mean_token_accuracy": 0.8531849682331085, "step": 3028 }, { "epoch": 2.061967994552264, "grad_norm": 1.7572197914123535, "learning_rate": 6.814916339685855e-06, "loss": 0.5347, "mean_token_accuracy": 0.7999382615089417, "step": 3029 }, { "epoch": 2.062648961525366, "grad_norm": 1.7194650173187256, "learning_rate": 6.812817114691761e-06, "loss": 0.3941, "mean_token_accuracy": 0.8622331917285919, "step": 3030 }, { "epoch": 2.063329928498468, "grad_norm": 1.7307512760162354, "learning_rate": 6.810717521725984e-06, "loss": 0.6493, "mean_token_accuracy": 0.7732617259025574, "step": 3031 }, { "epoch": 2.0640108954715695, "grad_norm": 1.7885879278182983, "learning_rate": 6.808617561214703e-06, "loss": 0.5412, "mean_token_accuracy": 0.7923894226551056, "step": 3032 }, { "epoch": 2.0646918624446715, "grad_norm": 1.767578363418579, "learning_rate": 6.80651723358418e-06, "loss": 0.496, "mean_token_accuracy": 0.8312160074710846, "step": 3033 }, { "epoch": 2.065372829417773, "grad_norm": 1.8235539197921753, "learning_rate": 6.804416539260743e-06, "loss": 0.5748, "mean_token_accuracy": 0.7953696250915527, "step": 3034 }, { "epoch": 2.066053796390875, "grad_norm": 1.7886263132095337, "learning_rate": 6.8023154786708e-06, "loss": 0.4353, "mean_token_accuracy": 0.8479681313037872, "step": 3035 }, { "epoch": 2.0667347633639768, "grad_norm": 1.5721629858016968, "learning_rate": 6.800214052240828e-06, "loss": 0.5507, "mean_token_accuracy": 0.8032005727291107, "step": 3036 }, { "epoch": 2.067415730337079, "grad_norm": 1.697752594947815, "learning_rate": 6.798112260397385e-06, "loss": 0.5589, "mean_token_accuracy": 0.8026770949363708, "step": 3037 }, { "epoch": 2.0680966973101804, "grad_norm": 1.8937193155288696, "learning_rate": 6.796010103567099e-06, "loss": 0.5637, "mean_token_accuracy": 0.8128458857536316, "step": 3038 }, { "epoch": 2.0687776642832825, "grad_norm": 1.6990340948104858, "learning_rate": 6.793907582176672e-06, "loss": 0.6535, "mean_token_accuracy": 0.7834191620349884, "step": 3039 }, { "epoch": 2.069458631256384, "grad_norm": 1.8081954717636108, "learning_rate": 6.79180469665288e-06, "loss": 0.4225, "mean_token_accuracy": 0.8423987627029419, "step": 3040 }, { "epoch": 2.0701395982294857, "grad_norm": 1.7484551668167114, "learning_rate": 6.7897014474225765e-06, "loss": 0.48, "mean_token_accuracy": 0.8085280954837799, "step": 3041 }, { "epoch": 2.0708205652025877, "grad_norm": 1.6009951829910278, "learning_rate": 6.787597834912684e-06, "loss": 0.5285, "mean_token_accuracy": 0.791002094745636, "step": 3042 }, { "epoch": 2.0715015321756893, "grad_norm": 1.636731743812561, "learning_rate": 6.785493859550202e-06, "loss": 0.5136, "mean_token_accuracy": 0.8119586706161499, "step": 3043 }, { "epoch": 2.0721824991487914, "grad_norm": 1.765044093132019, "learning_rate": 6.783389521762201e-06, "loss": 0.4304, "mean_token_accuracy": 0.8463947772979736, "step": 3044 }, { "epoch": 2.072863466121893, "grad_norm": 1.7634738683700562, "learning_rate": 6.7812848219758265e-06, "loss": 0.5609, "mean_token_accuracy": 0.795606255531311, "step": 3045 }, { "epoch": 2.073544433094995, "grad_norm": 1.9355666637420654, "learning_rate": 6.7791797606183e-06, "loss": 0.4396, "mean_token_accuracy": 0.8536089062690735, "step": 3046 }, { "epoch": 2.0742254000680966, "grad_norm": 1.673628807067871, "learning_rate": 6.77707433811691e-06, "loss": 0.4023, "mean_token_accuracy": 0.8715163171291351, "step": 3047 }, { "epoch": 2.0749063670411987, "grad_norm": 1.664818286895752, "learning_rate": 6.774968554899026e-06, "loss": 0.5589, "mean_token_accuracy": 0.802202045917511, "step": 3048 }, { "epoch": 2.0755873340143003, "grad_norm": 1.894558072090149, "learning_rate": 6.772862411392085e-06, "loss": 0.4333, "mean_token_accuracy": 0.8475740551948547, "step": 3049 }, { "epoch": 2.076268300987402, "grad_norm": 1.618371605873108, "learning_rate": 6.770755908023599e-06, "loss": 0.5686, "mean_token_accuracy": 0.8062464594841003, "step": 3050 }, { "epoch": 2.076949267960504, "grad_norm": 1.8131425380706787, "learning_rate": 6.768649045221154e-06, "loss": 0.5034, "mean_token_accuracy": 0.8222227990627289, "step": 3051 }, { "epoch": 2.0776302349336055, "grad_norm": 1.9583146572113037, "learning_rate": 6.766541823412407e-06, "loss": 0.4774, "mean_token_accuracy": 0.8120732009410858, "step": 3052 }, { "epoch": 2.0783112019067076, "grad_norm": 1.7779372930526733, "learning_rate": 6.764434243025091e-06, "loss": 0.5546, "mean_token_accuracy": 0.8103416562080383, "step": 3053 }, { "epoch": 2.078992168879809, "grad_norm": 1.685203194618225, "learning_rate": 6.762326304487007e-06, "loss": 0.7722, "mean_token_accuracy": 0.7387748658657074, "step": 3054 }, { "epoch": 2.0796731358529112, "grad_norm": 1.8300890922546387, "learning_rate": 6.760218008226032e-06, "loss": 0.4717, "mean_token_accuracy": 0.8385847210884094, "step": 3055 }, { "epoch": 2.080354102826013, "grad_norm": 1.7504608631134033, "learning_rate": 6.758109354670116e-06, "loss": 0.4029, "mean_token_accuracy": 0.8483517467975616, "step": 3056 }, { "epoch": 2.081035069799115, "grad_norm": 1.8117707967758179, "learning_rate": 6.756000344247281e-06, "loss": 0.4329, "mean_token_accuracy": 0.8368720412254333, "step": 3057 }, { "epoch": 2.0817160367722165, "grad_norm": 1.8678120374679565, "learning_rate": 6.75389097738562e-06, "loss": 0.4975, "mean_token_accuracy": 0.8113539814949036, "step": 3058 }, { "epoch": 2.0823970037453186, "grad_norm": 1.9227547645568848, "learning_rate": 6.751781254513299e-06, "loss": 0.41, "mean_token_accuracy": 0.8600229620933533, "step": 3059 }, { "epoch": 2.08307797071842, "grad_norm": 1.568818211555481, "learning_rate": 6.749671176058557e-06, "loss": 0.6381, "mean_token_accuracy": 0.7698249220848083, "step": 3060 }, { "epoch": 2.0837589376915218, "grad_norm": 1.8556894063949585, "learning_rate": 6.747560742449705e-06, "loss": 0.4933, "mean_token_accuracy": 0.7930406928062439, "step": 3061 }, { "epoch": 2.084439904664624, "grad_norm": 1.6544454097747803, "learning_rate": 6.745449954115125e-06, "loss": 0.5197, "mean_token_accuracy": 0.8386875689029694, "step": 3062 }, { "epoch": 2.0851208716377254, "grad_norm": 1.7273932695388794, "learning_rate": 6.743338811483275e-06, "loss": 0.588, "mean_token_accuracy": 0.7915653586387634, "step": 3063 }, { "epoch": 2.0858018386108275, "grad_norm": 1.6861810684204102, "learning_rate": 6.7412273149826765e-06, "loss": 0.4774, "mean_token_accuracy": 0.8167078197002411, "step": 3064 }, { "epoch": 2.086482805583929, "grad_norm": 1.7065541744232178, "learning_rate": 6.739115465041934e-06, "loss": 0.4477, "mean_token_accuracy": 0.8495629131793976, "step": 3065 }, { "epoch": 2.087163772557031, "grad_norm": 1.9428040981292725, "learning_rate": 6.737003262089714e-06, "loss": 0.4567, "mean_token_accuracy": 0.8438577651977539, "step": 3066 }, { "epoch": 2.0878447395301327, "grad_norm": 1.6926970481872559, "learning_rate": 6.734890706554758e-06, "loss": 0.5982, "mean_token_accuracy": 0.7807473838329315, "step": 3067 }, { "epoch": 2.0885257065032348, "grad_norm": 1.6773933172225952, "learning_rate": 6.732777798865882e-06, "loss": 0.5143, "mean_token_accuracy": 0.7746202647686005, "step": 3068 }, { "epoch": 2.0892066734763364, "grad_norm": 1.7511224746704102, "learning_rate": 6.730664539451972e-06, "loss": 0.497, "mean_token_accuracy": 0.819013774394989, "step": 3069 }, { "epoch": 2.0898876404494384, "grad_norm": 1.7178657054901123, "learning_rate": 6.728550928741981e-06, "loss": 0.6283, "mean_token_accuracy": 0.776368647813797, "step": 3070 }, { "epoch": 2.09056860742254, "grad_norm": 1.8723994493484497, "learning_rate": 6.726436967164937e-06, "loss": 0.4562, "mean_token_accuracy": 0.8309962749481201, "step": 3071 }, { "epoch": 2.0912495743956416, "grad_norm": 1.7666912078857422, "learning_rate": 6.724322655149943e-06, "loss": 0.522, "mean_token_accuracy": 0.8335141241550446, "step": 3072 }, { "epoch": 2.0919305413687437, "grad_norm": 1.7527518272399902, "learning_rate": 6.722207993126164e-06, "loss": 0.4375, "mean_token_accuracy": 0.8598190248012543, "step": 3073 }, { "epoch": 2.0926115083418453, "grad_norm": 1.8748749494552612, "learning_rate": 6.7200929815228464e-06, "loss": 0.4969, "mean_token_accuracy": 0.8152442872524261, "step": 3074 }, { "epoch": 2.0932924753149473, "grad_norm": 1.7096550464630127, "learning_rate": 6.717977620769298e-06, "loss": 0.6134, "mean_token_accuracy": 0.7683249413967133, "step": 3075 }, { "epoch": 2.093973442288049, "grad_norm": 1.7804343700408936, "learning_rate": 6.715861911294904e-06, "loss": 0.4131, "mean_token_accuracy": 0.8579076826572418, "step": 3076 }, { "epoch": 2.094654409261151, "grad_norm": 1.8685662746429443, "learning_rate": 6.713745853529117e-06, "loss": 0.3617, "mean_token_accuracy": 0.8751904964447021, "step": 3077 }, { "epoch": 2.0953353762342526, "grad_norm": 1.8745427131652832, "learning_rate": 6.7116294479014625e-06, "loss": 0.4925, "mean_token_accuracy": 0.8001689016819, "step": 3078 }, { "epoch": 2.0960163432073546, "grad_norm": 1.779099464416504, "learning_rate": 6.709512694841534e-06, "loss": 0.3711, "mean_token_accuracy": 0.8658636808395386, "step": 3079 }, { "epoch": 2.0966973101804562, "grad_norm": 1.7415788173675537, "learning_rate": 6.707395594779e-06, "loss": 0.56, "mean_token_accuracy": 0.7781020104885101, "step": 3080 }, { "epoch": 2.097378277153558, "grad_norm": 1.749255895614624, "learning_rate": 6.705278148143595e-06, "loss": 0.6453, "mean_token_accuracy": 0.7763896882534027, "step": 3081 }, { "epoch": 2.09805924412666, "grad_norm": 1.7635297775268555, "learning_rate": 6.703160355365124e-06, "loss": 0.529, "mean_token_accuracy": 0.8172341883182526, "step": 3082 }, { "epoch": 2.0987402110997615, "grad_norm": 1.833112120628357, "learning_rate": 6.701042216873466e-06, "loss": 0.5773, "mean_token_accuracy": 0.767459362745285, "step": 3083 }, { "epoch": 2.0994211780728635, "grad_norm": 1.6281404495239258, "learning_rate": 6.698923733098567e-06, "loss": 0.4346, "mean_token_accuracy": 0.8130261301994324, "step": 3084 }, { "epoch": 2.100102145045965, "grad_norm": 1.753352165222168, "learning_rate": 6.696804904470442e-06, "loss": 0.4768, "mean_token_accuracy": 0.8256202340126038, "step": 3085 }, { "epoch": 2.100783112019067, "grad_norm": 1.6365355253219604, "learning_rate": 6.69468573141918e-06, "loss": 0.5794, "mean_token_accuracy": 0.7927781045436859, "step": 3086 }, { "epoch": 2.101464078992169, "grad_norm": 1.7902075052261353, "learning_rate": 6.692566214374939e-06, "loss": 0.5582, "mean_token_accuracy": 0.8090074956417084, "step": 3087 }, { "epoch": 2.102145045965271, "grad_norm": 1.6838724613189697, "learning_rate": 6.690446353767943e-06, "loss": 0.6376, "mean_token_accuracy": 0.8008783757686615, "step": 3088 }, { "epoch": 2.1028260129383725, "grad_norm": 1.7713446617126465, "learning_rate": 6.68832615002849e-06, "loss": 0.4718, "mean_token_accuracy": 0.8288267850875854, "step": 3089 }, { "epoch": 2.1035069799114745, "grad_norm": 1.801213264465332, "learning_rate": 6.686205603586945e-06, "loss": 0.3974, "mean_token_accuracy": 0.8586208522319794, "step": 3090 }, { "epoch": 2.104187946884576, "grad_norm": 1.8925029039382935, "learning_rate": 6.6840847148737445e-06, "loss": 0.4535, "mean_token_accuracy": 0.8423352241516113, "step": 3091 }, { "epoch": 2.1048689138576777, "grad_norm": 1.7630906105041504, "learning_rate": 6.681963484319394e-06, "loss": 0.4371, "mean_token_accuracy": 0.8516093492507935, "step": 3092 }, { "epoch": 2.1055498808307798, "grad_norm": 1.9167726039886475, "learning_rate": 6.679841912354466e-06, "loss": 0.4588, "mean_token_accuracy": 0.8315687477588654, "step": 3093 }, { "epoch": 2.1062308478038814, "grad_norm": 1.8365161418914795, "learning_rate": 6.677719999409606e-06, "loss": 0.399, "mean_token_accuracy": 0.8608436286449432, "step": 3094 }, { "epoch": 2.1069118147769834, "grad_norm": 1.524660587310791, "learning_rate": 6.675597745915527e-06, "loss": 0.7435, "mean_token_accuracy": 0.7541038691997528, "step": 3095 }, { "epoch": 2.107592781750085, "grad_norm": 1.9786525964736938, "learning_rate": 6.673475152303009e-06, "loss": 0.3971, "mean_token_accuracy": 0.8609915673732758, "step": 3096 }, { "epoch": 2.108273748723187, "grad_norm": 1.678308129310608, "learning_rate": 6.671352219002907e-06, "loss": 0.4724, "mean_token_accuracy": 0.8329270780086517, "step": 3097 }, { "epoch": 2.1089547156962887, "grad_norm": 1.7575689554214478, "learning_rate": 6.6692289464461375e-06, "loss": 0.5098, "mean_token_accuracy": 0.8207334876060486, "step": 3098 }, { "epoch": 2.1096356826693907, "grad_norm": 1.6802237033843994, "learning_rate": 6.667105335063693e-06, "loss": 0.6797, "mean_token_accuracy": 0.7658935487270355, "step": 3099 }, { "epoch": 2.1103166496424923, "grad_norm": 1.7719078063964844, "learning_rate": 6.664981385286626e-06, "loss": 0.459, "mean_token_accuracy": 0.8195448815822601, "step": 3100 }, { "epoch": 2.110997616615594, "grad_norm": 1.931675910949707, "learning_rate": 6.662857097546067e-06, "loss": 0.4385, "mean_token_accuracy": 0.8339181244373322, "step": 3101 }, { "epoch": 2.111678583588696, "grad_norm": 1.5929731130599976, "learning_rate": 6.660732472273211e-06, "loss": 0.4962, "mean_token_accuracy": 0.8108254969120026, "step": 3102 }, { "epoch": 2.1123595505617976, "grad_norm": 1.9597094058990479, "learning_rate": 6.6586075098993196e-06, "loss": 0.5378, "mean_token_accuracy": 0.7963768243789673, "step": 3103 }, { "epoch": 2.1130405175348996, "grad_norm": 2.0126166343688965, "learning_rate": 6.656482210855727e-06, "loss": 0.4583, "mean_token_accuracy": 0.8412914872169495, "step": 3104 }, { "epoch": 2.1137214845080012, "grad_norm": 1.6999661922454834, "learning_rate": 6.654356575573832e-06, "loss": 0.5678, "mean_token_accuracy": 0.8308755159378052, "step": 3105 }, { "epoch": 2.1144024514811033, "grad_norm": 1.8338086605072021, "learning_rate": 6.652230604485103e-06, "loss": 0.4894, "mean_token_accuracy": 0.8249503672122955, "step": 3106 }, { "epoch": 2.115083418454205, "grad_norm": 1.7438254356384277, "learning_rate": 6.650104298021076e-06, "loss": 0.5203, "mean_token_accuracy": 0.7696249783039093, "step": 3107 }, { "epoch": 2.115764385427307, "grad_norm": 1.7757552862167358, "learning_rate": 6.647977656613358e-06, "loss": 0.5086, "mean_token_accuracy": 0.8202524185180664, "step": 3108 }, { "epoch": 2.1164453524004085, "grad_norm": 1.7779394388198853, "learning_rate": 6.645850680693622e-06, "loss": 0.5918, "mean_token_accuracy": 0.7555901110172272, "step": 3109 }, { "epoch": 2.1171263193735106, "grad_norm": 1.759158730506897, "learning_rate": 6.643723370693608e-06, "loss": 0.4278, "mean_token_accuracy": 0.8539618253707886, "step": 3110 }, { "epoch": 2.117807286346612, "grad_norm": 1.9501572847366333, "learning_rate": 6.641595727045122e-06, "loss": 0.4083, "mean_token_accuracy": 0.8631599247455597, "step": 3111 }, { "epoch": 2.118488253319714, "grad_norm": 1.7430907487869263, "learning_rate": 6.639467750180042e-06, "loss": 0.4559, "mean_token_accuracy": 0.8257122933864594, "step": 3112 }, { "epoch": 2.119169220292816, "grad_norm": 1.7949436902999878, "learning_rate": 6.637339440530313e-06, "loss": 0.5079, "mean_token_accuracy": 0.8344006836414337, "step": 3113 }, { "epoch": 2.1198501872659175, "grad_norm": 1.6515710353851318, "learning_rate": 6.6352107985279455e-06, "loss": 0.696, "mean_token_accuracy": 0.7369974255561829, "step": 3114 }, { "epoch": 2.1205311542390195, "grad_norm": 1.8851721286773682, "learning_rate": 6.633081824605019e-06, "loss": 0.4205, "mean_token_accuracy": 0.8591335713863373, "step": 3115 }, { "epoch": 2.121212121212121, "grad_norm": 1.796883463859558, "learning_rate": 6.6309525191936766e-06, "loss": 0.4408, "mean_token_accuracy": 0.8553414344787598, "step": 3116 }, { "epoch": 2.121893088185223, "grad_norm": 1.7492361068725586, "learning_rate": 6.6288228827261365e-06, "loss": 0.6751, "mean_token_accuracy": 0.7728957533836365, "step": 3117 }, { "epoch": 2.1225740551583248, "grad_norm": 1.7513469457626343, "learning_rate": 6.626692915634677e-06, "loss": 0.5589, "mean_token_accuracy": 0.7846992909908295, "step": 3118 }, { "epoch": 2.123255022131427, "grad_norm": 1.7214397192001343, "learning_rate": 6.624562618351646e-06, "loss": 0.5161, "mean_token_accuracy": 0.8185124695301056, "step": 3119 }, { "epoch": 2.1239359891045284, "grad_norm": 1.6905442476272583, "learning_rate": 6.622431991309458e-06, "loss": 0.5636, "mean_token_accuracy": 0.7832719385623932, "step": 3120 }, { "epoch": 2.12461695607763, "grad_norm": 1.808129072189331, "learning_rate": 6.620301034940597e-06, "loss": 0.3382, "mean_token_accuracy": 0.8842048645019531, "step": 3121 }, { "epoch": 2.125297923050732, "grad_norm": 1.82322096824646, "learning_rate": 6.6181697496776084e-06, "loss": 0.4209, "mean_token_accuracy": 0.8482379913330078, "step": 3122 }, { "epoch": 2.1259788900238337, "grad_norm": 1.8705432415008545, "learning_rate": 6.61603813595311e-06, "loss": 0.3598, "mean_token_accuracy": 0.874654620885849, "step": 3123 }, { "epoch": 2.1266598569969357, "grad_norm": 1.7320040464401245, "learning_rate": 6.613906194199783e-06, "loss": 0.4412, "mean_token_accuracy": 0.8289861977100372, "step": 3124 }, { "epoch": 2.1273408239700373, "grad_norm": 1.4681822061538696, "learning_rate": 6.611773924850378e-06, "loss": 0.7014, "mean_token_accuracy": 0.7606703341007233, "step": 3125 }, { "epoch": 2.1280217909431394, "grad_norm": 1.8372607231140137, "learning_rate": 6.609641328337706e-06, "loss": 0.3689, "mean_token_accuracy": 0.8778688907623291, "step": 3126 }, { "epoch": 2.128702757916241, "grad_norm": 1.7791666984558105, "learning_rate": 6.6075084050946514e-06, "loss": 0.3868, "mean_token_accuracy": 0.8646840751171112, "step": 3127 }, { "epoch": 2.129383724889343, "grad_norm": 1.6810451745986938, "learning_rate": 6.605375155554162e-06, "loss": 0.5442, "mean_token_accuracy": 0.7889332175254822, "step": 3128 }, { "epoch": 2.1300646918624446, "grad_norm": 1.9131757020950317, "learning_rate": 6.603241580149251e-06, "loss": 0.5212, "mean_token_accuracy": 0.8300413489341736, "step": 3129 }, { "epoch": 2.1307456588355467, "grad_norm": 1.7163231372833252, "learning_rate": 6.601107679313001e-06, "loss": 0.6237, "mean_token_accuracy": 0.7809596061706543, "step": 3130 }, { "epoch": 2.1314266258086483, "grad_norm": 1.8946316242218018, "learning_rate": 6.598973453478556e-06, "loss": 0.5803, "mean_token_accuracy": 0.7869720757007599, "step": 3131 }, { "epoch": 2.13210759278175, "grad_norm": 1.8102792501449585, "learning_rate": 6.596838903079128e-06, "loss": 0.5321, "mean_token_accuracy": 0.8155505359172821, "step": 3132 }, { "epoch": 2.132788559754852, "grad_norm": 1.845676064491272, "learning_rate": 6.594704028547996e-06, "loss": 0.4781, "mean_token_accuracy": 0.8343968093395233, "step": 3133 }, { "epoch": 2.1334695267279535, "grad_norm": 1.7602384090423584, "learning_rate": 6.592568830318504e-06, "loss": 0.4654, "mean_token_accuracy": 0.8414501845836639, "step": 3134 }, { "epoch": 2.1341504937010556, "grad_norm": 1.6094752550125122, "learning_rate": 6.590433308824064e-06, "loss": 0.6237, "mean_token_accuracy": 0.7870599627494812, "step": 3135 }, { "epoch": 2.134831460674157, "grad_norm": 1.856740951538086, "learning_rate": 6.588297464498148e-06, "loss": 0.4619, "mean_token_accuracy": 0.8331404328346252, "step": 3136 }, { "epoch": 2.1355124276472592, "grad_norm": 1.8437224626541138, "learning_rate": 6.586161297774296e-06, "loss": 0.6144, "mean_token_accuracy": 0.7730868458747864, "step": 3137 }, { "epoch": 2.136193394620361, "grad_norm": 1.8359824419021606, "learning_rate": 6.584024809086118e-06, "loss": 0.4835, "mean_token_accuracy": 0.8295798897743225, "step": 3138 }, { "epoch": 2.136874361593463, "grad_norm": 1.8741041421890259, "learning_rate": 6.5818879988672824e-06, "loss": 0.5036, "mean_token_accuracy": 0.8119248151779175, "step": 3139 }, { "epoch": 2.1375553285665645, "grad_norm": 1.981065034866333, "learning_rate": 6.57975086755153e-06, "loss": 0.4942, "mean_token_accuracy": 0.8323199152946472, "step": 3140 }, { "epoch": 2.1382362955396665, "grad_norm": 1.8933343887329102, "learning_rate": 6.577613415572658e-06, "loss": 0.5583, "mean_token_accuracy": 0.7861518263816833, "step": 3141 }, { "epoch": 2.138917262512768, "grad_norm": 1.7238188982009888, "learning_rate": 6.5754756433645365e-06, "loss": 0.4847, "mean_token_accuracy": 0.7649150788784027, "step": 3142 }, { "epoch": 2.1395982294858698, "grad_norm": 1.777828574180603, "learning_rate": 6.5733375513610975e-06, "loss": 0.4999, "mean_token_accuracy": 0.8108226656913757, "step": 3143 }, { "epoch": 2.140279196458972, "grad_norm": 1.7843372821807861, "learning_rate": 6.571199139996336e-06, "loss": 0.5675, "mean_token_accuracy": 0.7619026303291321, "step": 3144 }, { "epoch": 2.1409601634320734, "grad_norm": 1.6621800661087036, "learning_rate": 6.569060409704317e-06, "loss": 0.4546, "mean_token_accuracy": 0.8364701867103577, "step": 3145 }, { "epoch": 2.1416411304051755, "grad_norm": 1.8414732217788696, "learning_rate": 6.5669213609191664e-06, "loss": 0.5393, "mean_token_accuracy": 0.8210538625717163, "step": 3146 }, { "epoch": 2.142322097378277, "grad_norm": 1.8851008415222168, "learning_rate": 6.564781994075073e-06, "loss": 0.4698, "mean_token_accuracy": 0.8346181213855743, "step": 3147 }, { "epoch": 2.143003064351379, "grad_norm": 1.9127329587936401, "learning_rate": 6.562642309606295e-06, "loss": 0.407, "mean_token_accuracy": 0.8518538177013397, "step": 3148 }, { "epoch": 2.1436840313244807, "grad_norm": 1.9382989406585693, "learning_rate": 6.5605023079471515e-06, "loss": 0.4636, "mean_token_accuracy": 0.8258962035179138, "step": 3149 }, { "epoch": 2.1443649982975828, "grad_norm": 1.831300973892212, "learning_rate": 6.55836198953203e-06, "loss": 0.4192, "mean_token_accuracy": 0.8389306962490082, "step": 3150 }, { "epoch": 2.1450459652706844, "grad_norm": 1.8387117385864258, "learning_rate": 6.556221354795376e-06, "loss": 0.5065, "mean_token_accuracy": 0.8086375892162323, "step": 3151 }, { "epoch": 2.145726932243786, "grad_norm": 1.8337514400482178, "learning_rate": 6.554080404171703e-06, "loss": 0.4456, "mean_token_accuracy": 0.8233310580253601, "step": 3152 }, { "epoch": 2.146407899216888, "grad_norm": 1.9178768396377563, "learning_rate": 6.551939138095589e-06, "loss": 0.3418, "mean_token_accuracy": 0.8845617175102234, "step": 3153 }, { "epoch": 2.1470888661899896, "grad_norm": 1.6946038007736206, "learning_rate": 6.549797557001676e-06, "loss": 0.4412, "mean_token_accuracy": 0.8432695269584656, "step": 3154 }, { "epoch": 2.1477698331630917, "grad_norm": 1.8044490814208984, "learning_rate": 6.547655661324671e-06, "loss": 0.3866, "mean_token_accuracy": 0.8581258058547974, "step": 3155 }, { "epoch": 2.1484508001361933, "grad_norm": 1.7324074506759644, "learning_rate": 6.545513451499339e-06, "loss": 0.4113, "mean_token_accuracy": 0.8487467169761658, "step": 3156 }, { "epoch": 2.1491317671092953, "grad_norm": 1.7087764739990234, "learning_rate": 6.543370927960515e-06, "loss": 0.4674, "mean_token_accuracy": 0.8385210335254669, "step": 3157 }, { "epoch": 2.149812734082397, "grad_norm": 1.8719544410705566, "learning_rate": 6.5412280911430966e-06, "loss": 0.3849, "mean_token_accuracy": 0.8355147242546082, "step": 3158 }, { "epoch": 2.150493701055499, "grad_norm": 1.886191487312317, "learning_rate": 6.539084941482042e-06, "loss": 0.5123, "mean_token_accuracy": 0.8150842189788818, "step": 3159 }, { "epoch": 2.1511746680286006, "grad_norm": 1.903635025024414, "learning_rate": 6.536941479412377e-06, "loss": 0.5673, "mean_token_accuracy": 0.7996192276477814, "step": 3160 }, { "epoch": 2.151855635001702, "grad_norm": 1.7883399724960327, "learning_rate": 6.534797705369187e-06, "loss": 0.6209, "mean_token_accuracy": 0.7730801105499268, "step": 3161 }, { "epoch": 2.1525366019748042, "grad_norm": 1.7870240211486816, "learning_rate": 6.5326536197876235e-06, "loss": 0.5077, "mean_token_accuracy": 0.8091237843036652, "step": 3162 }, { "epoch": 2.153217568947906, "grad_norm": 1.7776131629943848, "learning_rate": 6.530509223102899e-06, "loss": 0.539, "mean_token_accuracy": 0.7874128222465515, "step": 3163 }, { "epoch": 2.153898535921008, "grad_norm": 1.7625150680541992, "learning_rate": 6.528364515750291e-06, "loss": 0.4932, "mean_token_accuracy": 0.8310121595859528, "step": 3164 }, { "epoch": 2.1545795028941095, "grad_norm": 1.763295292854309, "learning_rate": 6.526219498165141e-06, "loss": 0.4843, "mean_token_accuracy": 0.8297437727451324, "step": 3165 }, { "epoch": 2.1552604698672115, "grad_norm": 1.8024483919143677, "learning_rate": 6.524074170782848e-06, "loss": 0.5263, "mean_token_accuracy": 0.7958525717258453, "step": 3166 }, { "epoch": 2.155941436840313, "grad_norm": 1.9764724969863892, "learning_rate": 6.52192853403888e-06, "loss": 0.4726, "mean_token_accuracy": 0.8353771269321442, "step": 3167 }, { "epoch": 2.156622403813415, "grad_norm": 1.780707597732544, "learning_rate": 6.519782588368766e-06, "loss": 0.4713, "mean_token_accuracy": 0.8170210123062134, "step": 3168 }, { "epoch": 2.157303370786517, "grad_norm": 1.7854868173599243, "learning_rate": 6.517636334208097e-06, "loss": 0.4779, "mean_token_accuracy": 0.838921070098877, "step": 3169 }, { "epoch": 2.157984337759619, "grad_norm": 1.9166629314422607, "learning_rate": 6.515489771992528e-06, "loss": 0.3588, "mean_token_accuracy": 0.8797295987606049, "step": 3170 }, { "epoch": 2.1586653047327204, "grad_norm": 1.7431988716125488, "learning_rate": 6.513342902157772e-06, "loss": 0.5445, "mean_token_accuracy": 0.8044962286949158, "step": 3171 }, { "epoch": 2.1593462717058225, "grad_norm": 1.833866834640503, "learning_rate": 6.51119572513961e-06, "loss": 0.4832, "mean_token_accuracy": 0.8262947797775269, "step": 3172 }, { "epoch": 2.160027238678924, "grad_norm": 1.717753291130066, "learning_rate": 6.5090482413738836e-06, "loss": 0.5791, "mean_token_accuracy": 0.7647446691989899, "step": 3173 }, { "epoch": 2.1607082056520257, "grad_norm": 1.739453673362732, "learning_rate": 6.506900451296494e-06, "loss": 0.5064, "mean_token_accuracy": 0.815268486738205, "step": 3174 }, { "epoch": 2.1613891726251278, "grad_norm": 1.7807821035385132, "learning_rate": 6.504752355343409e-06, "loss": 0.4597, "mean_token_accuracy": 0.8374549448490143, "step": 3175 }, { "epoch": 2.1620701395982294, "grad_norm": 1.8356547355651855, "learning_rate": 6.502603953950657e-06, "loss": 0.456, "mean_token_accuracy": 0.8203973472118378, "step": 3176 }, { "epoch": 2.1627511065713314, "grad_norm": 1.8455431461334229, "learning_rate": 6.500455247554326e-06, "loss": 0.5168, "mean_token_accuracy": 0.8305007219314575, "step": 3177 }, { "epoch": 2.163432073544433, "grad_norm": 1.8445154428482056, "learning_rate": 6.498306236590567e-06, "loss": 0.604, "mean_token_accuracy": 0.7904880046844482, "step": 3178 }, { "epoch": 2.164113040517535, "grad_norm": 1.8916040658950806, "learning_rate": 6.496156921495594e-06, "loss": 0.4342, "mean_token_accuracy": 0.8528663516044617, "step": 3179 }, { "epoch": 2.1647940074906367, "grad_norm": 1.7709128856658936, "learning_rate": 6.494007302705684e-06, "loss": 0.3947, "mean_token_accuracy": 0.8648788332939148, "step": 3180 }, { "epoch": 2.1654749744637387, "grad_norm": 1.7786800861358643, "learning_rate": 6.4918573806571735e-06, "loss": 0.3202, "mean_token_accuracy": 0.8874160051345825, "step": 3181 }, { "epoch": 2.1661559414368403, "grad_norm": 1.7129708528518677, "learning_rate": 6.489707155786458e-06, "loss": 0.4053, "mean_token_accuracy": 0.8539929986000061, "step": 3182 }, { "epoch": 2.166836908409942, "grad_norm": 1.8507537841796875, "learning_rate": 6.48755662853e-06, "loss": 0.5828, "mean_token_accuracy": 0.7937681078910828, "step": 3183 }, { "epoch": 2.167517875383044, "grad_norm": 2.0524206161499023, "learning_rate": 6.485405799324318e-06, "loss": 0.4858, "mean_token_accuracy": 0.8298144936561584, "step": 3184 }, { "epoch": 2.1681988423561456, "grad_norm": 1.688044548034668, "learning_rate": 6.483254668605998e-06, "loss": 0.5298, "mean_token_accuracy": 0.8084592223167419, "step": 3185 }, { "epoch": 2.1688798093292476, "grad_norm": 1.7785520553588867, "learning_rate": 6.481103236811684e-06, "loss": 0.4951, "mean_token_accuracy": 0.8189094066619873, "step": 3186 }, { "epoch": 2.1695607763023492, "grad_norm": 1.8291425704956055, "learning_rate": 6.478951504378075e-06, "loss": 0.5803, "mean_token_accuracy": 0.795171707868576, "step": 3187 }, { "epoch": 2.1702417432754513, "grad_norm": 1.934332251548767, "learning_rate": 6.476799471741944e-06, "loss": 0.472, "mean_token_accuracy": 0.835585206747055, "step": 3188 }, { "epoch": 2.170922710248553, "grad_norm": 1.800051212310791, "learning_rate": 6.474647139340112e-06, "loss": 0.5319, "mean_token_accuracy": 0.8103699386119843, "step": 3189 }, { "epoch": 2.171603677221655, "grad_norm": 2.000412940979004, "learning_rate": 6.472494507609471e-06, "loss": 0.4083, "mean_token_accuracy": 0.8461179435253143, "step": 3190 }, { "epoch": 2.1722846441947565, "grad_norm": 1.7796592712402344, "learning_rate": 6.470341576986967e-06, "loss": 0.4928, "mean_token_accuracy": 0.8313463032245636, "step": 3191 }, { "epoch": 2.172965611167858, "grad_norm": 1.6874091625213623, "learning_rate": 6.4681883479096096e-06, "loss": 0.4045, "mean_token_accuracy": 0.8603494465351105, "step": 3192 }, { "epoch": 2.17364657814096, "grad_norm": 2.0190277099609375, "learning_rate": 6.466034820814469e-06, "loss": 0.5046, "mean_token_accuracy": 0.823521077632904, "step": 3193 }, { "epoch": 2.174327545114062, "grad_norm": 1.8615080118179321, "learning_rate": 6.463880996138673e-06, "loss": 0.5452, "mean_token_accuracy": 0.807406097650528, "step": 3194 }, { "epoch": 2.175008512087164, "grad_norm": 1.8022621870040894, "learning_rate": 6.461726874319414e-06, "loss": 0.479, "mean_token_accuracy": 0.8256226181983948, "step": 3195 }, { "epoch": 2.1756894790602654, "grad_norm": 1.8178452253341675, "learning_rate": 6.4595724557939465e-06, "loss": 0.4939, "mean_token_accuracy": 0.8249292969703674, "step": 3196 }, { "epoch": 2.1763704460333675, "grad_norm": 1.6757057905197144, "learning_rate": 6.457417740999574e-06, "loss": 0.5169, "mean_token_accuracy": 0.8140959441661835, "step": 3197 }, { "epoch": 2.177051413006469, "grad_norm": 1.8927104473114014, "learning_rate": 6.455262730373673e-06, "loss": 0.3821, "mean_token_accuracy": 0.8692697584629059, "step": 3198 }, { "epoch": 2.177732379979571, "grad_norm": 1.9496276378631592, "learning_rate": 6.453107424353673e-06, "loss": 0.4667, "mean_token_accuracy": 0.83524289727211, "step": 3199 }, { "epoch": 2.1784133469526727, "grad_norm": 1.989438533782959, "learning_rate": 6.450951823377066e-06, "loss": 0.3797, "mean_token_accuracy": 0.8602637946605682, "step": 3200 }, { "epoch": 2.179094313925775, "grad_norm": 1.8685446977615356, "learning_rate": 6.448795927881404e-06, "loss": 0.4021, "mean_token_accuracy": 0.8626025915145874, "step": 3201 }, { "epoch": 2.1797752808988764, "grad_norm": 1.7532293796539307, "learning_rate": 6.446639738304294e-06, "loss": 0.6512, "mean_token_accuracy": 0.7751094400882721, "step": 3202 }, { "epoch": 2.180456247871978, "grad_norm": 1.8324023485183716, "learning_rate": 6.444483255083411e-06, "loss": 0.4108, "mean_token_accuracy": 0.8690150380134583, "step": 3203 }, { "epoch": 2.18113721484508, "grad_norm": 1.701759934425354, "learning_rate": 6.442326478656483e-06, "loss": 0.5543, "mean_token_accuracy": 0.8018774390220642, "step": 3204 }, { "epoch": 2.1818181818181817, "grad_norm": 1.721466302871704, "learning_rate": 6.4401694094612985e-06, "loss": 0.563, "mean_token_accuracy": 0.7673637270927429, "step": 3205 }, { "epoch": 2.1824991487912837, "grad_norm": 1.8890239000320435, "learning_rate": 6.438012047935711e-06, "loss": 0.4566, "mean_token_accuracy": 0.83802330493927, "step": 3206 }, { "epoch": 2.1831801157643853, "grad_norm": 1.6978334188461304, "learning_rate": 6.435854394517624e-06, "loss": 0.4942, "mean_token_accuracy": 0.8276210129261017, "step": 3207 }, { "epoch": 2.1838610827374874, "grad_norm": 1.8812144994735718, "learning_rate": 6.433696449645008e-06, "loss": 0.4518, "mean_token_accuracy": 0.846089780330658, "step": 3208 }, { "epoch": 2.184542049710589, "grad_norm": 1.7429791688919067, "learning_rate": 6.4315382137558894e-06, "loss": 0.3343, "mean_token_accuracy": 0.8882235884666443, "step": 3209 }, { "epoch": 2.185223016683691, "grad_norm": 1.7389099597930908, "learning_rate": 6.429379687288353e-06, "loss": 0.4959, "mean_token_accuracy": 0.8221446573734283, "step": 3210 }, { "epoch": 2.1859039836567926, "grad_norm": 1.9053276777267456, "learning_rate": 6.4272208706805475e-06, "loss": 0.3827, "mean_token_accuracy": 0.8637125790119171, "step": 3211 }, { "epoch": 2.1865849506298947, "grad_norm": 1.7600291967391968, "learning_rate": 6.42506176437067e-06, "loss": 0.4976, "mean_token_accuracy": 0.810449481010437, "step": 3212 }, { "epoch": 2.1872659176029963, "grad_norm": 1.6625733375549316, "learning_rate": 6.422902368796989e-06, "loss": 0.5731, "mean_token_accuracy": 0.7880772352218628, "step": 3213 }, { "epoch": 2.187946884576098, "grad_norm": 1.9539443254470825, "learning_rate": 6.420742684397823e-06, "loss": 0.4671, "mean_token_accuracy": 0.8391905725002289, "step": 3214 }, { "epoch": 2.1886278515492, "grad_norm": 1.7392603158950806, "learning_rate": 6.418582711611554e-06, "loss": 0.6202, "mean_token_accuracy": 0.7751371562480927, "step": 3215 }, { "epoch": 2.1893088185223015, "grad_norm": 1.8619186878204346, "learning_rate": 6.416422450876619e-06, "loss": 0.432, "mean_token_accuracy": 0.8430376052856445, "step": 3216 }, { "epoch": 2.1899897854954036, "grad_norm": 1.6155627965927124, "learning_rate": 6.414261902631515e-06, "loss": 0.5673, "mean_token_accuracy": 0.7666285932064056, "step": 3217 }, { "epoch": 2.190670752468505, "grad_norm": 1.7631423473358154, "learning_rate": 6.412101067314798e-06, "loss": 0.516, "mean_token_accuracy": 0.81621253490448, "step": 3218 }, { "epoch": 2.1913517194416072, "grad_norm": 1.5892959833145142, "learning_rate": 6.40993994536508e-06, "loss": 0.6432, "mean_token_accuracy": 0.7880813479423523, "step": 3219 }, { "epoch": 2.192032686414709, "grad_norm": 1.7898420095443726, "learning_rate": 6.407778537221034e-06, "loss": 0.4162, "mean_token_accuracy": 0.8340800106525421, "step": 3220 }, { "epoch": 2.192713653387811, "grad_norm": 1.6437880992889404, "learning_rate": 6.405616843321392e-06, "loss": 0.6049, "mean_token_accuracy": 0.7948702573776245, "step": 3221 }, { "epoch": 2.1933946203609125, "grad_norm": 1.806494951248169, "learning_rate": 6.403454864104938e-06, "loss": 0.5409, "mean_token_accuracy": 0.8157455623149872, "step": 3222 }, { "epoch": 2.194075587334014, "grad_norm": 1.9237306118011475, "learning_rate": 6.4012926000105204e-06, "loss": 0.5803, "mean_token_accuracy": 0.7892161309719086, "step": 3223 }, { "epoch": 2.194756554307116, "grad_norm": 1.8788256645202637, "learning_rate": 6.399130051477041e-06, "loss": 0.3818, "mean_token_accuracy": 0.8603110313415527, "step": 3224 }, { "epoch": 2.1954375212802177, "grad_norm": 1.918839693069458, "learning_rate": 6.39696721894346e-06, "loss": 0.4813, "mean_token_accuracy": 0.8390723168849945, "step": 3225 }, { "epoch": 2.19611848825332, "grad_norm": 1.7682372331619263, "learning_rate": 6.3948041028488016e-06, "loss": 0.4841, "mean_token_accuracy": 0.8377382755279541, "step": 3226 }, { "epoch": 2.1967994552264214, "grad_norm": 1.8140887022018433, "learning_rate": 6.392640703632138e-06, "loss": 0.3921, "mean_token_accuracy": 0.8491290211677551, "step": 3227 }, { "epoch": 2.1974804221995234, "grad_norm": 1.8402220010757446, "learning_rate": 6.390477021732604e-06, "loss": 0.4798, "mean_token_accuracy": 0.8362664580345154, "step": 3228 }, { "epoch": 2.198161389172625, "grad_norm": 1.7209163904190063, "learning_rate": 6.3883130575893915e-06, "loss": 0.5527, "mean_token_accuracy": 0.7866642475128174, "step": 3229 }, { "epoch": 2.198842356145727, "grad_norm": 1.7446959018707275, "learning_rate": 6.386148811641749e-06, "loss": 0.4595, "mean_token_accuracy": 0.8291427791118622, "step": 3230 }, { "epoch": 2.1995233231188287, "grad_norm": 1.9103312492370605, "learning_rate": 6.383984284328982e-06, "loss": 0.4483, "mean_token_accuracy": 0.8466905057430267, "step": 3231 }, { "epoch": 2.2002042900919307, "grad_norm": 1.714699625968933, "learning_rate": 6.381819476090455e-06, "loss": 0.6322, "mean_token_accuracy": 0.766505777835846, "step": 3232 }, { "epoch": 2.2008852570650324, "grad_norm": 1.9179965257644653, "learning_rate": 6.379654387365587e-06, "loss": 0.4442, "mean_token_accuracy": 0.8452650010585785, "step": 3233 }, { "epoch": 2.201566224038134, "grad_norm": 1.8956204652786255, "learning_rate": 6.377489018593853e-06, "loss": 0.4426, "mean_token_accuracy": 0.8240122199058533, "step": 3234 }, { "epoch": 2.202247191011236, "grad_norm": 1.7146730422973633, "learning_rate": 6.37532337021479e-06, "loss": 0.5434, "mean_token_accuracy": 0.8214548230171204, "step": 3235 }, { "epoch": 2.2029281579843376, "grad_norm": 1.730139970779419, "learning_rate": 6.373157442667985e-06, "loss": 0.5652, "mean_token_accuracy": 0.808545857667923, "step": 3236 }, { "epoch": 2.2036091249574397, "grad_norm": 1.8528876304626465, "learning_rate": 6.3709912363930915e-06, "loss": 0.4891, "mean_token_accuracy": 0.8379809558391571, "step": 3237 }, { "epoch": 2.2042900919305413, "grad_norm": 1.9214993715286255, "learning_rate": 6.368824751829807e-06, "loss": 0.477, "mean_token_accuracy": 0.7990526258945465, "step": 3238 }, { "epoch": 2.2049710589036433, "grad_norm": 1.9458556175231934, "learning_rate": 6.3666579894178925e-06, "loss": 0.457, "mean_token_accuracy": 0.8150736689567566, "step": 3239 }, { "epoch": 2.205652025876745, "grad_norm": 1.6799123287200928, "learning_rate": 6.364490949597168e-06, "loss": 0.4591, "mean_token_accuracy": 0.8450926542282104, "step": 3240 }, { "epoch": 2.206332992849847, "grad_norm": 1.8789130449295044, "learning_rate": 6.362323632807505e-06, "loss": 0.4511, "mean_token_accuracy": 0.8310529589653015, "step": 3241 }, { "epoch": 2.2070139598229486, "grad_norm": 1.7162513732910156, "learning_rate": 6.360156039488832e-06, "loss": 0.4262, "mean_token_accuracy": 0.8542211055755615, "step": 3242 }, { "epoch": 2.2076949267960506, "grad_norm": 1.8976727724075317, "learning_rate": 6.357988170081134e-06, "loss": 0.5405, "mean_token_accuracy": 0.815014511346817, "step": 3243 }, { "epoch": 2.208375893769152, "grad_norm": 1.7598005533218384, "learning_rate": 6.3558200250244525e-06, "loss": 0.5027, "mean_token_accuracy": 0.8087017834186554, "step": 3244 }, { "epoch": 2.209056860742254, "grad_norm": 1.7754911184310913, "learning_rate": 6.353651604758888e-06, "loss": 0.5083, "mean_token_accuracy": 0.8199585676193237, "step": 3245 }, { "epoch": 2.209737827715356, "grad_norm": 1.758779764175415, "learning_rate": 6.351482909724589e-06, "loss": 0.4854, "mean_token_accuracy": 0.8301365971565247, "step": 3246 }, { "epoch": 2.2104187946884575, "grad_norm": 1.6234755516052246, "learning_rate": 6.3493139403617674e-06, "loss": 0.7753, "mean_token_accuracy": 0.743328720331192, "step": 3247 }, { "epoch": 2.2110997616615595, "grad_norm": 1.7598648071289062, "learning_rate": 6.347144697110689e-06, "loss": 0.5283, "mean_token_accuracy": 0.7728915512561798, "step": 3248 }, { "epoch": 2.211780728634661, "grad_norm": 1.717799186706543, "learning_rate": 6.344975180411669e-06, "loss": 0.4456, "mean_token_accuracy": 0.8465192019939423, "step": 3249 }, { "epoch": 2.212461695607763, "grad_norm": 1.890489101409912, "learning_rate": 6.342805390705087e-06, "loss": 0.4108, "mean_token_accuracy": 0.8376311361789703, "step": 3250 }, { "epoch": 2.213142662580865, "grad_norm": 1.8553366661071777, "learning_rate": 6.340635328431374e-06, "loss": 0.592, "mean_token_accuracy": 0.7891602516174316, "step": 3251 }, { "epoch": 2.213823629553967, "grad_norm": 1.8095835447311401, "learning_rate": 6.338464994031017e-06, "loss": 0.4545, "mean_token_accuracy": 0.8396531343460083, "step": 3252 }, { "epoch": 2.2145045965270684, "grad_norm": 1.6485645771026611, "learning_rate": 6.336294387944556e-06, "loss": 0.5619, "mean_token_accuracy": 0.7847583591938019, "step": 3253 }, { "epoch": 2.21518556350017, "grad_norm": 1.844678521156311, "learning_rate": 6.334123510612588e-06, "loss": 0.5608, "mean_token_accuracy": 0.7954529225826263, "step": 3254 }, { "epoch": 2.215866530473272, "grad_norm": 1.8402934074401855, "learning_rate": 6.331952362475765e-06, "loss": 0.5434, "mean_token_accuracy": 0.8075903356075287, "step": 3255 }, { "epoch": 2.2165474974463737, "grad_norm": 1.6656297445297241, "learning_rate": 6.329780943974796e-06, "loss": 0.4931, "mean_token_accuracy": 0.8291557729244232, "step": 3256 }, { "epoch": 2.2172284644194757, "grad_norm": 1.6730202436447144, "learning_rate": 6.3276092555504405e-06, "loss": 0.6942, "mean_token_accuracy": 0.7622471749782562, "step": 3257 }, { "epoch": 2.2179094313925773, "grad_norm": 1.8257750272750854, "learning_rate": 6.325437297643516e-06, "loss": 0.5325, "mean_token_accuracy": 0.8089892268180847, "step": 3258 }, { "epoch": 2.2185903983656794, "grad_norm": 1.8655275106430054, "learning_rate": 6.323265070694893e-06, "loss": 0.4106, "mean_token_accuracy": 0.8622884750366211, "step": 3259 }, { "epoch": 2.219271365338781, "grad_norm": 1.838085651397705, "learning_rate": 6.3210925751455e-06, "loss": 0.4347, "mean_token_accuracy": 0.8404688537120819, "step": 3260 }, { "epoch": 2.219952332311883, "grad_norm": 1.7396360635757446, "learning_rate": 6.318919811436314e-06, "loss": 0.5071, "mean_token_accuracy": 0.7990049421787262, "step": 3261 }, { "epoch": 2.2206332992849847, "grad_norm": 1.8185561895370483, "learning_rate": 6.316746780008371e-06, "loss": 0.48, "mean_token_accuracy": 0.824403703212738, "step": 3262 }, { "epoch": 2.2213142662580863, "grad_norm": 1.7653082609176636, "learning_rate": 6.314573481302761e-06, "loss": 0.5306, "mean_token_accuracy": 0.8107064366340637, "step": 3263 }, { "epoch": 2.2219952332311883, "grad_norm": 1.849289894104004, "learning_rate": 6.312399915760628e-06, "loss": 0.3872, "mean_token_accuracy": 0.8639977276325226, "step": 3264 }, { "epoch": 2.22267620020429, "grad_norm": 1.9601314067840576, "learning_rate": 6.310226083823165e-06, "loss": 0.4739, "mean_token_accuracy": 0.8287750780582428, "step": 3265 }, { "epoch": 2.223357167177392, "grad_norm": 1.7799558639526367, "learning_rate": 6.30805198593163e-06, "loss": 0.5226, "mean_token_accuracy": 0.8218235075473785, "step": 3266 }, { "epoch": 2.2240381341504936, "grad_norm": 1.8071991205215454, "learning_rate": 6.3058776225273275e-06, "loss": 0.4465, "mean_token_accuracy": 0.8224359750747681, "step": 3267 }, { "epoch": 2.2247191011235956, "grad_norm": 1.9086966514587402, "learning_rate": 6.303702994051612e-06, "loss": 0.4485, "mean_token_accuracy": 0.8462652266025543, "step": 3268 }, { "epoch": 2.225400068096697, "grad_norm": 1.9421179294586182, "learning_rate": 6.301528100945902e-06, "loss": 0.4902, "mean_token_accuracy": 0.8091205656528473, "step": 3269 }, { "epoch": 2.2260810350697993, "grad_norm": 1.7451177835464478, "learning_rate": 6.299352943651662e-06, "loss": 0.5266, "mean_token_accuracy": 0.7960959076881409, "step": 3270 }, { "epoch": 2.226762002042901, "grad_norm": 1.9405556917190552, "learning_rate": 6.297177522610415e-06, "loss": 0.4256, "mean_token_accuracy": 0.8409234881401062, "step": 3271 }, { "epoch": 2.227442969016003, "grad_norm": 1.7215107679367065, "learning_rate": 6.295001838263734e-06, "loss": 0.6131, "mean_token_accuracy": 0.7926876246929169, "step": 3272 }, { "epoch": 2.2281239359891045, "grad_norm": 1.7235358953475952, "learning_rate": 6.292825891053245e-06, "loss": 0.5115, "mean_token_accuracy": 0.8029704988002777, "step": 3273 }, { "epoch": 2.2288049029622066, "grad_norm": 1.7053221464157104, "learning_rate": 6.290649681420631e-06, "loss": 0.5909, "mean_token_accuracy": 0.7825810015201569, "step": 3274 }, { "epoch": 2.229485869935308, "grad_norm": 1.7822061777114868, "learning_rate": 6.288473209807626e-06, "loss": 0.5414, "mean_token_accuracy": 0.8028908669948578, "step": 3275 }, { "epoch": 2.2301668369084098, "grad_norm": 1.817354679107666, "learning_rate": 6.2862964766560175e-06, "loss": 0.3724, "mean_token_accuracy": 0.8656419515609741, "step": 3276 }, { "epoch": 2.230847803881512, "grad_norm": 1.7334903478622437, "learning_rate": 6.284119482407645e-06, "loss": 0.5391, "mean_token_accuracy": 0.8131906390190125, "step": 3277 }, { "epoch": 2.2315287708546134, "grad_norm": 1.7115312814712524, "learning_rate": 6.2819422275044055e-06, "loss": 0.4638, "mean_token_accuracy": 0.8138588964939117, "step": 3278 }, { "epoch": 2.2322097378277155, "grad_norm": 1.8055847883224487, "learning_rate": 6.279764712388241e-06, "loss": 0.4624, "mean_token_accuracy": 0.8339909911155701, "step": 3279 }, { "epoch": 2.232890704800817, "grad_norm": 1.799526333808899, "learning_rate": 6.277586937501152e-06, "loss": 0.455, "mean_token_accuracy": 0.8385971188545227, "step": 3280 }, { "epoch": 2.233571671773919, "grad_norm": 1.8357056379318237, "learning_rate": 6.2754089032851925e-06, "loss": 0.5586, "mean_token_accuracy": 0.8146968185901642, "step": 3281 }, { "epoch": 2.2342526387470207, "grad_norm": 1.740854263305664, "learning_rate": 6.273230610182467e-06, "loss": 0.6321, "mean_token_accuracy": 0.7957105934619904, "step": 3282 }, { "epoch": 2.234933605720123, "grad_norm": 1.7382131814956665, "learning_rate": 6.271052058635132e-06, "loss": 0.6893, "mean_token_accuracy": 0.7491850256919861, "step": 3283 }, { "epoch": 2.2356145726932244, "grad_norm": 1.8025717735290527, "learning_rate": 6.2688732490853946e-06, "loss": 0.5502, "mean_token_accuracy": 0.8079434633255005, "step": 3284 }, { "epoch": 2.236295539666326, "grad_norm": 1.8851038217544556, "learning_rate": 6.26669418197552e-06, "loss": 0.5324, "mean_token_accuracy": 0.7782066464424133, "step": 3285 }, { "epoch": 2.236976506639428, "grad_norm": 1.899906873703003, "learning_rate": 6.264514857747823e-06, "loss": 0.4949, "mean_token_accuracy": 0.8209265768527985, "step": 3286 }, { "epoch": 2.2376574736125296, "grad_norm": 1.824537992477417, "learning_rate": 6.262335276844668e-06, "loss": 0.5793, "mean_token_accuracy": 0.7638862729072571, "step": 3287 }, { "epoch": 2.2383384405856317, "grad_norm": 1.7053614854812622, "learning_rate": 6.260155439708474e-06, "loss": 0.539, "mean_token_accuracy": 0.803756445646286, "step": 3288 }, { "epoch": 2.2390194075587333, "grad_norm": 1.8786301612854004, "learning_rate": 6.2579753467817125e-06, "loss": 0.5538, "mean_token_accuracy": 0.8198028206825256, "step": 3289 }, { "epoch": 2.2397003745318353, "grad_norm": 1.7939375638961792, "learning_rate": 6.255794998506905e-06, "loss": 0.4828, "mean_token_accuracy": 0.8254357278347015, "step": 3290 }, { "epoch": 2.240381341504937, "grad_norm": 1.962662696838379, "learning_rate": 6.253614395326628e-06, "loss": 0.3277, "mean_token_accuracy": 0.8907083570957184, "step": 3291 }, { "epoch": 2.241062308478039, "grad_norm": 1.4950110912322998, "learning_rate": 6.251433537683505e-06, "loss": 0.7661, "mean_token_accuracy": 0.7551760971546173, "step": 3292 }, { "epoch": 2.2417432754511406, "grad_norm": 1.7765365839004517, "learning_rate": 6.249252426020217e-06, "loss": 0.4517, "mean_token_accuracy": 0.8366592526435852, "step": 3293 }, { "epoch": 2.242424242424242, "grad_norm": 1.9369595050811768, "learning_rate": 6.247071060779489e-06, "loss": 0.4667, "mean_token_accuracy": 0.8297876417636871, "step": 3294 }, { "epoch": 2.2431052093973443, "grad_norm": 1.7783509492874146, "learning_rate": 6.244889442404105e-06, "loss": 0.4639, "mean_token_accuracy": 0.8334134519100189, "step": 3295 }, { "epoch": 2.243786176370446, "grad_norm": 1.9206454753875732, "learning_rate": 6.2427075713368955e-06, "loss": 0.4086, "mean_token_accuracy": 0.8599231541156769, "step": 3296 }, { "epoch": 2.244467143343548, "grad_norm": 1.8590353727340698, "learning_rate": 6.240525448020745e-06, "loss": 0.4876, "mean_token_accuracy": 0.8276002109050751, "step": 3297 }, { "epoch": 2.2451481103166495, "grad_norm": 1.6146117448806763, "learning_rate": 6.23834307289859e-06, "loss": 0.7168, "mean_token_accuracy": 0.7667668163776398, "step": 3298 }, { "epoch": 2.2458290772897516, "grad_norm": 1.9086894989013672, "learning_rate": 6.236160446413412e-06, "loss": 0.4552, "mean_token_accuracy": 0.8443198204040527, "step": 3299 }, { "epoch": 2.246510044262853, "grad_norm": 1.7560211420059204, "learning_rate": 6.23397756900825e-06, "loss": 0.5324, "mean_token_accuracy": 0.8037911951541901, "step": 3300 }, { "epoch": 2.247191011235955, "grad_norm": 1.5564569234848022, "learning_rate": 6.231794441126194e-06, "loss": 0.8221, "mean_token_accuracy": 0.7431586384773254, "step": 3301 }, { "epoch": 2.247871978209057, "grad_norm": 1.6464227437973022, "learning_rate": 6.229611063210379e-06, "loss": 0.6845, "mean_token_accuracy": 0.7485035061836243, "step": 3302 }, { "epoch": 2.248552945182159, "grad_norm": 1.8424423933029175, "learning_rate": 6.227427435703997e-06, "loss": 0.4372, "mean_token_accuracy": 0.8471263349056244, "step": 3303 }, { "epoch": 2.2492339121552605, "grad_norm": 1.6386919021606445, "learning_rate": 6.225243559050284e-06, "loss": 0.525, "mean_token_accuracy": 0.8305919468402863, "step": 3304 }, { "epoch": 2.249914879128362, "grad_norm": 1.7140580415725708, "learning_rate": 6.223059433692538e-06, "loss": 0.5099, "mean_token_accuracy": 0.8151004910469055, "step": 3305 }, { "epoch": 2.250595846101464, "grad_norm": 1.798925518989563, "learning_rate": 6.220875060074092e-06, "loss": 0.5035, "mean_token_accuracy": 0.7945258617401123, "step": 3306 }, { "epoch": 2.2512768130745657, "grad_norm": 1.7308862209320068, "learning_rate": 6.2186904386383415e-06, "loss": 0.4649, "mean_token_accuracy": 0.813698947429657, "step": 3307 }, { "epoch": 2.251957780047668, "grad_norm": 1.7498010396957397, "learning_rate": 6.21650556982873e-06, "loss": 0.5627, "mean_token_accuracy": 0.7889959514141083, "step": 3308 }, { "epoch": 2.2526387470207694, "grad_norm": 1.9488532543182373, "learning_rate": 6.214320454088745e-06, "loss": 0.589, "mean_token_accuracy": 0.7785849571228027, "step": 3309 }, { "epoch": 2.2533197139938714, "grad_norm": 1.9707499742507935, "learning_rate": 6.21213509186193e-06, "loss": 0.4185, "mean_token_accuracy": 0.8436129987239838, "step": 3310 }, { "epoch": 2.254000680966973, "grad_norm": 1.6839250326156616, "learning_rate": 6.209949483591881e-06, "loss": 0.7227, "mean_token_accuracy": 0.7482742965221405, "step": 3311 }, { "epoch": 2.254681647940075, "grad_norm": 1.7522445917129517, "learning_rate": 6.2077636297222355e-06, "loss": 0.581, "mean_token_accuracy": 0.7862367033958435, "step": 3312 }, { "epoch": 2.2553626149131767, "grad_norm": 1.8092833757400513, "learning_rate": 6.205577530696687e-06, "loss": 0.5113, "mean_token_accuracy": 0.8222593367099762, "step": 3313 }, { "epoch": 2.2560435818862787, "grad_norm": 1.9302878379821777, "learning_rate": 6.203391186958977e-06, "loss": 0.3555, "mean_token_accuracy": 0.879440039396286, "step": 3314 }, { "epoch": 2.2567245488593803, "grad_norm": 1.801163911819458, "learning_rate": 6.201204598952897e-06, "loss": 0.4883, "mean_token_accuracy": 0.8291152119636536, "step": 3315 }, { "epoch": 2.257405515832482, "grad_norm": 1.7127882242202759, "learning_rate": 6.199017767122289e-06, "loss": 0.4803, "mean_token_accuracy": 0.8219919800758362, "step": 3316 }, { "epoch": 2.258086482805584, "grad_norm": 1.8091559410095215, "learning_rate": 6.196830691911041e-06, "loss": 0.4855, "mean_token_accuracy": 0.8018249869346619, "step": 3317 }, { "epoch": 2.2587674497786856, "grad_norm": 1.9486396312713623, "learning_rate": 6.194643373763096e-06, "loss": 0.4126, "mean_token_accuracy": 0.8592789769172668, "step": 3318 }, { "epoch": 2.2594484167517876, "grad_norm": 1.877825379371643, "learning_rate": 6.19245581312244e-06, "loss": 0.4935, "mean_token_accuracy": 0.8389545381069183, "step": 3319 }, { "epoch": 2.2601293837248893, "grad_norm": 1.7240279912948608, "learning_rate": 6.190268010433114e-06, "loss": 0.5449, "mean_token_accuracy": 0.8152183592319489, "step": 3320 }, { "epoch": 2.2608103506979913, "grad_norm": 1.8851752281188965, "learning_rate": 6.1880799661392036e-06, "loss": 0.391, "mean_token_accuracy": 0.8535477519035339, "step": 3321 }, { "epoch": 2.261491317671093, "grad_norm": 1.8397454023361206, "learning_rate": 6.1858916806848454e-06, "loss": 0.4299, "mean_token_accuracy": 0.8475962579250336, "step": 3322 }, { "epoch": 2.262172284644195, "grad_norm": 1.8792883157730103, "learning_rate": 6.183703154514228e-06, "loss": 0.4348, "mean_token_accuracy": 0.8467669785022736, "step": 3323 }, { "epoch": 2.2628532516172966, "grad_norm": 1.8245497941970825, "learning_rate": 6.181514388071582e-06, "loss": 0.4895, "mean_token_accuracy": 0.816152036190033, "step": 3324 }, { "epoch": 2.263534218590398, "grad_norm": 1.593917727470398, "learning_rate": 6.1793253818011914e-06, "loss": 0.5782, "mean_token_accuracy": 0.7946918308734894, "step": 3325 }, { "epoch": 2.2642151855635, "grad_norm": 1.7978758811950684, "learning_rate": 6.177136136147388e-06, "loss": 0.5038, "mean_token_accuracy": 0.8295866847038269, "step": 3326 }, { "epoch": 2.264896152536602, "grad_norm": 1.7197799682617188, "learning_rate": 6.174946651554554e-06, "loss": 0.5228, "mean_token_accuracy": 0.8186790943145752, "step": 3327 }, { "epoch": 2.265577119509704, "grad_norm": 1.751043438911438, "learning_rate": 6.1727569284671175e-06, "loss": 0.5491, "mean_token_accuracy": 0.809089720249176, "step": 3328 }, { "epoch": 2.2662580864828055, "grad_norm": 1.847988486289978, "learning_rate": 6.170566967329554e-06, "loss": 0.3702, "mean_token_accuracy": 0.8608919680118561, "step": 3329 }, { "epoch": 2.2669390534559075, "grad_norm": 1.6872472763061523, "learning_rate": 6.168376768586391e-06, "loss": 0.5812, "mean_token_accuracy": 0.807163417339325, "step": 3330 }, { "epoch": 2.267620020429009, "grad_norm": 1.7331063747406006, "learning_rate": 6.166186332682203e-06, "loss": 0.6215, "mean_token_accuracy": 0.7915039956569672, "step": 3331 }, { "epoch": 2.268300987402111, "grad_norm": 1.8808649778366089, "learning_rate": 6.1639956600616095e-06, "loss": 0.5142, "mean_token_accuracy": 0.8148282170295715, "step": 3332 }, { "epoch": 2.2689819543752128, "grad_norm": 1.6977171897888184, "learning_rate": 6.161804751169282e-06, "loss": 0.4226, "mean_token_accuracy": 0.8589349985122681, "step": 3333 }, { "epoch": 2.2696629213483144, "grad_norm": 1.9283397197723389, "learning_rate": 6.159613606449939e-06, "loss": 0.5371, "mean_token_accuracy": 0.8114591240882874, "step": 3334 }, { "epoch": 2.2703438883214164, "grad_norm": 1.7654483318328857, "learning_rate": 6.157422226348346e-06, "loss": 0.595, "mean_token_accuracy": 0.7981814742088318, "step": 3335 }, { "epoch": 2.271024855294518, "grad_norm": 1.8151544332504272, "learning_rate": 6.155230611309316e-06, "loss": 0.3987, "mean_token_accuracy": 0.8581008613109589, "step": 3336 }, { "epoch": 2.27170582226762, "grad_norm": 1.8721914291381836, "learning_rate": 6.153038761777711e-06, "loss": 0.3914, "mean_token_accuracy": 0.8698821067810059, "step": 3337 }, { "epoch": 2.2723867892407217, "grad_norm": 1.9332234859466553, "learning_rate": 6.150846678198441e-06, "loss": 0.4016, "mean_token_accuracy": 0.8536344170570374, "step": 3338 }, { "epoch": 2.2730677562138237, "grad_norm": 1.8210368156433105, "learning_rate": 6.1486543610164604e-06, "loss": 0.5401, "mean_token_accuracy": 0.811759889125824, "step": 3339 }, { "epoch": 2.2737487231869253, "grad_norm": 1.734622836112976, "learning_rate": 6.146461810676774e-06, "loss": 0.6379, "mean_token_accuracy": 0.7612916231155396, "step": 3340 }, { "epoch": 2.2744296901600274, "grad_norm": 1.7900309562683105, "learning_rate": 6.144269027624431e-06, "loss": 0.5367, "mean_token_accuracy": 0.7966537773609161, "step": 3341 }, { "epoch": 2.275110657133129, "grad_norm": 1.653672695159912, "learning_rate": 6.1420760123045344e-06, "loss": 0.5519, "mean_token_accuracy": 0.8040771186351776, "step": 3342 }, { "epoch": 2.275791624106231, "grad_norm": 1.7106190919876099, "learning_rate": 6.139882765162226e-06, "loss": 0.4967, "mean_token_accuracy": 0.8307057321071625, "step": 3343 }, { "epoch": 2.2764725910793326, "grad_norm": 1.6467031240463257, "learning_rate": 6.137689286642701e-06, "loss": 0.5313, "mean_token_accuracy": 0.7823669016361237, "step": 3344 }, { "epoch": 2.2771535580524347, "grad_norm": 2.0084123611450195, "learning_rate": 6.135495577191196e-06, "loss": 0.4424, "mean_token_accuracy": 0.8596700727939606, "step": 3345 }, { "epoch": 2.2778345250255363, "grad_norm": 1.7782691717147827, "learning_rate": 6.1333016372529995e-06, "loss": 0.4227, "mean_token_accuracy": 0.8365634381771088, "step": 3346 }, { "epoch": 2.278515491998638, "grad_norm": 1.9821839332580566, "learning_rate": 6.131107467273445e-06, "loss": 0.4646, "mean_token_accuracy": 0.8352358043193817, "step": 3347 }, { "epoch": 2.27919645897174, "grad_norm": 1.7807027101516724, "learning_rate": 6.128913067697911e-06, "loss": 0.4943, "mean_token_accuracy": 0.7976981103420258, "step": 3348 }, { "epoch": 2.2798774259448416, "grad_norm": 1.7508631944656372, "learning_rate": 6.126718438971826e-06, "loss": 0.407, "mean_token_accuracy": 0.8503564298152924, "step": 3349 }, { "epoch": 2.2805583929179436, "grad_norm": 1.815429449081421, "learning_rate": 6.124523581540662e-06, "loss": 0.5922, "mean_token_accuracy": 0.8015924990177155, "step": 3350 }, { "epoch": 2.281239359891045, "grad_norm": 1.8634933233261108, "learning_rate": 6.122328495849939e-06, "loss": 0.4537, "mean_token_accuracy": 0.848136693239212, "step": 3351 }, { "epoch": 2.2819203268641473, "grad_norm": 1.8432629108428955, "learning_rate": 6.1201331823452205e-06, "loss": 0.5684, "mean_token_accuracy": 0.7914450168609619, "step": 3352 }, { "epoch": 2.282601293837249, "grad_norm": 1.7415144443511963, "learning_rate": 6.117937641472122e-06, "loss": 0.5429, "mean_token_accuracy": 0.8138529658317566, "step": 3353 }, { "epoch": 2.283282260810351, "grad_norm": 1.8414549827575684, "learning_rate": 6.115741873676299e-06, "loss": 0.489, "mean_token_accuracy": 0.8202427327632904, "step": 3354 }, { "epoch": 2.2839632277834525, "grad_norm": 1.7602084875106812, "learning_rate": 6.113545879403457e-06, "loss": 0.4689, "mean_token_accuracy": 0.841525673866272, "step": 3355 }, { "epoch": 2.284644194756554, "grad_norm": 1.780708909034729, "learning_rate": 6.111349659099347e-06, "loss": 0.4302, "mean_token_accuracy": 0.8560822606086731, "step": 3356 }, { "epoch": 2.285325161729656, "grad_norm": 1.7198655605316162, "learning_rate": 6.109153213209764e-06, "loss": 0.5286, "mean_token_accuracy": 0.827396035194397, "step": 3357 }, { "epoch": 2.2860061287027578, "grad_norm": 1.8227308988571167, "learning_rate": 6.106956542180551e-06, "loss": 0.5259, "mean_token_accuracy": 0.8245325088500977, "step": 3358 }, { "epoch": 2.28668709567586, "grad_norm": 1.8276032209396362, "learning_rate": 6.104759646457594e-06, "loss": 0.484, "mean_token_accuracy": 0.827955573797226, "step": 3359 }, { "epoch": 2.2873680626489614, "grad_norm": 1.7985774278640747, "learning_rate": 6.102562526486828e-06, "loss": 0.641, "mean_token_accuracy": 0.7834445834159851, "step": 3360 }, { "epoch": 2.2880490296220635, "grad_norm": 1.7931421995162964, "learning_rate": 6.100365182714232e-06, "loss": 0.4839, "mean_token_accuracy": 0.8095361590385437, "step": 3361 }, { "epoch": 2.288729996595165, "grad_norm": 1.8743878602981567, "learning_rate": 6.098167615585826e-06, "loss": 0.5001, "mean_token_accuracy": 0.8438660502433777, "step": 3362 }, { "epoch": 2.289410963568267, "grad_norm": 1.6992241144180298, "learning_rate": 6.0959698255476854e-06, "loss": 0.4341, "mean_token_accuracy": 0.842191070318222, "step": 3363 }, { "epoch": 2.2900919305413687, "grad_norm": 1.8270041942596436, "learning_rate": 6.093771813045921e-06, "loss": 0.493, "mean_token_accuracy": 0.8111929595470428, "step": 3364 }, { "epoch": 2.2907728975144703, "grad_norm": 1.7695554494857788, "learning_rate": 6.091573578526695e-06, "loss": 0.5039, "mean_token_accuracy": 0.8159421682357788, "step": 3365 }, { "epoch": 2.2914538644875724, "grad_norm": 1.8152040243148804, "learning_rate": 6.089375122436211e-06, "loss": 0.4998, "mean_token_accuracy": 0.7891778647899628, "step": 3366 }, { "epoch": 2.292134831460674, "grad_norm": 1.7986692190170288, "learning_rate": 6.087176445220718e-06, "loss": 0.407, "mean_token_accuracy": 0.84855917096138, "step": 3367 }, { "epoch": 2.292815798433776, "grad_norm": 1.7287065982818604, "learning_rate": 6.0849775473265136e-06, "loss": 0.6123, "mean_token_accuracy": 0.8041750490665436, "step": 3368 }, { "epoch": 2.2934967654068776, "grad_norm": 1.744943618774414, "learning_rate": 6.082778429199937e-06, "loss": 0.6378, "mean_token_accuracy": 0.7677757143974304, "step": 3369 }, { "epoch": 2.2941777323799797, "grad_norm": 1.8868619203567505, "learning_rate": 6.08057909128737e-06, "loss": 0.5835, "mean_token_accuracy": 0.7693026959896088, "step": 3370 }, { "epoch": 2.2948586993530813, "grad_norm": 1.5730252265930176, "learning_rate": 6.078379534035244e-06, "loss": 0.5536, "mean_token_accuracy": 0.8174395561218262, "step": 3371 }, { "epoch": 2.2955396663261833, "grad_norm": 1.8754907846450806, "learning_rate": 6.0761797578900306e-06, "loss": 0.6099, "mean_token_accuracy": 0.8009621202945709, "step": 3372 }, { "epoch": 2.296220633299285, "grad_norm": 1.8223477602005005, "learning_rate": 6.07397976329825e-06, "loss": 0.425, "mean_token_accuracy": 0.8428741991519928, "step": 3373 }, { "epoch": 2.2969016002723865, "grad_norm": 1.7020151615142822, "learning_rate": 6.0717795507064625e-06, "loss": 0.6172, "mean_token_accuracy": 0.7711336314678192, "step": 3374 }, { "epoch": 2.2975825672454886, "grad_norm": 1.895175814628601, "learning_rate": 6.069579120561276e-06, "loss": 0.4868, "mean_token_accuracy": 0.832250714302063, "step": 3375 }, { "epoch": 2.2982635342185906, "grad_norm": 1.6885908842086792, "learning_rate": 6.067378473309341e-06, "loss": 0.5938, "mean_token_accuracy": 0.780136376619339, "step": 3376 }, { "epoch": 2.2989445011916922, "grad_norm": 1.8405263423919678, "learning_rate": 6.06517760939735e-06, "loss": 0.4823, "mean_token_accuracy": 0.8289546370506287, "step": 3377 }, { "epoch": 2.299625468164794, "grad_norm": 1.7172366380691528, "learning_rate": 6.062976529272046e-06, "loss": 0.5788, "mean_token_accuracy": 0.8098002970218658, "step": 3378 }, { "epoch": 2.300306435137896, "grad_norm": 1.8223122358322144, "learning_rate": 6.060775233380208e-06, "loss": 0.4056, "mean_token_accuracy": 0.8477878272533417, "step": 3379 }, { "epoch": 2.3009874021109975, "grad_norm": 1.8088181018829346, "learning_rate": 6.058573722168664e-06, "loss": 0.5779, "mean_token_accuracy": 0.7897052466869354, "step": 3380 }, { "epoch": 2.3016683690840996, "grad_norm": 1.6852887868881226, "learning_rate": 6.056371996084283e-06, "loss": 0.4887, "mean_token_accuracy": 0.8308826386928558, "step": 3381 }, { "epoch": 2.302349336057201, "grad_norm": 1.7164808511734009, "learning_rate": 6.054170055573978e-06, "loss": 0.5994, "mean_token_accuracy": 0.7858304679393768, "step": 3382 }, { "epoch": 2.303030303030303, "grad_norm": 1.7578258514404297, "learning_rate": 6.05196790108471e-06, "loss": 0.4959, "mean_token_accuracy": 0.8336486220359802, "step": 3383 }, { "epoch": 2.303711270003405, "grad_norm": 1.8836132287979126, "learning_rate": 6.049765533063476e-06, "loss": 0.5221, "mean_token_accuracy": 0.8243390023708344, "step": 3384 }, { "epoch": 2.304392236976507, "grad_norm": 1.8644359111785889, "learning_rate": 6.047562951957322e-06, "loss": 0.4641, "mean_token_accuracy": 0.8383236229419708, "step": 3385 }, { "epoch": 2.3050732039496085, "grad_norm": 1.7280751466751099, "learning_rate": 6.045360158213335e-06, "loss": 0.4424, "mean_token_accuracy": 0.8339874446392059, "step": 3386 }, { "epoch": 2.30575417092271, "grad_norm": 1.6918400526046753, "learning_rate": 6.043157152278645e-06, "loss": 0.5552, "mean_token_accuracy": 0.800993025302887, "step": 3387 }, { "epoch": 2.306435137895812, "grad_norm": 1.6826804876327515, "learning_rate": 6.040953934600425e-06, "loss": 0.6643, "mean_token_accuracy": 0.7798753380775452, "step": 3388 }, { "epoch": 2.3071161048689137, "grad_norm": 1.6557353734970093, "learning_rate": 6.038750505625891e-06, "loss": 0.6019, "mean_token_accuracy": 0.7971195876598358, "step": 3389 }, { "epoch": 2.3077970718420158, "grad_norm": 1.79705011844635, "learning_rate": 6.036546865802306e-06, "loss": 0.5841, "mean_token_accuracy": 0.7759333252906799, "step": 3390 }, { "epoch": 2.3084780388151174, "grad_norm": 1.7643839120864868, "learning_rate": 6.034343015576969e-06, "loss": 0.4855, "mean_token_accuracy": 0.8356196880340576, "step": 3391 }, { "epoch": 2.3091590057882194, "grad_norm": 1.74996018409729, "learning_rate": 6.032138955397224e-06, "loss": 0.5906, "mean_token_accuracy": 0.7878971993923187, "step": 3392 }, { "epoch": 2.309839972761321, "grad_norm": 1.975338339805603, "learning_rate": 6.029934685710462e-06, "loss": 0.4511, "mean_token_accuracy": 0.8427252173423767, "step": 3393 }, { "epoch": 2.310520939734423, "grad_norm": 1.8646013736724854, "learning_rate": 6.027730206964109e-06, "loss": 0.4902, "mean_token_accuracy": 0.8174355924129486, "step": 3394 }, { "epoch": 2.3112019067075247, "grad_norm": 1.7214957475662231, "learning_rate": 6.025525519605643e-06, "loss": 0.3985, "mean_token_accuracy": 0.8632234036922455, "step": 3395 }, { "epoch": 2.3118828736806263, "grad_norm": 1.7024873495101929, "learning_rate": 6.023320624082573e-06, "loss": 0.4946, "mean_token_accuracy": 0.8221939206123352, "step": 3396 }, { "epoch": 2.3125638406537283, "grad_norm": 1.7698371410369873, "learning_rate": 6.0211155208424575e-06, "loss": 0.5907, "mean_token_accuracy": 0.7946065664291382, "step": 3397 }, { "epoch": 2.31324480762683, "grad_norm": 1.6808823347091675, "learning_rate": 6.018910210332899e-06, "loss": 0.4872, "mean_token_accuracy": 0.8271128237247467, "step": 3398 }, { "epoch": 2.313925774599932, "grad_norm": 1.6484918594360352, "learning_rate": 6.016704693001535e-06, "loss": 0.571, "mean_token_accuracy": 0.7758117318153381, "step": 3399 }, { "epoch": 2.3146067415730336, "grad_norm": 1.862742304801941, "learning_rate": 6.014498969296051e-06, "loss": 0.4992, "mean_token_accuracy": 0.8199767470359802, "step": 3400 }, { "epoch": 2.3152877085461356, "grad_norm": 1.95926034450531, "learning_rate": 6.012293039664171e-06, "loss": 0.48, "mean_token_accuracy": 0.8371191918849945, "step": 3401 }, { "epoch": 2.3159686755192372, "grad_norm": 1.7734631299972534, "learning_rate": 6.010086904553663e-06, "loss": 0.5438, "mean_token_accuracy": 0.8160054683685303, "step": 3402 }, { "epoch": 2.3166496424923393, "grad_norm": 2.072950839996338, "learning_rate": 6.007880564412335e-06, "loss": 0.5071, "mean_token_accuracy": 0.7766198217868805, "step": 3403 }, { "epoch": 2.317330609465441, "grad_norm": 2.011234760284424, "learning_rate": 6.005674019688037e-06, "loss": 0.3445, "mean_token_accuracy": 0.8727047145366669, "step": 3404 }, { "epoch": 2.3180115764385425, "grad_norm": 1.7107768058776855, "learning_rate": 6.003467270828662e-06, "loss": 0.4891, "mean_token_accuracy": 0.8375480473041534, "step": 3405 }, { "epoch": 2.3186925434116445, "grad_norm": 1.7792770862579346, "learning_rate": 6.001260318282142e-06, "loss": 0.4786, "mean_token_accuracy": 0.8335727155208588, "step": 3406 }, { "epoch": 2.319373510384746, "grad_norm": 1.8117051124572754, "learning_rate": 5.999053162496453e-06, "loss": 0.4845, "mean_token_accuracy": 0.8132218420505524, "step": 3407 }, { "epoch": 2.320054477357848, "grad_norm": 1.8495659828186035, "learning_rate": 5.9968458039196086e-06, "loss": 0.484, "mean_token_accuracy": 0.835676908493042, "step": 3408 }, { "epoch": 2.32073544433095, "grad_norm": 1.8223726749420166, "learning_rate": 5.994638242999669e-06, "loss": 0.4543, "mean_token_accuracy": 0.804435133934021, "step": 3409 }, { "epoch": 2.321416411304052, "grad_norm": 1.750443696975708, "learning_rate": 5.99243048018473e-06, "loss": 0.5162, "mean_token_accuracy": 0.8232167661190033, "step": 3410 }, { "epoch": 2.3220973782771535, "grad_norm": 1.7721977233886719, "learning_rate": 5.99022251592293e-06, "loss": 0.6168, "mean_token_accuracy": 0.7920122742652893, "step": 3411 }, { "epoch": 2.3227783452502555, "grad_norm": 1.8882712125778198, "learning_rate": 5.988014350662451e-06, "loss": 0.4262, "mean_token_accuracy": 0.8368774950504303, "step": 3412 }, { "epoch": 2.323459312223357, "grad_norm": 1.7020468711853027, "learning_rate": 5.985805984851514e-06, "loss": 0.5986, "mean_token_accuracy": 0.7616225481033325, "step": 3413 }, { "epoch": 2.324140279196459, "grad_norm": 1.7064250707626343, "learning_rate": 5.983597418938378e-06, "loss": 0.5512, "mean_token_accuracy": 0.7937167882919312, "step": 3414 }, { "epoch": 2.3248212461695608, "grad_norm": 1.8485143184661865, "learning_rate": 5.981388653371347e-06, "loss": 0.4408, "mean_token_accuracy": 0.8360830247402191, "step": 3415 }, { "epoch": 2.325502213142663, "grad_norm": 1.8013330698013306, "learning_rate": 5.9791796885987645e-06, "loss": 0.6337, "mean_token_accuracy": 0.7744005918502808, "step": 3416 }, { "epoch": 2.3261831801157644, "grad_norm": 1.7272751331329346, "learning_rate": 5.976970525069011e-06, "loss": 0.5438, "mean_token_accuracy": 0.7983367443084717, "step": 3417 }, { "epoch": 2.326864147088866, "grad_norm": 1.6467136144638062, "learning_rate": 5.974761163230511e-06, "loss": 0.7082, "mean_token_accuracy": 0.7691249251365662, "step": 3418 }, { "epoch": 2.327545114061968, "grad_norm": 1.7431541681289673, "learning_rate": 5.972551603531728e-06, "loss": 0.5157, "mean_token_accuracy": 0.8169008493423462, "step": 3419 }, { "epoch": 2.3282260810350697, "grad_norm": 1.5827593803405762, "learning_rate": 5.970341846421168e-06, "loss": 0.473, "mean_token_accuracy": 0.8324446678161621, "step": 3420 }, { "epoch": 2.3289070480081717, "grad_norm": 1.6509591341018677, "learning_rate": 5.968131892347372e-06, "loss": 0.5488, "mean_token_accuracy": 0.7671186625957489, "step": 3421 }, { "epoch": 2.3295880149812733, "grad_norm": 1.8671128749847412, "learning_rate": 5.9659217417589245e-06, "loss": 0.5032, "mean_token_accuracy": 0.8420043289661407, "step": 3422 }, { "epoch": 2.3302689819543754, "grad_norm": 1.811260461807251, "learning_rate": 5.96371139510445e-06, "loss": 0.5603, "mean_token_accuracy": 0.8082449734210968, "step": 3423 }, { "epoch": 2.330949948927477, "grad_norm": 1.8980376720428467, "learning_rate": 5.961500852832613e-06, "loss": 0.5083, "mean_token_accuracy": 0.823774516582489, "step": 3424 }, { "epoch": 2.331630915900579, "grad_norm": 1.7575809955596924, "learning_rate": 5.959290115392112e-06, "loss": 0.4818, "mean_token_accuracy": 0.8197359442710876, "step": 3425 }, { "epoch": 2.3323118828736806, "grad_norm": 1.6980211734771729, "learning_rate": 5.957079183231696e-06, "loss": 0.609, "mean_token_accuracy": 0.786464273929596, "step": 3426 }, { "epoch": 2.3329928498467822, "grad_norm": 1.9083874225616455, "learning_rate": 5.954868056800145e-06, "loss": 0.451, "mean_token_accuracy": 0.8334582149982452, "step": 3427 }, { "epoch": 2.3336738168198843, "grad_norm": 1.8647428750991821, "learning_rate": 5.952656736546281e-06, "loss": 0.5853, "mean_token_accuracy": 0.7947478592395782, "step": 3428 }, { "epoch": 2.334354783792986, "grad_norm": 1.6391313076019287, "learning_rate": 5.9504452229189636e-06, "loss": 0.5823, "mean_token_accuracy": 0.7948986291885376, "step": 3429 }, { "epoch": 2.335035750766088, "grad_norm": 1.8821738958358765, "learning_rate": 5.948233516367093e-06, "loss": 0.4915, "mean_token_accuracy": 0.8167249262332916, "step": 3430 }, { "epoch": 2.3357167177391895, "grad_norm": 1.8203309774398804, "learning_rate": 5.946021617339613e-06, "loss": 0.4226, "mean_token_accuracy": 0.8546670973300934, "step": 3431 }, { "epoch": 2.3363976847122916, "grad_norm": 1.8181352615356445, "learning_rate": 5.9438095262855e-06, "loss": 0.4875, "mean_token_accuracy": 0.8494873046875, "step": 3432 }, { "epoch": 2.337078651685393, "grad_norm": 1.667148232460022, "learning_rate": 5.941597243653769e-06, "loss": 0.5389, "mean_token_accuracy": 0.8111377060413361, "step": 3433 }, { "epoch": 2.3377596186584952, "grad_norm": 1.697427749633789, "learning_rate": 5.9393847698934795e-06, "loss": 0.6728, "mean_token_accuracy": 0.7832036912441254, "step": 3434 }, { "epoch": 2.338440585631597, "grad_norm": 1.652578592300415, "learning_rate": 5.9371721054537255e-06, "loss": 0.569, "mean_token_accuracy": 0.7782399356365204, "step": 3435 }, { "epoch": 2.3391215526046985, "grad_norm": 1.773943543434143, "learning_rate": 5.934959250783643e-06, "loss": 0.5502, "mean_token_accuracy": 0.7888185083866119, "step": 3436 }, { "epoch": 2.3398025195778005, "grad_norm": 1.7182471752166748, "learning_rate": 5.932746206332403e-06, "loss": 0.5926, "mean_token_accuracy": 0.7756296694278717, "step": 3437 }, { "epoch": 2.340483486550902, "grad_norm": 1.8080095052719116, "learning_rate": 5.9305329725492144e-06, "loss": 0.3269, "mean_token_accuracy": 0.8837346732616425, "step": 3438 }, { "epoch": 2.341164453524004, "grad_norm": 1.8488436937332153, "learning_rate": 5.928319549883333e-06, "loss": 0.5109, "mean_token_accuracy": 0.8317335546016693, "step": 3439 }, { "epoch": 2.3418454204971058, "grad_norm": 1.6763824224472046, "learning_rate": 5.926105938784039e-06, "loss": 0.5335, "mean_token_accuracy": 0.7983836233615875, "step": 3440 }, { "epoch": 2.342526387470208, "grad_norm": 1.7909811735153198, "learning_rate": 5.923892139700665e-06, "loss": 0.5168, "mean_token_accuracy": 0.8025989234447479, "step": 3441 }, { "epoch": 2.3432073544433094, "grad_norm": 1.8505922555923462, "learning_rate": 5.921678153082573e-06, "loss": 0.4843, "mean_token_accuracy": 0.8128517270088196, "step": 3442 }, { "epoch": 2.3438883214164115, "grad_norm": 1.7703087329864502, "learning_rate": 5.9194639793791655e-06, "loss": 0.5682, "mean_token_accuracy": 0.7893977761268616, "step": 3443 }, { "epoch": 2.344569288389513, "grad_norm": 2.1375155448913574, "learning_rate": 5.917249619039882e-06, "loss": 0.3721, "mean_token_accuracy": 0.8678562939167023, "step": 3444 }, { "epoch": 2.3452502553626147, "grad_norm": 1.7914232015609741, "learning_rate": 5.915035072514202e-06, "loss": 0.5249, "mean_token_accuracy": 0.8123494684696198, "step": 3445 }, { "epoch": 2.3459312223357167, "grad_norm": 1.8261620998382568, "learning_rate": 5.912820340251641e-06, "loss": 0.5481, "mean_token_accuracy": 0.8057699799537659, "step": 3446 }, { "epoch": 2.3466121893088188, "grad_norm": 1.839717149734497, "learning_rate": 5.910605422701753e-06, "loss": 0.5274, "mean_token_accuracy": 0.802015870809555, "step": 3447 }, { "epoch": 2.3472931562819204, "grad_norm": 1.8094511032104492, "learning_rate": 5.908390320314128e-06, "loss": 0.4946, "mean_token_accuracy": 0.8255929946899414, "step": 3448 }, { "epoch": 2.347974123255022, "grad_norm": 1.7990772724151611, "learning_rate": 5.906175033538397e-06, "loss": 0.5149, "mean_token_accuracy": 0.8129796981811523, "step": 3449 }, { "epoch": 2.348655090228124, "grad_norm": 1.888479471206665, "learning_rate": 5.903959562824226e-06, "loss": 0.6017, "mean_token_accuracy": 0.7720218896865845, "step": 3450 }, { "epoch": 2.3493360572012256, "grad_norm": 1.7859262228012085, "learning_rate": 5.901743908621318e-06, "loss": 0.4865, "mean_token_accuracy": 0.8214544057846069, "step": 3451 }, { "epoch": 2.3500170241743277, "grad_norm": 1.7177319526672363, "learning_rate": 5.899528071379413e-06, "loss": 0.4808, "mean_token_accuracy": 0.8329940140247345, "step": 3452 }, { "epoch": 2.3506979911474293, "grad_norm": 1.8618440628051758, "learning_rate": 5.8973120515482896e-06, "loss": 0.5041, "mean_token_accuracy": 0.8273205459117889, "step": 3453 }, { "epoch": 2.3513789581205313, "grad_norm": 1.8225716352462769, "learning_rate": 5.895095849577766e-06, "loss": 0.4804, "mean_token_accuracy": 0.8193550705909729, "step": 3454 }, { "epoch": 2.352059925093633, "grad_norm": 1.8735734224319458, "learning_rate": 5.892879465917689e-06, "loss": 0.553, "mean_token_accuracy": 0.7956746816635132, "step": 3455 }, { "epoch": 2.352740892066735, "grad_norm": 1.9767735004425049, "learning_rate": 5.890662901017951e-06, "loss": 0.5357, "mean_token_accuracy": 0.7883510291576385, "step": 3456 }, { "epoch": 2.3534218590398366, "grad_norm": 1.8214356899261475, "learning_rate": 5.888446155328478e-06, "loss": 0.5141, "mean_token_accuracy": 0.8357497155666351, "step": 3457 }, { "epoch": 2.354102826012938, "grad_norm": 1.8520828485488892, "learning_rate": 5.886229229299232e-06, "loss": 0.5639, "mean_token_accuracy": 0.777983546257019, "step": 3458 }, { "epoch": 2.3547837929860402, "grad_norm": 1.7073514461517334, "learning_rate": 5.884012123380209e-06, "loss": 0.5274, "mean_token_accuracy": 0.8086519539356232, "step": 3459 }, { "epoch": 2.355464759959142, "grad_norm": 1.8638206720352173, "learning_rate": 5.881794838021449e-06, "loss": 0.3809, "mean_token_accuracy": 0.8699544370174408, "step": 3460 }, { "epoch": 2.356145726932244, "grad_norm": 1.6631289720535278, "learning_rate": 5.8795773736730214e-06, "loss": 0.6658, "mean_token_accuracy": 0.7751843631267548, "step": 3461 }, { "epoch": 2.3568266939053455, "grad_norm": 1.6258069276809692, "learning_rate": 5.877359730785036e-06, "loss": 0.612, "mean_token_accuracy": 0.7761760950088501, "step": 3462 }, { "epoch": 2.3575076608784475, "grad_norm": 1.8174560070037842, "learning_rate": 5.875141909807636e-06, "loss": 0.5244, "mean_token_accuracy": 0.8154164850711823, "step": 3463 }, { "epoch": 2.358188627851549, "grad_norm": 1.7280985116958618, "learning_rate": 5.872923911191002e-06, "loss": 0.5462, "mean_token_accuracy": 0.7849937677383423, "step": 3464 }, { "epoch": 2.358869594824651, "grad_norm": 1.8790572881698608, "learning_rate": 5.870705735385352e-06, "loss": 0.6189, "mean_token_accuracy": 0.7950771749019623, "step": 3465 }, { "epoch": 2.359550561797753, "grad_norm": 1.7945563793182373, "learning_rate": 5.868487382840939e-06, "loss": 0.5615, "mean_token_accuracy": 0.7977118790149689, "step": 3466 }, { "epoch": 2.3602315287708544, "grad_norm": 1.8425743579864502, "learning_rate": 5.86626885400805e-06, "loss": 0.4535, "mean_token_accuracy": 0.8403680920600891, "step": 3467 }, { "epoch": 2.3609124957439565, "grad_norm": 1.7779107093811035, "learning_rate": 5.8640501493370105e-06, "loss": 0.4257, "mean_token_accuracy": 0.8520627617835999, "step": 3468 }, { "epoch": 2.361593462717058, "grad_norm": 1.8505913019180298, "learning_rate": 5.861831269278182e-06, "loss": 0.4518, "mean_token_accuracy": 0.8279004395008087, "step": 3469 }, { "epoch": 2.36227442969016, "grad_norm": 1.7937657833099365, "learning_rate": 5.859612214281956e-06, "loss": 0.5186, "mean_token_accuracy": 0.8058010339736938, "step": 3470 }, { "epoch": 2.3629553966632617, "grad_norm": 1.9108505249023438, "learning_rate": 5.857392984798769e-06, "loss": 0.5186, "mean_token_accuracy": 0.8022550344467163, "step": 3471 }, { "epoch": 2.3636363636363638, "grad_norm": 1.7938998937606812, "learning_rate": 5.855173581279082e-06, "loss": 0.4667, "mean_token_accuracy": 0.8048633933067322, "step": 3472 }, { "epoch": 2.3643173306094654, "grad_norm": 1.7143316268920898, "learning_rate": 5.852954004173402e-06, "loss": 0.5527, "mean_token_accuracy": 0.8110211789608002, "step": 3473 }, { "epoch": 2.3649982975825674, "grad_norm": 1.5987529754638672, "learning_rate": 5.850734253932263e-06, "loss": 0.6084, "mean_token_accuracy": 0.7851647138595581, "step": 3474 }, { "epoch": 2.365679264555669, "grad_norm": 1.8129156827926636, "learning_rate": 5.848514331006239e-06, "loss": 0.5913, "mean_token_accuracy": 0.7755208015441895, "step": 3475 }, { "epoch": 2.3663602315287706, "grad_norm": 1.713407039642334, "learning_rate": 5.8462942358459375e-06, "loss": 0.5932, "mean_token_accuracy": 0.7781489491462708, "step": 3476 }, { "epoch": 2.3670411985018727, "grad_norm": 1.677044153213501, "learning_rate": 5.8440739689019996e-06, "loss": 0.547, "mean_token_accuracy": 0.7820753455162048, "step": 3477 }, { "epoch": 2.3677221654749743, "grad_norm": 1.9038578271865845, "learning_rate": 5.841853530625102e-06, "loss": 0.3492, "mean_token_accuracy": 0.8799779713153839, "step": 3478 }, { "epoch": 2.3684031324480763, "grad_norm": 1.6402901411056519, "learning_rate": 5.839632921465959e-06, "loss": 0.4791, "mean_token_accuracy": 0.8389498591423035, "step": 3479 }, { "epoch": 2.369084099421178, "grad_norm": 1.9787769317626953, "learning_rate": 5.837412141875315e-06, "loss": 0.4653, "mean_token_accuracy": 0.8430567681789398, "step": 3480 }, { "epoch": 2.36976506639428, "grad_norm": 1.806567907333374, "learning_rate": 5.8351911923039526e-06, "loss": 0.5154, "mean_token_accuracy": 0.8303660154342651, "step": 3481 }, { "epoch": 2.3704460333673816, "grad_norm": 1.5921847820281982, "learning_rate": 5.832970073202688e-06, "loss": 0.6754, "mean_token_accuracy": 0.761794239282608, "step": 3482 }, { "epoch": 2.3711270003404836, "grad_norm": 1.7793079614639282, "learning_rate": 5.830748785022369e-06, "loss": 0.5605, "mean_token_accuracy": 0.7930490374565125, "step": 3483 }, { "epoch": 2.3718079673135852, "grad_norm": 1.8064467906951904, "learning_rate": 5.8285273282138845e-06, "loss": 0.5322, "mean_token_accuracy": 0.7993034422397614, "step": 3484 }, { "epoch": 2.3724889342866873, "grad_norm": 1.8285807371139526, "learning_rate": 5.826305703228148e-06, "loss": 0.4536, "mean_token_accuracy": 0.8323166072368622, "step": 3485 }, { "epoch": 2.373169901259789, "grad_norm": 1.732938289642334, "learning_rate": 5.824083910516115e-06, "loss": 0.5177, "mean_token_accuracy": 0.8158302009105682, "step": 3486 }, { "epoch": 2.373850868232891, "grad_norm": 1.7620738744735718, "learning_rate": 5.821861950528773e-06, "loss": 0.4847, "mean_token_accuracy": 0.8028750717639923, "step": 3487 }, { "epoch": 2.3745318352059925, "grad_norm": 1.7711241245269775, "learning_rate": 5.8196398237171424e-06, "loss": 0.5948, "mean_token_accuracy": 0.8012538552284241, "step": 3488 }, { "epoch": 2.375212802179094, "grad_norm": 1.9531179666519165, "learning_rate": 5.8174175305322775e-06, "loss": 0.4163, "mean_token_accuracy": 0.8541300296783447, "step": 3489 }, { "epoch": 2.375893769152196, "grad_norm": 1.8235678672790527, "learning_rate": 5.815195071425267e-06, "loss": 0.4647, "mean_token_accuracy": 0.8140643835067749, "step": 3490 }, { "epoch": 2.376574736125298, "grad_norm": 1.7530709505081177, "learning_rate": 5.812972446847234e-06, "loss": 0.4735, "mean_token_accuracy": 0.8279136419296265, "step": 3491 }, { "epoch": 2.3772557030984, "grad_norm": 1.9252742528915405, "learning_rate": 5.810749657249333e-06, "loss": 0.369, "mean_token_accuracy": 0.8755924105644226, "step": 3492 }, { "epoch": 2.3779366700715014, "grad_norm": 1.9414697885513306, "learning_rate": 5.808526703082752e-06, "loss": 0.493, "mean_token_accuracy": 0.8172665536403656, "step": 3493 }, { "epoch": 2.3786176370446035, "grad_norm": 1.8243446350097656, "learning_rate": 5.806303584798716e-06, "loss": 0.5526, "mean_token_accuracy": 0.7842724621295929, "step": 3494 }, { "epoch": 2.379298604017705, "grad_norm": 1.616610050201416, "learning_rate": 5.8040803028484805e-06, "loss": 0.6657, "mean_token_accuracy": 0.7524738609790802, "step": 3495 }, { "epoch": 2.379979570990807, "grad_norm": 1.8598754405975342, "learning_rate": 5.8018568576833345e-06, "loss": 0.5128, "mean_token_accuracy": 0.790308803319931, "step": 3496 }, { "epoch": 2.3806605379639088, "grad_norm": 1.7596672773361206, "learning_rate": 5.7996332497546015e-06, "loss": 0.5019, "mean_token_accuracy": 0.8337397575378418, "step": 3497 }, { "epoch": 2.3813415049370104, "grad_norm": 1.6309138536453247, "learning_rate": 5.797409479513634e-06, "loss": 0.6606, "mean_token_accuracy": 0.779148280620575, "step": 3498 }, { "epoch": 2.3820224719101124, "grad_norm": 1.7027112245559692, "learning_rate": 5.795185547411823e-06, "loss": 0.4993, "mean_token_accuracy": 0.8259619474411011, "step": 3499 }, { "epoch": 2.382703438883214, "grad_norm": 1.7719072103500366, "learning_rate": 5.792961453900588e-06, "loss": 0.4527, "mean_token_accuracy": 0.8416943848133087, "step": 3500 }, { "epoch": 2.383384405856316, "grad_norm": 1.626949667930603, "learning_rate": 5.790737199431384e-06, "loss": 0.6915, "mean_token_accuracy": 0.7599803507328033, "step": 3501 }, { "epoch": 2.3840653728294177, "grad_norm": 1.7767534255981445, "learning_rate": 5.788512784455697e-06, "loss": 0.4479, "mean_token_accuracy": 0.8433937430381775, "step": 3502 }, { "epoch": 2.3847463398025197, "grad_norm": 1.5754777193069458, "learning_rate": 5.786288209425049e-06, "loss": 0.6007, "mean_token_accuracy": 0.7979679107666016, "step": 3503 }, { "epoch": 2.3854273067756213, "grad_norm": 1.8136874437332153, "learning_rate": 5.784063474790986e-06, "loss": 0.526, "mean_token_accuracy": 0.7877598106861115, "step": 3504 }, { "epoch": 2.3861082737487234, "grad_norm": 1.7513128519058228, "learning_rate": 5.781838581005096e-06, "loss": 0.514, "mean_token_accuracy": 0.8132897019386292, "step": 3505 }, { "epoch": 2.386789240721825, "grad_norm": 1.8282508850097656, "learning_rate": 5.779613528518996e-06, "loss": 0.4996, "mean_token_accuracy": 0.832863450050354, "step": 3506 }, { "epoch": 2.3874702076949266, "grad_norm": 1.724844217300415, "learning_rate": 5.777388317784333e-06, "loss": 0.4628, "mean_token_accuracy": 0.8415766954421997, "step": 3507 }, { "epoch": 2.3881511746680286, "grad_norm": 1.7342923879623413, "learning_rate": 5.775162949252786e-06, "loss": 0.5976, "mean_token_accuracy": 0.77391317486763, "step": 3508 }, { "epoch": 2.3888321416411302, "grad_norm": 1.794116497039795, "learning_rate": 5.77293742337607e-06, "loss": 0.6116, "mean_token_accuracy": 0.8017635643482208, "step": 3509 }, { "epoch": 2.3895131086142323, "grad_norm": 1.7815364599227905, "learning_rate": 5.770711740605931e-06, "loss": 0.5011, "mean_token_accuracy": 0.8128030598163605, "step": 3510 }, { "epoch": 2.390194075587334, "grad_norm": 1.933415412902832, "learning_rate": 5.768485901394143e-06, "loss": 0.4166, "mean_token_accuracy": 0.8484544456005096, "step": 3511 }, { "epoch": 2.390875042560436, "grad_norm": 1.812828540802002, "learning_rate": 5.766259906192516e-06, "loss": 0.4931, "mean_token_accuracy": 0.835696667432785, "step": 3512 }, { "epoch": 2.3915560095335375, "grad_norm": 1.8115359544754028, "learning_rate": 5.764033755452888e-06, "loss": 0.5925, "mean_token_accuracy": 0.7876133620738983, "step": 3513 }, { "epoch": 2.3922369765066396, "grad_norm": 1.8785802125930786, "learning_rate": 5.761807449627133e-06, "loss": 0.4579, "mean_token_accuracy": 0.8440218567848206, "step": 3514 }, { "epoch": 2.392917943479741, "grad_norm": 1.8322362899780273, "learning_rate": 5.759580989167152e-06, "loss": 0.5793, "mean_token_accuracy": 0.7986041605472565, "step": 3515 }, { "epoch": 2.393598910452843, "grad_norm": 1.840289831161499, "learning_rate": 5.757354374524879e-06, "loss": 0.4638, "mean_token_accuracy": 0.8324860632419586, "step": 3516 }, { "epoch": 2.394279877425945, "grad_norm": 2.034687042236328, "learning_rate": 5.7551276061522835e-06, "loss": 0.5438, "mean_token_accuracy": 0.8138094842433929, "step": 3517 }, { "epoch": 2.394960844399047, "grad_norm": 1.94163179397583, "learning_rate": 5.752900684501358e-06, "loss": 0.39, "mean_token_accuracy": 0.8512072265148163, "step": 3518 }, { "epoch": 2.3956418113721485, "grad_norm": 1.9533782005310059, "learning_rate": 5.7506736100241324e-06, "loss": 0.4268, "mean_token_accuracy": 0.8498731553554535, "step": 3519 }, { "epoch": 2.39632277834525, "grad_norm": 1.8785114288330078, "learning_rate": 5.748446383172664e-06, "loss": 0.4335, "mean_token_accuracy": 0.8541833758354187, "step": 3520 }, { "epoch": 2.397003745318352, "grad_norm": 1.8729883432388306, "learning_rate": 5.746219004399047e-06, "loss": 0.4621, "mean_token_accuracy": 0.843559205532074, "step": 3521 }, { "epoch": 2.3976847122914537, "grad_norm": 1.656722903251648, "learning_rate": 5.743991474155398e-06, "loss": 0.532, "mean_token_accuracy": 0.8054904937744141, "step": 3522 }, { "epoch": 2.398365679264556, "grad_norm": 1.767008662223816, "learning_rate": 5.741763792893871e-06, "loss": 0.4529, "mean_token_accuracy": 0.8271495997905731, "step": 3523 }, { "epoch": 2.3990466462376574, "grad_norm": 1.8275251388549805, "learning_rate": 5.739535961066645e-06, "loss": 0.5067, "mean_token_accuracy": 0.8146032989025116, "step": 3524 }, { "epoch": 2.3997276132107594, "grad_norm": 1.6408902406692505, "learning_rate": 5.737307979125936e-06, "loss": 0.5551, "mean_token_accuracy": 0.8056017458438873, "step": 3525 }, { "epoch": 2.400408580183861, "grad_norm": 2.003913164138794, "learning_rate": 5.735079847523988e-06, "loss": 0.5064, "mean_token_accuracy": 0.8180328011512756, "step": 3526 }, { "epoch": 2.401089547156963, "grad_norm": 1.9312788248062134, "learning_rate": 5.73285156671307e-06, "loss": 0.4266, "mean_token_accuracy": 0.8450799584388733, "step": 3527 }, { "epoch": 2.4017705141300647, "grad_norm": 1.9989688396453857, "learning_rate": 5.730623137145491e-06, "loss": 0.4995, "mean_token_accuracy": 0.8086964190006256, "step": 3528 }, { "epoch": 2.4024514811031663, "grad_norm": 2.012775182723999, "learning_rate": 5.728394559273583e-06, "loss": 0.4225, "mean_token_accuracy": 0.8503105342388153, "step": 3529 }, { "epoch": 2.4031324480762684, "grad_norm": 2.0166592597961426, "learning_rate": 5.726165833549709e-06, "loss": 0.4702, "mean_token_accuracy": 0.8395357728004456, "step": 3530 }, { "epoch": 2.40381341504937, "grad_norm": 1.6223711967468262, "learning_rate": 5.723936960426263e-06, "loss": 0.6289, "mean_token_accuracy": 0.7811292111873627, "step": 3531 }, { "epoch": 2.404494382022472, "grad_norm": 1.9204193353652954, "learning_rate": 5.721707940355672e-06, "loss": 0.392, "mean_token_accuracy": 0.8692401945590973, "step": 3532 }, { "epoch": 2.4051753489955736, "grad_norm": 1.7874000072479248, "learning_rate": 5.719478773790389e-06, "loss": 0.4306, "mean_token_accuracy": 0.8531653583049774, "step": 3533 }, { "epoch": 2.4058563159686757, "grad_norm": 1.8320953845977783, "learning_rate": 5.7172494611828946e-06, "loss": 0.4586, "mean_token_accuracy": 0.8371573686599731, "step": 3534 }, { "epoch": 2.4065372829417773, "grad_norm": 1.579261302947998, "learning_rate": 5.715020002985705e-06, "loss": 0.6014, "mean_token_accuracy": 0.8108899891376495, "step": 3535 }, { "epoch": 2.4072182499148793, "grad_norm": 1.7674221992492676, "learning_rate": 5.712790399651364e-06, "loss": 0.5168, "mean_token_accuracy": 0.8081730008125305, "step": 3536 }, { "epoch": 2.407899216887981, "grad_norm": 1.7411290407180786, "learning_rate": 5.710560651632442e-06, "loss": 0.5924, "mean_token_accuracy": 0.793085128068924, "step": 3537 }, { "epoch": 2.4085801838610825, "grad_norm": 1.829296350479126, "learning_rate": 5.708330759381542e-06, "loss": 0.4533, "mean_token_accuracy": 0.8205020427703857, "step": 3538 }, { "epoch": 2.4092611508341846, "grad_norm": 1.8078449964523315, "learning_rate": 5.706100723351292e-06, "loss": 0.6812, "mean_token_accuracy": 0.7825891375541687, "step": 3539 }, { "epoch": 2.409942117807286, "grad_norm": 1.7139358520507812, "learning_rate": 5.703870543994357e-06, "loss": 0.5996, "mean_token_accuracy": 0.7798226475715637, "step": 3540 }, { "epoch": 2.4106230847803882, "grad_norm": 1.7905559539794922, "learning_rate": 5.701640221763421e-06, "loss": 0.4673, "mean_token_accuracy": 0.8252035677433014, "step": 3541 }, { "epoch": 2.41130405175349, "grad_norm": 1.8149691820144653, "learning_rate": 5.699409757111206e-06, "loss": 0.4364, "mean_token_accuracy": 0.8441117703914642, "step": 3542 }, { "epoch": 2.411985018726592, "grad_norm": 1.8438279628753662, "learning_rate": 5.697179150490459e-06, "loss": 0.5413, "mean_token_accuracy": 0.794971227645874, "step": 3543 }, { "epoch": 2.4126659856996935, "grad_norm": 1.8896909952163696, "learning_rate": 5.694948402353955e-06, "loss": 0.4573, "mean_token_accuracy": 0.8523000478744507, "step": 3544 }, { "epoch": 2.4133469526727955, "grad_norm": 1.722581148147583, "learning_rate": 5.692717513154499e-06, "loss": 0.4991, "mean_token_accuracy": 0.8291942179203033, "step": 3545 }, { "epoch": 2.414027919645897, "grad_norm": 1.7824674844741821, "learning_rate": 5.690486483344922e-06, "loss": 0.5058, "mean_token_accuracy": 0.816104918718338, "step": 3546 }, { "epoch": 2.4147088866189987, "grad_norm": 1.790300965309143, "learning_rate": 5.68825531337809e-06, "loss": 0.5359, "mean_token_accuracy": 0.8196649551391602, "step": 3547 }, { "epoch": 2.415389853592101, "grad_norm": 1.9129544496536255, "learning_rate": 5.686024003706892e-06, "loss": 0.5628, "mean_token_accuracy": 0.7798348665237427, "step": 3548 }, { "epoch": 2.416070820565203, "grad_norm": 2.0164124965667725, "learning_rate": 5.683792554784245e-06, "loss": 0.4698, "mean_token_accuracy": 0.832764059305191, "step": 3549 }, { "epoch": 2.4167517875383044, "grad_norm": 1.7663782835006714, "learning_rate": 5.6815609670630975e-06, "loss": 0.509, "mean_token_accuracy": 0.8279618322849274, "step": 3550 }, { "epoch": 2.417432754511406, "grad_norm": 1.7635358572006226, "learning_rate": 5.679329240996425e-06, "loss": 0.5082, "mean_token_accuracy": 0.8258982002735138, "step": 3551 }, { "epoch": 2.418113721484508, "grad_norm": 1.8274472951889038, "learning_rate": 5.677097377037228e-06, "loss": 0.4578, "mean_token_accuracy": 0.8436243832111359, "step": 3552 }, { "epoch": 2.4187946884576097, "grad_norm": 1.8096388578414917, "learning_rate": 5.674865375638542e-06, "loss": 0.5669, "mean_token_accuracy": 0.7657033801078796, "step": 3553 }, { "epoch": 2.4194756554307117, "grad_norm": 1.9224135875701904, "learning_rate": 5.672633237253422e-06, "loss": 0.4843, "mean_token_accuracy": 0.8097735047340393, "step": 3554 }, { "epoch": 2.4201566224038134, "grad_norm": 1.6279438734054565, "learning_rate": 5.670400962334959e-06, "loss": 0.5987, "mean_token_accuracy": 0.7954937517642975, "step": 3555 }, { "epoch": 2.4208375893769154, "grad_norm": 1.6920334100723267, "learning_rate": 5.668168551336263e-06, "loss": 0.6242, "mean_token_accuracy": 0.7845261693000793, "step": 3556 }, { "epoch": 2.421518556350017, "grad_norm": 1.6886565685272217, "learning_rate": 5.6659360047104785e-06, "loss": 0.5906, "mean_token_accuracy": 0.7884023189544678, "step": 3557 }, { "epoch": 2.422199523323119, "grad_norm": 1.7735265493392944, "learning_rate": 5.663703322910778e-06, "loss": 0.5308, "mean_token_accuracy": 0.8139632940292358, "step": 3558 }, { "epoch": 2.4228804902962207, "grad_norm": 1.94435453414917, "learning_rate": 5.661470506390354e-06, "loss": 0.4749, "mean_token_accuracy": 0.841544508934021, "step": 3559 }, { "epoch": 2.4235614572693223, "grad_norm": 1.7400964498519897, "learning_rate": 5.6592375556024335e-06, "loss": 0.6067, "mean_token_accuracy": 0.7872035801410675, "step": 3560 }, { "epoch": 2.4242424242424243, "grad_norm": 1.7564185857772827, "learning_rate": 5.657004471000268e-06, "loss": 0.5201, "mean_token_accuracy": 0.8083772957324982, "step": 3561 }, { "epoch": 2.424923391215526, "grad_norm": 1.788056492805481, "learning_rate": 5.654771253037137e-06, "loss": 0.442, "mean_token_accuracy": 0.845661848783493, "step": 3562 }, { "epoch": 2.425604358188628, "grad_norm": 1.8485132455825806, "learning_rate": 5.652537902166346e-06, "loss": 0.4859, "mean_token_accuracy": 0.8212621212005615, "step": 3563 }, { "epoch": 2.4262853251617296, "grad_norm": 1.6273679733276367, "learning_rate": 5.650304418841228e-06, "loss": 0.6506, "mean_token_accuracy": 0.7862094342708588, "step": 3564 }, { "epoch": 2.4269662921348316, "grad_norm": 1.7989100217819214, "learning_rate": 5.648070803515142e-06, "loss": 0.5293, "mean_token_accuracy": 0.8189464807510376, "step": 3565 }, { "epoch": 2.427647259107933, "grad_norm": 1.7948371171951294, "learning_rate": 5.645837056641477e-06, "loss": 0.557, "mean_token_accuracy": 0.7982705533504486, "step": 3566 }, { "epoch": 2.4283282260810353, "grad_norm": 1.8103867769241333, "learning_rate": 5.643603178673644e-06, "loss": 0.4242, "mean_token_accuracy": 0.8563993573188782, "step": 3567 }, { "epoch": 2.429009193054137, "grad_norm": 1.987227439880371, "learning_rate": 5.6413691700650865e-06, "loss": 0.3914, "mean_token_accuracy": 0.8694651424884796, "step": 3568 }, { "epoch": 2.4296901600272385, "grad_norm": 1.8502546548843384, "learning_rate": 5.639135031269266e-06, "loss": 0.479, "mean_token_accuracy": 0.8271031081676483, "step": 3569 }, { "epoch": 2.4303711270003405, "grad_norm": 1.8659027814865112, "learning_rate": 5.63690076273968e-06, "loss": 0.5064, "mean_token_accuracy": 0.8233088254928589, "step": 3570 }, { "epoch": 2.431052093973442, "grad_norm": 1.8207979202270508, "learning_rate": 5.634666364929845e-06, "loss": 0.475, "mean_token_accuracy": 0.8445324301719666, "step": 3571 }, { "epoch": 2.431733060946544, "grad_norm": 1.8103445768356323, "learning_rate": 5.6324318382933065e-06, "loss": 0.5677, "mean_token_accuracy": 0.7990860641002655, "step": 3572 }, { "epoch": 2.432414027919646, "grad_norm": 1.7273980379104614, "learning_rate": 5.630197183283639e-06, "loss": 0.6569, "mean_token_accuracy": 0.7367520332336426, "step": 3573 }, { "epoch": 2.433094994892748, "grad_norm": 1.5753469467163086, "learning_rate": 5.6279624003544385e-06, "loss": 0.6849, "mean_token_accuracy": 0.7684541046619415, "step": 3574 }, { "epoch": 2.4337759618658494, "grad_norm": 1.7392182350158691, "learning_rate": 5.625727489959329e-06, "loss": 0.4846, "mean_token_accuracy": 0.8246137201786041, "step": 3575 }, { "epoch": 2.4344569288389515, "grad_norm": 1.8810651302337646, "learning_rate": 5.623492452551958e-06, "loss": 0.4347, "mean_token_accuracy": 0.8505960404872894, "step": 3576 }, { "epoch": 2.435137895812053, "grad_norm": 1.8553625345230103, "learning_rate": 5.621257288586004e-06, "loss": 0.4058, "mean_token_accuracy": 0.8626854419708252, "step": 3577 }, { "epoch": 2.4358188627851547, "grad_norm": 1.8199758529663086, "learning_rate": 5.619021998515165e-06, "loss": 0.5597, "mean_token_accuracy": 0.8021115064620972, "step": 3578 }, { "epoch": 2.4364998297582567, "grad_norm": 1.791660189628601, "learning_rate": 5.616786582793171e-06, "loss": 0.6026, "mean_token_accuracy": 0.8137822151184082, "step": 3579 }, { "epoch": 2.4371807967313583, "grad_norm": 1.9058558940887451, "learning_rate": 5.614551041873772e-06, "loss": 0.4949, "mean_token_accuracy": 0.8234712183475494, "step": 3580 }, { "epoch": 2.4378617637044604, "grad_norm": 1.6664741039276123, "learning_rate": 5.612315376210747e-06, "loss": 0.5945, "mean_token_accuracy": 0.7978937327861786, "step": 3581 }, { "epoch": 2.438542730677562, "grad_norm": 1.8864622116088867, "learning_rate": 5.610079586257897e-06, "loss": 0.3943, "mean_token_accuracy": 0.8663992583751678, "step": 3582 }, { "epoch": 2.439223697650664, "grad_norm": 1.756061315536499, "learning_rate": 5.607843672469051e-06, "loss": 0.5668, "mean_token_accuracy": 0.8057580292224884, "step": 3583 }, { "epoch": 2.4399046646237657, "grad_norm": 1.9741790294647217, "learning_rate": 5.6056076352980624e-06, "loss": 0.4208, "mean_token_accuracy": 0.8504855930805206, "step": 3584 }, { "epoch": 2.4405856315968677, "grad_norm": 1.7793923616409302, "learning_rate": 5.603371475198811e-06, "loss": 0.4935, "mean_token_accuracy": 0.8341158032417297, "step": 3585 }, { "epoch": 2.4412665985699693, "grad_norm": 1.9223380088806152, "learning_rate": 5.601135192625198e-06, "loss": 0.392, "mean_token_accuracy": 0.8747198581695557, "step": 3586 }, { "epoch": 2.4419475655430714, "grad_norm": 1.7137888669967651, "learning_rate": 5.598898788031151e-06, "loss": 0.539, "mean_token_accuracy": 0.8314259648323059, "step": 3587 }, { "epoch": 2.442628532516173, "grad_norm": 1.696307897567749, "learning_rate": 5.596662261870625e-06, "loss": 0.5127, "mean_token_accuracy": 0.819991946220398, "step": 3588 }, { "epoch": 2.443309499489275, "grad_norm": 1.9756735563278198, "learning_rate": 5.594425614597596e-06, "loss": 0.448, "mean_token_accuracy": 0.8469951748847961, "step": 3589 }, { "epoch": 2.4439904664623766, "grad_norm": 1.8649810552597046, "learning_rate": 5.592188846666066e-06, "loss": 0.4978, "mean_token_accuracy": 0.8183762431144714, "step": 3590 }, { "epoch": 2.444671433435478, "grad_norm": 1.6667509078979492, "learning_rate": 5.5899519585300625e-06, "loss": 0.5792, "mean_token_accuracy": 0.8026621043682098, "step": 3591 }, { "epoch": 2.4453524004085803, "grad_norm": 1.583653450012207, "learning_rate": 5.587714950643638e-06, "loss": 0.6, "mean_token_accuracy": 0.8146407604217529, "step": 3592 }, { "epoch": 2.446033367381682, "grad_norm": 1.7322888374328613, "learning_rate": 5.5854778234608645e-06, "loss": 0.5667, "mean_token_accuracy": 0.8007032871246338, "step": 3593 }, { "epoch": 2.446714334354784, "grad_norm": 1.7074865102767944, "learning_rate": 5.5832405774358445e-06, "loss": 0.5195, "mean_token_accuracy": 0.8227068781852722, "step": 3594 }, { "epoch": 2.4473953013278855, "grad_norm": 1.8187662363052368, "learning_rate": 5.581003213022699e-06, "loss": 0.6417, "mean_token_accuracy": 0.784758985042572, "step": 3595 }, { "epoch": 2.4480762683009876, "grad_norm": 1.9090932607650757, "learning_rate": 5.578765730675578e-06, "loss": 0.5803, "mean_token_accuracy": 0.7799476981163025, "step": 3596 }, { "epoch": 2.448757235274089, "grad_norm": 1.770227074623108, "learning_rate": 5.576528130848652e-06, "loss": 0.4891, "mean_token_accuracy": 0.8348551392555237, "step": 3597 }, { "epoch": 2.449438202247191, "grad_norm": 1.8518941402435303, "learning_rate": 5.574290413996117e-06, "loss": 0.6584, "mean_token_accuracy": 0.7685248255729675, "step": 3598 }, { "epoch": 2.450119169220293, "grad_norm": 1.8115695714950562, "learning_rate": 5.572052580572193e-06, "loss": 0.557, "mean_token_accuracy": 0.7647252082824707, "step": 3599 }, { "epoch": 2.4508001361933944, "grad_norm": 1.8340429067611694, "learning_rate": 5.569814631031121e-06, "loss": 0.4953, "mean_token_accuracy": 0.8104205429553986, "step": 3600 }, { "epoch": 2.4514811031664965, "grad_norm": 1.913498044013977, "learning_rate": 5.567576565827169e-06, "loss": 0.5523, "mean_token_accuracy": 0.7735106647014618, "step": 3601 }, { "epoch": 2.452162070139598, "grad_norm": 1.9544669389724731, "learning_rate": 5.565338385414625e-06, "loss": 0.5196, "mean_token_accuracy": 0.8246332705020905, "step": 3602 }, { "epoch": 2.4528430371127, "grad_norm": 1.7944769859313965, "learning_rate": 5.563100090247805e-06, "loss": 0.4508, "mean_token_accuracy": 0.8302318453788757, "step": 3603 }, { "epoch": 2.4535240040858017, "grad_norm": 1.8823477029800415, "learning_rate": 5.560861680781046e-06, "loss": 0.5984, "mean_token_accuracy": 0.788791835308075, "step": 3604 }, { "epoch": 2.454204971058904, "grad_norm": 1.8216211795806885, "learning_rate": 5.558623157468704e-06, "loss": 0.4578, "mean_token_accuracy": 0.8335550129413605, "step": 3605 }, { "epoch": 2.4548859380320054, "grad_norm": 1.8573999404907227, "learning_rate": 5.5563845207651634e-06, "loss": 0.4416, "mean_token_accuracy": 0.8465836942195892, "step": 3606 }, { "epoch": 2.4555669050051074, "grad_norm": 1.8141182661056519, "learning_rate": 5.554145771124832e-06, "loss": 0.4361, "mean_token_accuracy": 0.8410320580005646, "step": 3607 }, { "epoch": 2.456247871978209, "grad_norm": 1.821554183959961, "learning_rate": 5.5519069090021385e-06, "loss": 0.4476, "mean_token_accuracy": 0.8308936059474945, "step": 3608 }, { "epoch": 2.4569288389513106, "grad_norm": 1.8454619646072388, "learning_rate": 5.549667934851534e-06, "loss": 0.4297, "mean_token_accuracy": 0.8487754762172699, "step": 3609 }, { "epoch": 2.4576098059244127, "grad_norm": 1.84751558303833, "learning_rate": 5.5474288491274916e-06, "loss": 0.5246, "mean_token_accuracy": 0.8227699100971222, "step": 3610 }, { "epoch": 2.4582907728975143, "grad_norm": 1.9939604997634888, "learning_rate": 5.545189652284511e-06, "loss": 0.5277, "mean_token_accuracy": 0.8206274807453156, "step": 3611 }, { "epoch": 2.4589717398706163, "grad_norm": 1.78388512134552, "learning_rate": 5.542950344777109e-06, "loss": 0.4848, "mean_token_accuracy": 0.7953798174858093, "step": 3612 }, { "epoch": 2.459652706843718, "grad_norm": 1.8214253187179565, "learning_rate": 5.540710927059829e-06, "loss": 0.5805, "mean_token_accuracy": 0.804116427898407, "step": 3613 }, { "epoch": 2.46033367381682, "grad_norm": 1.8553786277770996, "learning_rate": 5.5384713995872375e-06, "loss": 0.5718, "mean_token_accuracy": 0.8013556003570557, "step": 3614 }, { "epoch": 2.4610146407899216, "grad_norm": 1.7606487274169922, "learning_rate": 5.536231762813919e-06, "loss": 0.4111, "mean_token_accuracy": 0.8399420380592346, "step": 3615 }, { "epoch": 2.4616956077630237, "grad_norm": 1.8502566814422607, "learning_rate": 5.533992017194481e-06, "loss": 0.5765, "mean_token_accuracy": 0.8113327920436859, "step": 3616 }, { "epoch": 2.4623765747361253, "grad_norm": 1.8948173522949219, "learning_rate": 5.531752163183558e-06, "loss": 0.467, "mean_token_accuracy": 0.829171895980835, "step": 3617 }, { "epoch": 2.463057541709227, "grad_norm": 1.7692073583602905, "learning_rate": 5.529512201235803e-06, "loss": 0.4466, "mean_token_accuracy": 0.8538122475147247, "step": 3618 }, { "epoch": 2.463738508682329, "grad_norm": 1.6704109907150269, "learning_rate": 5.5272721318058886e-06, "loss": 0.5786, "mean_token_accuracy": 0.8085638284683228, "step": 3619 }, { "epoch": 2.464419475655431, "grad_norm": 1.7795679569244385, "learning_rate": 5.525031955348511e-06, "loss": 0.5379, "mean_token_accuracy": 0.7745042443275452, "step": 3620 }, { "epoch": 2.4651004426285326, "grad_norm": 1.7658922672271729, "learning_rate": 5.5227916723183905e-06, "loss": 0.3957, "mean_token_accuracy": 0.8594871163368225, "step": 3621 }, { "epoch": 2.465781409601634, "grad_norm": 1.8792778253555298, "learning_rate": 5.520551283170268e-06, "loss": 0.4962, "mean_token_accuracy": 0.8281857371330261, "step": 3622 }, { "epoch": 2.466462376574736, "grad_norm": 1.9606786966323853, "learning_rate": 5.518310788358904e-06, "loss": 0.4055, "mean_token_accuracy": 0.860957533121109, "step": 3623 }, { "epoch": 2.467143343547838, "grad_norm": 1.9684487581253052, "learning_rate": 5.516070188339082e-06, "loss": 0.43, "mean_token_accuracy": 0.8362628221511841, "step": 3624 }, { "epoch": 2.46782431052094, "grad_norm": 1.855893850326538, "learning_rate": 5.513829483565606e-06, "loss": 0.4996, "mean_token_accuracy": 0.8196890652179718, "step": 3625 }, { "epoch": 2.4685052774940415, "grad_norm": 1.9399598836898804, "learning_rate": 5.511588674493302e-06, "loss": 0.4753, "mean_token_accuracy": 0.8354816734790802, "step": 3626 }, { "epoch": 2.4691862444671435, "grad_norm": 1.8073464632034302, "learning_rate": 5.5093477615770165e-06, "loss": 0.5462, "mean_token_accuracy": 0.8042354881763458, "step": 3627 }, { "epoch": 2.469867211440245, "grad_norm": 2.0958502292633057, "learning_rate": 5.507106745271618e-06, "loss": 0.3819, "mean_token_accuracy": 0.8635105192661285, "step": 3628 }, { "epoch": 2.470548178413347, "grad_norm": 1.974939227104187, "learning_rate": 5.504865626031997e-06, "loss": 0.4395, "mean_token_accuracy": 0.8355922698974609, "step": 3629 }, { "epoch": 2.4712291453864488, "grad_norm": 1.9828968048095703, "learning_rate": 5.5026244043130605e-06, "loss": 0.4194, "mean_token_accuracy": 0.8397445380687714, "step": 3630 }, { "epoch": 2.4719101123595504, "grad_norm": 1.7752190828323364, "learning_rate": 5.50038308056974e-06, "loss": 0.4728, "mean_token_accuracy": 0.8294410109519958, "step": 3631 }, { "epoch": 2.4725910793326524, "grad_norm": 1.8935378789901733, "learning_rate": 5.498141655256988e-06, "loss": 0.4251, "mean_token_accuracy": 0.8541288375854492, "step": 3632 }, { "epoch": 2.473272046305754, "grad_norm": 1.7412961721420288, "learning_rate": 5.495900128829776e-06, "loss": 0.6262, "mean_token_accuracy": 0.784597784280777, "step": 3633 }, { "epoch": 2.473953013278856, "grad_norm": 1.8188273906707764, "learning_rate": 5.493658501743097e-06, "loss": 0.5204, "mean_token_accuracy": 0.81205153465271, "step": 3634 }, { "epoch": 2.4746339802519577, "grad_norm": 1.9717053174972534, "learning_rate": 5.491416774451963e-06, "loss": 0.3326, "mean_token_accuracy": 0.8868533372879028, "step": 3635 }, { "epoch": 2.4753149472250597, "grad_norm": 1.894541621208191, "learning_rate": 5.489174947411408e-06, "loss": 0.4084, "mean_token_accuracy": 0.849835067987442, "step": 3636 }, { "epoch": 2.4759959141981613, "grad_norm": 1.8071209192276, "learning_rate": 5.4869330210764856e-06, "loss": 0.4676, "mean_token_accuracy": 0.8191790878772736, "step": 3637 }, { "epoch": 2.4766768811712634, "grad_norm": 1.695525884628296, "learning_rate": 5.484690995902268e-06, "loss": 0.5102, "mean_token_accuracy": 0.7870627641677856, "step": 3638 }, { "epoch": 2.477357848144365, "grad_norm": 1.8878998756408691, "learning_rate": 5.482448872343851e-06, "loss": 0.566, "mean_token_accuracy": 0.7986297905445099, "step": 3639 }, { "epoch": 2.4780388151174666, "grad_norm": 1.755286693572998, "learning_rate": 5.480206650856348e-06, "loss": 0.4172, "mean_token_accuracy": 0.848899632692337, "step": 3640 }, { "epoch": 2.4787197820905686, "grad_norm": 1.9664114713668823, "learning_rate": 5.477964331894891e-06, "loss": 0.4434, "mean_token_accuracy": 0.8262508809566498, "step": 3641 }, { "epoch": 2.4794007490636703, "grad_norm": 1.860650897026062, "learning_rate": 5.4757219159146346e-06, "loss": 0.5811, "mean_token_accuracy": 0.7925495505332947, "step": 3642 }, { "epoch": 2.4800817160367723, "grad_norm": 1.967965006828308, "learning_rate": 5.4734794033707515e-06, "loss": 0.4434, "mean_token_accuracy": 0.8499889373779297, "step": 3643 }, { "epoch": 2.480762683009874, "grad_norm": 1.7665364742279053, "learning_rate": 5.471236794718436e-06, "loss": 0.5285, "mean_token_accuracy": 0.7961522936820984, "step": 3644 }, { "epoch": 2.481443649982976, "grad_norm": 2.0055124759674072, "learning_rate": 5.4689940904129e-06, "loss": 0.3818, "mean_token_accuracy": 0.8706793785095215, "step": 3645 }, { "epoch": 2.4821246169560776, "grad_norm": 1.6233175992965698, "learning_rate": 5.466751290909372e-06, "loss": 0.6951, "mean_token_accuracy": 0.7661291658878326, "step": 3646 }, { "epoch": 2.4828055839291796, "grad_norm": 1.8591053485870361, "learning_rate": 5.464508396663105e-06, "loss": 0.5026, "mean_token_accuracy": 0.810670405626297, "step": 3647 }, { "epoch": 2.483486550902281, "grad_norm": 1.6625962257385254, "learning_rate": 5.462265408129372e-06, "loss": 0.5975, "mean_token_accuracy": 0.775530219078064, "step": 3648 }, { "epoch": 2.484167517875383, "grad_norm": 1.8693641424179077, "learning_rate": 5.460022325763457e-06, "loss": 0.4498, "mean_token_accuracy": 0.8457703590393066, "step": 3649 }, { "epoch": 2.484848484848485, "grad_norm": 1.5772273540496826, "learning_rate": 5.457779150020672e-06, "loss": 0.6489, "mean_token_accuracy": 0.7853945195674896, "step": 3650 }, { "epoch": 2.4855294518215865, "grad_norm": 1.8015602827072144, "learning_rate": 5.455535881356342e-06, "loss": 0.5502, "mean_token_accuracy": 0.7907316088676453, "step": 3651 }, { "epoch": 2.4862104187946885, "grad_norm": 1.8531016111373901, "learning_rate": 5.453292520225817e-06, "loss": 0.4703, "mean_token_accuracy": 0.832579493522644, "step": 3652 }, { "epoch": 2.48689138576779, "grad_norm": 1.6710478067398071, "learning_rate": 5.451049067084457e-06, "loss": 0.6144, "mean_token_accuracy": 0.7787199318408966, "step": 3653 }, { "epoch": 2.487572352740892, "grad_norm": 1.7410997152328491, "learning_rate": 5.4488055223876494e-06, "loss": 0.5543, "mean_token_accuracy": 0.7993090152740479, "step": 3654 }, { "epoch": 2.4882533197139938, "grad_norm": 1.7340024709701538, "learning_rate": 5.446561886590794e-06, "loss": 0.5516, "mean_token_accuracy": 0.8013689815998077, "step": 3655 }, { "epoch": 2.488934286687096, "grad_norm": 1.9550917148590088, "learning_rate": 5.4443181601493145e-06, "loss": 0.4464, "mean_token_accuracy": 0.846960574388504, "step": 3656 }, { "epoch": 2.4896152536601974, "grad_norm": 1.8084132671356201, "learning_rate": 5.4420743435186465e-06, "loss": 0.4775, "mean_token_accuracy": 0.8255265951156616, "step": 3657 }, { "epoch": 2.4902962206332995, "grad_norm": 1.775758981704712, "learning_rate": 5.439830437154249e-06, "loss": 0.4589, "mean_token_accuracy": 0.8356443345546722, "step": 3658 }, { "epoch": 2.490977187606401, "grad_norm": 1.6570348739624023, "learning_rate": 5.437586441511598e-06, "loss": 0.5658, "mean_token_accuracy": 0.8090629279613495, "step": 3659 }, { "epoch": 2.491658154579503, "grad_norm": 1.8847370147705078, "learning_rate": 5.4353423570461875e-06, "loss": 0.6092, "mean_token_accuracy": 0.7580398619174957, "step": 3660 }, { "epoch": 2.4923391215526047, "grad_norm": 1.7492262125015259, "learning_rate": 5.433098184213528e-06, "loss": 0.576, "mean_token_accuracy": 0.7887641191482544, "step": 3661 }, { "epoch": 2.4930200885257063, "grad_norm": 1.980277419090271, "learning_rate": 5.4308539234691485e-06, "loss": 0.3744, "mean_token_accuracy": 0.8758587837219238, "step": 3662 }, { "epoch": 2.4937010554988084, "grad_norm": 1.789885401725769, "learning_rate": 5.4286095752686e-06, "loss": 0.4405, "mean_token_accuracy": 0.8383717834949493, "step": 3663 }, { "epoch": 2.49438202247191, "grad_norm": 1.8620492219924927, "learning_rate": 5.426365140067445e-06, "loss": 0.5787, "mean_token_accuracy": 0.7754564881324768, "step": 3664 }, { "epoch": 2.495062989445012, "grad_norm": 1.9753268957138062, "learning_rate": 5.424120618321267e-06, "loss": 0.4588, "mean_token_accuracy": 0.8421938419342041, "step": 3665 }, { "epoch": 2.4957439564181136, "grad_norm": 1.9352283477783203, "learning_rate": 5.421876010485666e-06, "loss": 0.5702, "mean_token_accuracy": 0.7761302888393402, "step": 3666 }, { "epoch": 2.4964249233912157, "grad_norm": 1.7996115684509277, "learning_rate": 5.419631317016262e-06, "loss": 0.4188, "mean_token_accuracy": 0.8563787639141083, "step": 3667 }, { "epoch": 2.4971058903643173, "grad_norm": 1.9704170227050781, "learning_rate": 5.417386538368689e-06, "loss": 0.4312, "mean_token_accuracy": 0.8334885835647583, "step": 3668 }, { "epoch": 2.4977868573374193, "grad_norm": 1.6323210000991821, "learning_rate": 5.415141674998597e-06, "loss": 0.5516, "mean_token_accuracy": 0.8201780617237091, "step": 3669 }, { "epoch": 2.498467824310521, "grad_norm": 1.8526690006256104, "learning_rate": 5.412896727361663e-06, "loss": 0.4558, "mean_token_accuracy": 0.8333994150161743, "step": 3670 }, { "epoch": 2.4991487912836226, "grad_norm": 1.9259216785430908, "learning_rate": 5.410651695913568e-06, "loss": 0.4137, "mean_token_accuracy": 0.8584092557430267, "step": 3671 }, { "epoch": 2.4998297582567246, "grad_norm": 1.9456804990768433, "learning_rate": 5.408406581110016e-06, "loss": 0.4477, "mean_token_accuracy": 0.844681590795517, "step": 3672 }, { "epoch": 2.500510725229826, "grad_norm": 1.8963879346847534, "learning_rate": 5.40616138340673e-06, "loss": 0.5099, "mean_token_accuracy": 0.824531227350235, "step": 3673 }, { "epoch": 2.5011916922029283, "grad_norm": 1.8069339990615845, "learning_rate": 5.403916103259449e-06, "loss": 0.5563, "mean_token_accuracy": 0.8033181130886078, "step": 3674 }, { "epoch": 2.50187265917603, "grad_norm": 1.545018196105957, "learning_rate": 5.401670741123926e-06, "loss": 0.5358, "mean_token_accuracy": 0.8072474002838135, "step": 3675 }, { "epoch": 2.502553626149132, "grad_norm": 1.8335916996002197, "learning_rate": 5.39942529745593e-06, "loss": 0.462, "mean_token_accuracy": 0.8482854068279266, "step": 3676 }, { "epoch": 2.5032345931222335, "grad_norm": 1.8609930276870728, "learning_rate": 5.397179772711251e-06, "loss": 0.5415, "mean_token_accuracy": 0.8152963519096375, "step": 3677 }, { "epoch": 2.5039155600953356, "grad_norm": 1.813775897026062, "learning_rate": 5.394934167345693e-06, "loss": 0.4799, "mean_token_accuracy": 0.8300600945949554, "step": 3678 }, { "epoch": 2.504596527068437, "grad_norm": 1.7606698274612427, "learning_rate": 5.392688481815076e-06, "loss": 0.5499, "mean_token_accuracy": 0.8102218806743622, "step": 3679 }, { "epoch": 2.5052774940415388, "grad_norm": 1.7053468227386475, "learning_rate": 5.3904427165752375e-06, "loss": 0.6404, "mean_token_accuracy": 0.7692924439907074, "step": 3680 }, { "epoch": 2.505958461014641, "grad_norm": 1.5375772714614868, "learning_rate": 5.388196872082028e-06, "loss": 0.6437, "mean_token_accuracy": 0.7794827222824097, "step": 3681 }, { "epoch": 2.506639427987743, "grad_norm": 1.7938247919082642, "learning_rate": 5.385950948791322e-06, "loss": 0.5336, "mean_token_accuracy": 0.8368692696094513, "step": 3682 }, { "epoch": 2.5073203949608445, "grad_norm": 1.8348809480667114, "learning_rate": 5.383704947158998e-06, "loss": 0.4417, "mean_token_accuracy": 0.829406201839447, "step": 3683 }, { "epoch": 2.508001361933946, "grad_norm": 1.9632993936538696, "learning_rate": 5.381458867640961e-06, "loss": 0.4376, "mean_token_accuracy": 0.8403402864933014, "step": 3684 }, { "epoch": 2.508682328907048, "grad_norm": 1.8292417526245117, "learning_rate": 5.379212710693126e-06, "loss": 0.5588, "mean_token_accuracy": 0.7851592600345612, "step": 3685 }, { "epoch": 2.5093632958801497, "grad_norm": 1.6848725080490112, "learning_rate": 5.376966476771427e-06, "loss": 0.6436, "mean_token_accuracy": 0.7794948518276215, "step": 3686 }, { "epoch": 2.5100442628532518, "grad_norm": 1.8305721282958984, "learning_rate": 5.374720166331811e-06, "loss": 0.5389, "mean_token_accuracy": 0.7821758389472961, "step": 3687 }, { "epoch": 2.5107252298263534, "grad_norm": 1.8134502172470093, "learning_rate": 5.372473779830239e-06, "loss": 0.5572, "mean_token_accuracy": 0.8033475875854492, "step": 3688 }, { "epoch": 2.511406196799455, "grad_norm": 1.7098408937454224, "learning_rate": 5.370227317722697e-06, "loss": 0.6187, "mean_token_accuracy": 0.7734894156455994, "step": 3689 }, { "epoch": 2.512087163772557, "grad_norm": 1.6679129600524902, "learning_rate": 5.367980780465173e-06, "loss": 0.5641, "mean_token_accuracy": 0.7929785251617432, "step": 3690 }, { "epoch": 2.512768130745659, "grad_norm": 1.8149715662002563, "learning_rate": 5.3657341685136785e-06, "loss": 0.496, "mean_token_accuracy": 0.8074253499507904, "step": 3691 }, { "epoch": 2.5134490977187607, "grad_norm": 1.7425267696380615, "learning_rate": 5.363487482324239e-06, "loss": 0.555, "mean_token_accuracy": 0.7879433929920197, "step": 3692 }, { "epoch": 2.5141300646918623, "grad_norm": 1.780888557434082, "learning_rate": 5.361240722352895e-06, "loss": 0.4577, "mean_token_accuracy": 0.8394545316696167, "step": 3693 }, { "epoch": 2.5148110316649643, "grad_norm": 1.779781699180603, "learning_rate": 5.358993889055699e-06, "loss": 0.4001, "mean_token_accuracy": 0.862323135137558, "step": 3694 }, { "epoch": 2.515491998638066, "grad_norm": 1.7343096733093262, "learning_rate": 5.356746982888723e-06, "loss": 0.4298, "mean_token_accuracy": 0.8560982346534729, "step": 3695 }, { "epoch": 2.516172965611168, "grad_norm": 1.7030959129333496, "learning_rate": 5.354500004308051e-06, "loss": 0.5272, "mean_token_accuracy": 0.7992377579212189, "step": 3696 }, { "epoch": 2.5168539325842696, "grad_norm": 1.9384387731552124, "learning_rate": 5.352252953769782e-06, "loss": 0.4727, "mean_token_accuracy": 0.8221357762813568, "step": 3697 }, { "epoch": 2.517534899557371, "grad_norm": 1.6468069553375244, "learning_rate": 5.350005831730028e-06, "loss": 0.4744, "mean_token_accuracy": 0.8248433172702789, "step": 3698 }, { "epoch": 2.5182158665304732, "grad_norm": 1.9389901161193848, "learning_rate": 5.34775863864492e-06, "loss": 0.5113, "mean_token_accuracy": 0.8162802457809448, "step": 3699 }, { "epoch": 2.5188968335035753, "grad_norm": 1.972139835357666, "learning_rate": 5.345511374970601e-06, "loss": 0.4427, "mean_token_accuracy": 0.8489015698432922, "step": 3700 }, { "epoch": 2.519577800476677, "grad_norm": 1.7585726976394653, "learning_rate": 5.343264041163226e-06, "loss": 0.4978, "mean_token_accuracy": 0.8338052332401276, "step": 3701 }, { "epoch": 2.5202587674497785, "grad_norm": 1.8508027791976929, "learning_rate": 5.3410166376789675e-06, "loss": 0.4187, "mean_token_accuracy": 0.8544524013996124, "step": 3702 }, { "epoch": 2.5209397344228806, "grad_norm": 1.7726534605026245, "learning_rate": 5.338769164974009e-06, "loss": 0.4815, "mean_token_accuracy": 0.8122090399265289, "step": 3703 }, { "epoch": 2.521620701395982, "grad_norm": 1.9654409885406494, "learning_rate": 5.3365216235045545e-06, "loss": 0.4923, "mean_token_accuracy": 0.8290049135684967, "step": 3704 }, { "epoch": 2.522301668369084, "grad_norm": 1.931881070137024, "learning_rate": 5.334274013726813e-06, "loss": 0.4059, "mean_token_accuracy": 0.8546430766582489, "step": 3705 }, { "epoch": 2.522982635342186, "grad_norm": 1.7837551832199097, "learning_rate": 5.332026336097015e-06, "loss": 0.5363, "mean_token_accuracy": 0.7910828292369843, "step": 3706 }, { "epoch": 2.523663602315288, "grad_norm": 1.8904635906219482, "learning_rate": 5.329778591071399e-06, "loss": 0.4494, "mean_token_accuracy": 0.8016493022441864, "step": 3707 }, { "epoch": 2.5243445692883895, "grad_norm": 1.9276920557022095, "learning_rate": 5.327530779106222e-06, "loss": 0.5938, "mean_token_accuracy": 0.7701182663440704, "step": 3708 }, { "epoch": 2.5250255362614915, "grad_norm": 1.6684834957122803, "learning_rate": 5.3252829006577515e-06, "loss": 0.5663, "mean_token_accuracy": 0.7917463481426239, "step": 3709 }, { "epoch": 2.525706503234593, "grad_norm": 1.747624158859253, "learning_rate": 5.323034956182268e-06, "loss": 0.7166, "mean_token_accuracy": 0.7529706358909607, "step": 3710 }, { "epoch": 2.5263874702076947, "grad_norm": 1.786914587020874, "learning_rate": 5.32078694613607e-06, "loss": 0.4102, "mean_token_accuracy": 0.8634398877620697, "step": 3711 }, { "epoch": 2.5270684371807968, "grad_norm": 2.009204387664795, "learning_rate": 5.318538870975464e-06, "loss": 0.4141, "mean_token_accuracy": 0.8528569042682648, "step": 3712 }, { "epoch": 2.5277494041538984, "grad_norm": 1.9977306127548218, "learning_rate": 5.316290731156771e-06, "loss": 0.4811, "mean_token_accuracy": 0.8406555950641632, "step": 3713 }, { "epoch": 2.5284303711270004, "grad_norm": 1.7659919261932373, "learning_rate": 5.3140425271363275e-06, "loss": 0.5995, "mean_token_accuracy": 0.7720408737659454, "step": 3714 }, { "epoch": 2.529111338100102, "grad_norm": 1.8357495069503784, "learning_rate": 5.3117942593704805e-06, "loss": 0.5131, "mean_token_accuracy": 0.8160626888275146, "step": 3715 }, { "epoch": 2.529792305073204, "grad_norm": 1.7568553686141968, "learning_rate": 5.309545928315593e-06, "loss": 0.5312, "mean_token_accuracy": 0.8111161887645721, "step": 3716 }, { "epoch": 2.5304732720463057, "grad_norm": 1.7872637510299683, "learning_rate": 5.307297534428035e-06, "loss": 0.493, "mean_token_accuracy": 0.8137012422084808, "step": 3717 }, { "epoch": 2.5311542390194077, "grad_norm": 1.7870378494262695, "learning_rate": 5.305049078164196e-06, "loss": 0.5005, "mean_token_accuracy": 0.8315429985523224, "step": 3718 }, { "epoch": 2.5318352059925093, "grad_norm": 1.7422162294387817, "learning_rate": 5.302800559980476e-06, "loss": 0.4811, "mean_token_accuracy": 0.8168706893920898, "step": 3719 }, { "epoch": 2.532516172965611, "grad_norm": 1.7696478366851807, "learning_rate": 5.3005519803332825e-06, "loss": 0.4433, "mean_token_accuracy": 0.8152427971363068, "step": 3720 }, { "epoch": 2.533197139938713, "grad_norm": 1.873904824256897, "learning_rate": 5.298303339679044e-06, "loss": 0.3582, "mean_token_accuracy": 0.8733398020267487, "step": 3721 }, { "epoch": 2.533878106911815, "grad_norm": 1.7252622842788696, "learning_rate": 5.296054638474194e-06, "loss": 0.4917, "mean_token_accuracy": 0.8208313584327698, "step": 3722 }, { "epoch": 2.5345590738849166, "grad_norm": 1.9647434949874878, "learning_rate": 5.293805877175184e-06, "loss": 0.5939, "mean_token_accuracy": 0.7955995202064514, "step": 3723 }, { "epoch": 2.5352400408580182, "grad_norm": 1.7898879051208496, "learning_rate": 5.291557056238474e-06, "loss": 0.4311, "mean_token_accuracy": 0.8398360908031464, "step": 3724 }, { "epoch": 2.5359210078311203, "grad_norm": 1.7020509243011475, "learning_rate": 5.2893081761205365e-06, "loss": 0.6276, "mean_token_accuracy": 0.7812789380550385, "step": 3725 }, { "epoch": 2.536601974804222, "grad_norm": 1.8563992977142334, "learning_rate": 5.287059237277858e-06, "loss": 0.497, "mean_token_accuracy": 0.819608062505722, "step": 3726 }, { "epoch": 2.537282941777324, "grad_norm": 1.8626549243927002, "learning_rate": 5.2848102401669355e-06, "loss": 0.4299, "mean_token_accuracy": 0.8385472893714905, "step": 3727 }, { "epoch": 2.5379639087504255, "grad_norm": 1.7972300052642822, "learning_rate": 5.282561185244276e-06, "loss": 0.5253, "mean_token_accuracy": 0.8224315643310547, "step": 3728 }, { "epoch": 2.538644875723527, "grad_norm": 1.7767047882080078, "learning_rate": 5.280312072966402e-06, "loss": 0.5641, "mean_token_accuracy": 0.806415468454361, "step": 3729 }, { "epoch": 2.539325842696629, "grad_norm": 1.7804197072982788, "learning_rate": 5.278062903789846e-06, "loss": 0.5036, "mean_token_accuracy": 0.8112309575080872, "step": 3730 }, { "epoch": 2.5400068096697312, "grad_norm": 1.9418033361434937, "learning_rate": 5.27581367817115e-06, "loss": 0.4288, "mean_token_accuracy": 0.8503353297710419, "step": 3731 }, { "epoch": 2.540687776642833, "grad_norm": 1.8115503787994385, "learning_rate": 5.2735643965668715e-06, "loss": 0.4857, "mean_token_accuracy": 0.843194991350174, "step": 3732 }, { "epoch": 2.5413687436159345, "grad_norm": 1.9189836978912354, "learning_rate": 5.271315059433576e-06, "loss": 0.4307, "mean_token_accuracy": 0.8516194522380829, "step": 3733 }, { "epoch": 2.5420497105890365, "grad_norm": 1.723480463027954, "learning_rate": 5.269065667227843e-06, "loss": 0.5862, "mean_token_accuracy": 0.7912851274013519, "step": 3734 }, { "epoch": 2.542730677562138, "grad_norm": 1.8556225299835205, "learning_rate": 5.266816220406259e-06, "loss": 0.5002, "mean_token_accuracy": 0.8238620758056641, "step": 3735 }, { "epoch": 2.54341164453524, "grad_norm": 1.8093342781066895, "learning_rate": 5.264566719425427e-06, "loss": 0.4776, "mean_token_accuracy": 0.8154422342777252, "step": 3736 }, { "epoch": 2.5440926115083418, "grad_norm": 1.842893362045288, "learning_rate": 5.262317164741957e-06, "loss": 0.3962, "mean_token_accuracy": 0.8649167716503143, "step": 3737 }, { "epoch": 2.544773578481444, "grad_norm": 1.5492671728134155, "learning_rate": 5.260067556812472e-06, "loss": 0.7383, "mean_token_accuracy": 0.7336495518684387, "step": 3738 }, { "epoch": 2.5454545454545454, "grad_norm": 1.7893553972244263, "learning_rate": 5.257817896093601e-06, "loss": 0.4829, "mean_token_accuracy": 0.8371506333351135, "step": 3739 }, { "epoch": 2.5461355124276475, "grad_norm": 1.7139365673065186, "learning_rate": 5.255568183041992e-06, "loss": 0.6073, "mean_token_accuracy": 0.7913106679916382, "step": 3740 }, { "epoch": 2.546816479400749, "grad_norm": 1.8993875980377197, "learning_rate": 5.253318418114299e-06, "loss": 0.4584, "mean_token_accuracy": 0.8400224447250366, "step": 3741 }, { "epoch": 2.5474974463738507, "grad_norm": 1.8541043996810913, "learning_rate": 5.251068601767186e-06, "loss": 0.3884, "mean_token_accuracy": 0.8549625873565674, "step": 3742 }, { "epoch": 2.5481784133469527, "grad_norm": 1.6404716968536377, "learning_rate": 5.2488187344573275e-06, "loss": 0.5715, "mean_token_accuracy": 0.7807170748710632, "step": 3743 }, { "epoch": 2.5488593803200543, "grad_norm": 1.8086215257644653, "learning_rate": 5.246568816641408e-06, "loss": 0.5179, "mean_token_accuracy": 0.8089957237243652, "step": 3744 }, { "epoch": 2.5495403472931564, "grad_norm": 1.8457177877426147, "learning_rate": 5.2443188487761275e-06, "loss": 0.4858, "mean_token_accuracy": 0.8240674436092377, "step": 3745 }, { "epoch": 2.550221314266258, "grad_norm": 1.7817306518554688, "learning_rate": 5.242068831318189e-06, "loss": 0.3697, "mean_token_accuracy": 0.8698756396770477, "step": 3746 }, { "epoch": 2.55090228123936, "grad_norm": 1.828357219696045, "learning_rate": 5.239818764724309e-06, "loss": 0.4865, "mean_token_accuracy": 0.8251413404941559, "step": 3747 }, { "epoch": 2.5515832482124616, "grad_norm": 1.7464619874954224, "learning_rate": 5.237568649451213e-06, "loss": 0.6026, "mean_token_accuracy": 0.7962621450424194, "step": 3748 }, { "epoch": 2.5522642151855637, "grad_norm": 1.8154178857803345, "learning_rate": 5.235318485955638e-06, "loss": 0.4318, "mean_token_accuracy": 0.8363227248191833, "step": 3749 }, { "epoch": 2.5529451821586653, "grad_norm": 1.834807276725769, "learning_rate": 5.23306827469433e-06, "loss": 0.5524, "mean_token_accuracy": 0.810005396604538, "step": 3750 }, { "epoch": 2.553626149131767, "grad_norm": 1.7527662515640259, "learning_rate": 5.230818016124042e-06, "loss": 0.5746, "mean_token_accuracy": 0.8041817843914032, "step": 3751 }, { "epoch": 2.554307116104869, "grad_norm": 1.9308918714523315, "learning_rate": 5.228567710701542e-06, "loss": 0.5709, "mean_token_accuracy": 0.7910760343074799, "step": 3752 }, { "epoch": 2.554988083077971, "grad_norm": 1.8937721252441406, "learning_rate": 5.2263173588836035e-06, "loss": 0.5293, "mean_token_accuracy": 0.81471848487854, "step": 3753 }, { "epoch": 2.5556690500510726, "grad_norm": 1.8653521537780762, "learning_rate": 5.2240669611270105e-06, "loss": 0.4382, "mean_token_accuracy": 0.8199248909950256, "step": 3754 }, { "epoch": 2.556350017024174, "grad_norm": 1.7799698114395142, "learning_rate": 5.221816517888554e-06, "loss": 0.529, "mean_token_accuracy": 0.8217410743236542, "step": 3755 }, { "epoch": 2.5570309839972762, "grad_norm": 2.028961658477783, "learning_rate": 5.21956602962504e-06, "loss": 0.426, "mean_token_accuracy": 0.8569370806217194, "step": 3756 }, { "epoch": 2.557711950970378, "grad_norm": 1.8152440786361694, "learning_rate": 5.21731549679328e-06, "loss": 0.524, "mean_token_accuracy": 0.8247220814228058, "step": 3757 }, { "epoch": 2.55839291794348, "grad_norm": 2.008720874786377, "learning_rate": 5.2150649198500905e-06, "loss": 0.4076, "mean_token_accuracy": 0.8648543357849121, "step": 3758 }, { "epoch": 2.5590738849165815, "grad_norm": 1.8609572649002075, "learning_rate": 5.212814299252304e-06, "loss": 0.3901, "mean_token_accuracy": 0.8443043828010559, "step": 3759 }, { "epoch": 2.559754851889683, "grad_norm": 1.784136176109314, "learning_rate": 5.21056363545676e-06, "loss": 0.4321, "mean_token_accuracy": 0.8257769346237183, "step": 3760 }, { "epoch": 2.560435818862785, "grad_norm": 1.8591113090515137, "learning_rate": 5.208312928920304e-06, "loss": 0.346, "mean_token_accuracy": 0.8821823000907898, "step": 3761 }, { "epoch": 2.561116785835887, "grad_norm": 1.6102843284606934, "learning_rate": 5.206062180099794e-06, "loss": 0.7339, "mean_token_accuracy": 0.7442496716976166, "step": 3762 }, { "epoch": 2.561797752808989, "grad_norm": 1.9670847654342651, "learning_rate": 5.203811389452092e-06, "loss": 0.5334, "mean_token_accuracy": 0.820659875869751, "step": 3763 }, { "epoch": 2.5624787197820904, "grad_norm": 1.964630126953125, "learning_rate": 5.201560557434073e-06, "loss": 0.4545, "mean_token_accuracy": 0.8445968925952911, "step": 3764 }, { "epoch": 2.5631596867551925, "grad_norm": 1.7107411623001099, "learning_rate": 5.199309684502616e-06, "loss": 0.5744, "mean_token_accuracy": 0.812658965587616, "step": 3765 }, { "epoch": 2.563840653728294, "grad_norm": 1.9026087522506714, "learning_rate": 5.197058771114614e-06, "loss": 0.3473, "mean_token_accuracy": 0.8819602429866791, "step": 3766 }, { "epoch": 2.564521620701396, "grad_norm": 1.6634140014648438, "learning_rate": 5.194807817726962e-06, "loss": 0.5345, "mean_token_accuracy": 0.8058277666568756, "step": 3767 }, { "epoch": 2.5652025876744977, "grad_norm": 1.7182507514953613, "learning_rate": 5.1925568247965686e-06, "loss": 0.5231, "mean_token_accuracy": 0.7897914350032806, "step": 3768 }, { "epoch": 2.5658835546475993, "grad_norm": 1.7184697389602661, "learning_rate": 5.190305792780346e-06, "loss": 0.4385, "mean_token_accuracy": 0.8395944535732269, "step": 3769 }, { "epoch": 2.5665645216207014, "grad_norm": 1.821163535118103, "learning_rate": 5.188054722135216e-06, "loss": 0.4635, "mean_token_accuracy": 0.8299150764942169, "step": 3770 }, { "epoch": 2.5672454885938034, "grad_norm": 1.7417821884155273, "learning_rate": 5.18580361331811e-06, "loss": 0.6182, "mean_token_accuracy": 0.7757788002490997, "step": 3771 }, { "epoch": 2.567926455566905, "grad_norm": 1.6814371347427368, "learning_rate": 5.183552466785966e-06, "loss": 0.6309, "mean_token_accuracy": 0.7966594994068146, "step": 3772 }, { "epoch": 2.5686074225400066, "grad_norm": 1.9316405057907104, "learning_rate": 5.181301282995725e-06, "loss": 0.5144, "mean_token_accuracy": 0.8112457692623138, "step": 3773 }, { "epoch": 2.5692883895131087, "grad_norm": 1.689530849456787, "learning_rate": 5.179050062404345e-06, "loss": 0.5676, "mean_token_accuracy": 0.7647641599178314, "step": 3774 }, { "epoch": 2.5699693564862103, "grad_norm": 1.8398023843765259, "learning_rate": 5.176798805468784e-06, "loss": 0.5887, "mean_token_accuracy": 0.7957122325897217, "step": 3775 }, { "epoch": 2.5706503234593123, "grad_norm": 1.8037439584732056, "learning_rate": 5.17454751264601e-06, "loss": 0.4872, "mean_token_accuracy": 0.8257708549499512, "step": 3776 }, { "epoch": 2.571331290432414, "grad_norm": 1.8277889490127563, "learning_rate": 5.172296184392997e-06, "loss": 0.4358, "mean_token_accuracy": 0.8508167266845703, "step": 3777 }, { "epoch": 2.572012257405516, "grad_norm": 1.816701889038086, "learning_rate": 5.170044821166729e-06, "loss": 0.5042, "mean_token_accuracy": 0.8275014162063599, "step": 3778 }, { "epoch": 2.5726932243786176, "grad_norm": 1.6226404905319214, "learning_rate": 5.167793423424194e-06, "loss": 0.703, "mean_token_accuracy": 0.7918167412281036, "step": 3779 }, { "epoch": 2.5733741913517196, "grad_norm": 1.8051317930221558, "learning_rate": 5.165541991622388e-06, "loss": 0.4922, "mean_token_accuracy": 0.8343258798122406, "step": 3780 }, { "epoch": 2.5740551583248212, "grad_norm": 1.575480580329895, "learning_rate": 5.163290526218314e-06, "loss": 0.4934, "mean_token_accuracy": 0.8225412964820862, "step": 3781 }, { "epoch": 2.574736125297923, "grad_norm": 1.8530303239822388, "learning_rate": 5.161039027668983e-06, "loss": 0.4754, "mean_token_accuracy": 0.831265926361084, "step": 3782 }, { "epoch": 2.575417092271025, "grad_norm": 1.6717123985290527, "learning_rate": 5.158787496431413e-06, "loss": 0.5126, "mean_token_accuracy": 0.8242722451686859, "step": 3783 }, { "epoch": 2.576098059244127, "grad_norm": 1.7900068759918213, "learning_rate": 5.156535932962624e-06, "loss": 0.5288, "mean_token_accuracy": 0.8154428005218506, "step": 3784 }, { "epoch": 2.5767790262172285, "grad_norm": 2.0941405296325684, "learning_rate": 5.154284337719647e-06, "loss": 0.4281, "mean_token_accuracy": 0.8588894903659821, "step": 3785 }, { "epoch": 2.57745999319033, "grad_norm": 1.8988195657730103, "learning_rate": 5.152032711159521e-06, "loss": 0.5243, "mean_token_accuracy": 0.8176229596138, "step": 3786 }, { "epoch": 2.578140960163432, "grad_norm": 1.682064175605774, "learning_rate": 5.1497810537392844e-06, "loss": 0.5115, "mean_token_accuracy": 0.7930424511432648, "step": 3787 }, { "epoch": 2.578821927136534, "grad_norm": 1.6522043943405151, "learning_rate": 5.14752936591599e-06, "loss": 0.6649, "mean_token_accuracy": 0.7590160369873047, "step": 3788 }, { "epoch": 2.579502894109636, "grad_norm": 1.729137897491455, "learning_rate": 5.145277648146689e-06, "loss": 0.6219, "mean_token_accuracy": 0.7593075633049011, "step": 3789 }, { "epoch": 2.5801838610827375, "grad_norm": 1.7585145235061646, "learning_rate": 5.143025900888448e-06, "loss": 0.4434, "mean_token_accuracy": 0.8530014157295227, "step": 3790 }, { "epoch": 2.580864828055839, "grad_norm": 1.7914438247680664, "learning_rate": 5.140774124598328e-06, "loss": 0.3923, "mean_token_accuracy": 0.8551012575626373, "step": 3791 }, { "epoch": 2.581545795028941, "grad_norm": 1.5299654006958008, "learning_rate": 5.138522319733405e-06, "loss": 0.6004, "mean_token_accuracy": 0.7813750207424164, "step": 3792 }, { "epoch": 2.582226762002043, "grad_norm": 1.8156172037124634, "learning_rate": 5.136270486750761e-06, "loss": 0.6291, "mean_token_accuracy": 0.7771827280521393, "step": 3793 }, { "epoch": 2.5829077289751448, "grad_norm": 1.8845579624176025, "learning_rate": 5.1340186261074765e-06, "loss": 0.5164, "mean_token_accuracy": 0.8105901181697845, "step": 3794 }, { "epoch": 2.5835886959482464, "grad_norm": 1.9366589784622192, "learning_rate": 5.131766738260641e-06, "loss": 0.5476, "mean_token_accuracy": 0.8066576719284058, "step": 3795 }, { "epoch": 2.5842696629213484, "grad_norm": 1.7869882583618164, "learning_rate": 5.129514823667353e-06, "loss": 0.506, "mean_token_accuracy": 0.8311807811260223, "step": 3796 }, { "epoch": 2.58495062989445, "grad_norm": 1.8559455871582031, "learning_rate": 5.127262882784712e-06, "loss": 0.5227, "mean_token_accuracy": 0.8256265223026276, "step": 3797 }, { "epoch": 2.585631596867552, "grad_norm": 1.878161072731018, "learning_rate": 5.125010916069829e-06, "loss": 0.47, "mean_token_accuracy": 0.8391354978084564, "step": 3798 }, { "epoch": 2.5863125638406537, "grad_norm": 1.7781606912612915, "learning_rate": 5.122758923979809e-06, "loss": 0.4737, "mean_token_accuracy": 0.8376010358333588, "step": 3799 }, { "epoch": 2.5869935308137553, "grad_norm": 1.8492070436477661, "learning_rate": 5.120506906971772e-06, "loss": 0.5599, "mean_token_accuracy": 0.797383576631546, "step": 3800 }, { "epoch": 2.5876744977868573, "grad_norm": 1.7594187259674072, "learning_rate": 5.11825486550284e-06, "loss": 0.4666, "mean_token_accuracy": 0.8195065557956696, "step": 3801 }, { "epoch": 2.5883554647599594, "grad_norm": 1.7464426755905151, "learning_rate": 5.116002800030139e-06, "loss": 0.5358, "mean_token_accuracy": 0.8073053956031799, "step": 3802 }, { "epoch": 2.589036431733061, "grad_norm": 1.9759453535079956, "learning_rate": 5.113750711010803e-06, "loss": 0.4157, "mean_token_accuracy": 0.8489798307418823, "step": 3803 }, { "epoch": 2.5897173987061626, "grad_norm": 1.7486155033111572, "learning_rate": 5.111498598901965e-06, "loss": 0.5134, "mean_token_accuracy": 0.8202922344207764, "step": 3804 }, { "epoch": 2.5903983656792646, "grad_norm": 1.8682745695114136, "learning_rate": 5.1092464641607695e-06, "loss": 0.4683, "mean_token_accuracy": 0.831298440694809, "step": 3805 }, { "epoch": 2.5910793326523662, "grad_norm": 1.7813259363174438, "learning_rate": 5.106994307244361e-06, "loss": 0.5368, "mean_token_accuracy": 0.8067423403263092, "step": 3806 }, { "epoch": 2.5917602996254683, "grad_norm": 1.787564754486084, "learning_rate": 5.10474212860989e-06, "loss": 0.561, "mean_token_accuracy": 0.8171505630016327, "step": 3807 }, { "epoch": 2.59244126659857, "grad_norm": 2.0085160732269287, "learning_rate": 5.102489928714511e-06, "loss": 0.3813, "mean_token_accuracy": 0.8758067190647125, "step": 3808 }, { "epoch": 2.593122233571672, "grad_norm": 1.7846393585205078, "learning_rate": 5.1002377080153845e-06, "loss": 0.533, "mean_token_accuracy": 0.8010200560092926, "step": 3809 }, { "epoch": 2.5938032005447735, "grad_norm": 1.8566921949386597, "learning_rate": 5.097985466969671e-06, "loss": 0.4822, "mean_token_accuracy": 0.8179301023483276, "step": 3810 }, { "epoch": 2.5944841675178756, "grad_norm": 1.8147797584533691, "learning_rate": 5.09573320603454e-06, "loss": 0.545, "mean_token_accuracy": 0.7862045466899872, "step": 3811 }, { "epoch": 2.595165134490977, "grad_norm": 1.899457335472107, "learning_rate": 5.093480925667163e-06, "loss": 0.4276, "mean_token_accuracy": 0.8612910211086273, "step": 3812 }, { "epoch": 2.595846101464079, "grad_norm": 1.956411361694336, "learning_rate": 5.091228626324716e-06, "loss": 0.5116, "mean_token_accuracy": 0.8288843333721161, "step": 3813 }, { "epoch": 2.596527068437181, "grad_norm": 1.7981034517288208, "learning_rate": 5.088976308464375e-06, "loss": 0.5785, "mean_token_accuracy": 0.7939584851264954, "step": 3814 }, { "epoch": 2.5972080354102824, "grad_norm": 1.7693982124328613, "learning_rate": 5.086723972543325e-06, "loss": 0.6478, "mean_token_accuracy": 0.7567442357540131, "step": 3815 }, { "epoch": 2.5978890023833845, "grad_norm": 1.6754764318466187, "learning_rate": 5.084471619018755e-06, "loss": 0.5163, "mean_token_accuracy": 0.8143523633480072, "step": 3816 }, { "epoch": 2.598569969356486, "grad_norm": 1.812422275543213, "learning_rate": 5.082219248347851e-06, "loss": 0.3812, "mean_token_accuracy": 0.8736593723297119, "step": 3817 }, { "epoch": 2.599250936329588, "grad_norm": 1.6040598154067993, "learning_rate": 5.079966860987811e-06, "loss": 0.6777, "mean_token_accuracy": 0.7726484537124634, "step": 3818 }, { "epoch": 2.5999319033026898, "grad_norm": 1.913385033607483, "learning_rate": 5.077714457395828e-06, "loss": 0.419, "mean_token_accuracy": 0.8554151058197021, "step": 3819 }, { "epoch": 2.600612870275792, "grad_norm": 1.8141378164291382, "learning_rate": 5.075462038029105e-06, "loss": 0.4789, "mean_token_accuracy": 0.8451694846153259, "step": 3820 }, { "epoch": 2.6012938372488934, "grad_norm": 1.8010969161987305, "learning_rate": 5.073209603344845e-06, "loss": 0.4474, "mean_token_accuracy": 0.8434492349624634, "step": 3821 }, { "epoch": 2.601974804221995, "grad_norm": 1.7837097644805908, "learning_rate": 5.070957153800255e-06, "loss": 0.4828, "mean_token_accuracy": 0.8365462720394135, "step": 3822 }, { "epoch": 2.602655771195097, "grad_norm": 1.7893743515014648, "learning_rate": 5.0687046898525436e-06, "loss": 0.5258, "mean_token_accuracy": 0.8053061962127686, "step": 3823 }, { "epoch": 2.603336738168199, "grad_norm": 1.8054605722427368, "learning_rate": 5.066452211958927e-06, "loss": 0.4432, "mean_token_accuracy": 0.8351310193538666, "step": 3824 }, { "epoch": 2.6040177051413007, "grad_norm": 1.706113338470459, "learning_rate": 5.064199720576615e-06, "loss": 0.5662, "mean_token_accuracy": 0.7752610445022583, "step": 3825 }, { "epoch": 2.6046986721144023, "grad_norm": 1.703467845916748, "learning_rate": 5.061947216162829e-06, "loss": 0.574, "mean_token_accuracy": 0.7957549393177032, "step": 3826 }, { "epoch": 2.6053796390875044, "grad_norm": 1.8502651453018188, "learning_rate": 5.059694699174791e-06, "loss": 0.5488, "mean_token_accuracy": 0.8040269911289215, "step": 3827 }, { "epoch": 2.606060606060606, "grad_norm": 1.9133244752883911, "learning_rate": 5.0574421700697256e-06, "loss": 0.4469, "mean_token_accuracy": 0.8407385647296906, "step": 3828 }, { "epoch": 2.606741573033708, "grad_norm": 1.7903739213943481, "learning_rate": 5.055189629304852e-06, "loss": 0.4873, "mean_token_accuracy": 0.8078927397727966, "step": 3829 }, { "epoch": 2.6074225400068096, "grad_norm": 1.8281365633010864, "learning_rate": 5.052937077337405e-06, "loss": 0.5301, "mean_token_accuracy": 0.8091269731521606, "step": 3830 }, { "epoch": 2.6081035069799112, "grad_norm": 1.843822717666626, "learning_rate": 5.0506845146246115e-06, "loss": 0.5083, "mean_token_accuracy": 0.8129550218582153, "step": 3831 }, { "epoch": 2.6087844739530133, "grad_norm": 1.6419953107833862, "learning_rate": 5.048431941623707e-06, "loss": 0.6729, "mean_token_accuracy": 0.7748821377754211, "step": 3832 }, { "epoch": 2.6094654409261153, "grad_norm": 1.9090790748596191, "learning_rate": 5.046179358791924e-06, "loss": 0.3661, "mean_token_accuracy": 0.8776244223117828, "step": 3833 }, { "epoch": 2.610146407899217, "grad_norm": 1.847901701927185, "learning_rate": 5.043926766586501e-06, "loss": 0.4879, "mean_token_accuracy": 0.8292578160762787, "step": 3834 }, { "epoch": 2.6108273748723185, "grad_norm": 1.6932260990142822, "learning_rate": 5.041674165464676e-06, "loss": 0.5527, "mean_token_accuracy": 0.8104543685913086, "step": 3835 }, { "epoch": 2.6115083418454206, "grad_norm": 1.7256951332092285, "learning_rate": 5.039421555883689e-06, "loss": 0.5662, "mean_token_accuracy": 0.8147152066230774, "step": 3836 }, { "epoch": 2.612189308818522, "grad_norm": 1.8377188444137573, "learning_rate": 5.037168938300782e-06, "loss": 0.4152, "mean_token_accuracy": 0.8119881749153137, "step": 3837 }, { "epoch": 2.6128702757916242, "grad_norm": 1.7437400817871094, "learning_rate": 5.0349163131732e-06, "loss": 0.5458, "mean_token_accuracy": 0.7762071192264557, "step": 3838 }, { "epoch": 2.613551242764726, "grad_norm": 1.8762437105178833, "learning_rate": 5.03266368095819e-06, "loss": 0.3817, "mean_token_accuracy": 0.8710637986660004, "step": 3839 }, { "epoch": 2.6142322097378274, "grad_norm": 1.9395854473114014, "learning_rate": 5.0304110421129935e-06, "loss": 0.4908, "mean_token_accuracy": 0.8348848521709442, "step": 3840 }, { "epoch": 2.6149131767109295, "grad_norm": 1.8203136920928955, "learning_rate": 5.028158397094864e-06, "loss": 0.5389, "mean_token_accuracy": 0.8170152306556702, "step": 3841 }, { "epoch": 2.6155941436840315, "grad_norm": 1.816740870475769, "learning_rate": 5.025905746361047e-06, "loss": 0.7129, "mean_token_accuracy": 0.7728090286254883, "step": 3842 }, { "epoch": 2.616275110657133, "grad_norm": 1.8606075048446655, "learning_rate": 5.023653090368796e-06, "loss": 0.4935, "mean_token_accuracy": 0.8186349272727966, "step": 3843 }, { "epoch": 2.6169560776302347, "grad_norm": 1.6907193660736084, "learning_rate": 5.021400429575363e-06, "loss": 0.5633, "mean_token_accuracy": 0.7966488897800446, "step": 3844 }, { "epoch": 2.617637044603337, "grad_norm": 1.8717435598373413, "learning_rate": 5.019147764437997e-06, "loss": 0.4634, "mean_token_accuracy": 0.8397066295146942, "step": 3845 }, { "epoch": 2.6183180115764384, "grad_norm": 1.6948333978652954, "learning_rate": 5.016895095413955e-06, "loss": 0.5272, "mean_token_accuracy": 0.8144732117652893, "step": 3846 }, { "epoch": 2.6189989785495404, "grad_norm": 1.8234837055206299, "learning_rate": 5.01464242296049e-06, "loss": 0.444, "mean_token_accuracy": 0.8448449075222015, "step": 3847 }, { "epoch": 2.619679945522642, "grad_norm": 1.9839900732040405, "learning_rate": 5.012389747534856e-06, "loss": 0.4458, "mean_token_accuracy": 0.8250021636486053, "step": 3848 }, { "epoch": 2.620360912495744, "grad_norm": 1.8771488666534424, "learning_rate": 5.010137069594311e-06, "loss": 0.4547, "mean_token_accuracy": 0.8470446467399597, "step": 3849 }, { "epoch": 2.6210418794688457, "grad_norm": 1.7900296449661255, "learning_rate": 5.007884389596108e-06, "loss": 0.4576, "mean_token_accuracy": 0.8327911794185638, "step": 3850 }, { "epoch": 2.6217228464419478, "grad_norm": 1.8055541515350342, "learning_rate": 5.005631707997506e-06, "loss": 0.5403, "mean_token_accuracy": 0.8051477372646332, "step": 3851 }, { "epoch": 2.6224038134150494, "grad_norm": 1.7194080352783203, "learning_rate": 5.0033790252557615e-06, "loss": 0.4728, "mean_token_accuracy": 0.8117278814315796, "step": 3852 }, { "epoch": 2.623084780388151, "grad_norm": 1.8057985305786133, "learning_rate": 5.00112634182813e-06, "loss": 0.5741, "mean_token_accuracy": 0.7894122898578644, "step": 3853 }, { "epoch": 2.623765747361253, "grad_norm": 1.8581061363220215, "learning_rate": 4.99887365817187e-06, "loss": 0.504, "mean_token_accuracy": 0.813201516866684, "step": 3854 }, { "epoch": 2.624446714334355, "grad_norm": 1.786885142326355, "learning_rate": 4.996620974744241e-06, "loss": 0.4432, "mean_token_accuracy": 0.85154989361763, "step": 3855 }, { "epoch": 2.6251276813074567, "grad_norm": 1.9190343618392944, "learning_rate": 4.994368292002494e-06, "loss": 0.5177, "mean_token_accuracy": 0.8226475119590759, "step": 3856 }, { "epoch": 2.6258086482805583, "grad_norm": 1.7007496356964111, "learning_rate": 4.992115610403893e-06, "loss": 0.517, "mean_token_accuracy": 0.8308874070644379, "step": 3857 }, { "epoch": 2.6264896152536603, "grad_norm": 1.9748460054397583, "learning_rate": 4.989862930405693e-06, "loss": 0.4326, "mean_token_accuracy": 0.8533489108085632, "step": 3858 }, { "epoch": 2.627170582226762, "grad_norm": 2.036576509475708, "learning_rate": 4.9876102524651446e-06, "loss": 0.429, "mean_token_accuracy": 0.8527028262615204, "step": 3859 }, { "epoch": 2.627851549199864, "grad_norm": 1.9370096921920776, "learning_rate": 4.985357577039512e-06, "loss": 0.4895, "mean_token_accuracy": 0.8410833477973938, "step": 3860 }, { "epoch": 2.6285325161729656, "grad_norm": 1.9394879341125488, "learning_rate": 4.983104904586046e-06, "loss": 0.5221, "mean_token_accuracy": 0.8235369920730591, "step": 3861 }, { "epoch": 2.629213483146067, "grad_norm": 1.8300940990447998, "learning_rate": 4.980852235562004e-06, "loss": 0.465, "mean_token_accuracy": 0.8300965428352356, "step": 3862 }, { "epoch": 2.6298944501191692, "grad_norm": 1.787372350692749, "learning_rate": 4.97859957042464e-06, "loss": 0.5569, "mean_token_accuracy": 0.7998397350311279, "step": 3863 }, { "epoch": 2.6305754170922713, "grad_norm": 1.883310317993164, "learning_rate": 4.976346909631204e-06, "loss": 0.4859, "mean_token_accuracy": 0.828083872795105, "step": 3864 }, { "epoch": 2.631256384065373, "grad_norm": 1.655369520187378, "learning_rate": 4.974094253638954e-06, "loss": 0.6744, "mean_token_accuracy": 0.7762046754360199, "step": 3865 }, { "epoch": 2.6319373510384745, "grad_norm": 1.816660761833191, "learning_rate": 4.971841602905137e-06, "loss": 0.5328, "mean_token_accuracy": 0.8231110274791718, "step": 3866 }, { "epoch": 2.6326183180115765, "grad_norm": 1.8082754611968994, "learning_rate": 4.969588957887007e-06, "loss": 0.505, "mean_token_accuracy": 0.7937084436416626, "step": 3867 }, { "epoch": 2.633299284984678, "grad_norm": 1.8027045726776123, "learning_rate": 4.967336319041813e-06, "loss": 0.5163, "mean_token_accuracy": 0.8264971673488617, "step": 3868 }, { "epoch": 2.63398025195778, "grad_norm": 1.769935131072998, "learning_rate": 4.9650836868268e-06, "loss": 0.4891, "mean_token_accuracy": 0.8175025582313538, "step": 3869 }, { "epoch": 2.634661218930882, "grad_norm": 1.791712999343872, "learning_rate": 4.962831061699219e-06, "loss": 0.4688, "mean_token_accuracy": 0.8414970338344574, "step": 3870 }, { "epoch": 2.6353421859039834, "grad_norm": 1.7833095788955688, "learning_rate": 4.960578444116311e-06, "loss": 0.3552, "mean_token_accuracy": 0.884288102388382, "step": 3871 }, { "epoch": 2.6360231528770854, "grad_norm": 1.7765542268753052, "learning_rate": 4.958325834535326e-06, "loss": 0.6144, "mean_token_accuracy": 0.759204626083374, "step": 3872 }, { "epoch": 2.6367041198501875, "grad_norm": 1.8112907409667969, "learning_rate": 4.956073233413502e-06, "loss": 0.346, "mean_token_accuracy": 0.8836810886859894, "step": 3873 }, { "epoch": 2.637385086823289, "grad_norm": 1.882103681564331, "learning_rate": 4.953820641208077e-06, "loss": 0.5094, "mean_token_accuracy": 0.8243763148784637, "step": 3874 }, { "epoch": 2.6380660537963907, "grad_norm": 1.9127395153045654, "learning_rate": 4.9515680583762945e-06, "loss": 0.5371, "mean_token_accuracy": 0.8011230230331421, "step": 3875 }, { "epoch": 2.6387470207694927, "grad_norm": 1.7805240154266357, "learning_rate": 4.949315485375389e-06, "loss": 0.6052, "mean_token_accuracy": 0.7798895835876465, "step": 3876 }, { "epoch": 2.6394279877425944, "grad_norm": 1.6574201583862305, "learning_rate": 4.947062922662597e-06, "loss": 0.5866, "mean_token_accuracy": 0.7967601716518402, "step": 3877 }, { "epoch": 2.6401089547156964, "grad_norm": 1.942797303199768, "learning_rate": 4.94481037069515e-06, "loss": 0.365, "mean_token_accuracy": 0.8772804439067841, "step": 3878 }, { "epoch": 2.640789921688798, "grad_norm": 1.762840986251831, "learning_rate": 4.942557829930277e-06, "loss": 0.4441, "mean_token_accuracy": 0.8448428511619568, "step": 3879 }, { "epoch": 2.6414708886619, "grad_norm": 1.6413846015930176, "learning_rate": 4.940305300825211e-06, "loss": 0.7471, "mean_token_accuracy": 0.7418224513530731, "step": 3880 }, { "epoch": 2.6421518556350017, "grad_norm": 1.7842758893966675, "learning_rate": 4.938052783837171e-06, "loss": 0.4996, "mean_token_accuracy": 0.8308405876159668, "step": 3881 }, { "epoch": 2.6428328226081037, "grad_norm": 1.865980863571167, "learning_rate": 4.935800279423386e-06, "loss": 0.5524, "mean_token_accuracy": 0.7946785092353821, "step": 3882 }, { "epoch": 2.6435137895812053, "grad_norm": 1.9523581266403198, "learning_rate": 4.933547788041076e-06, "loss": 0.428, "mean_token_accuracy": 0.8566509783267975, "step": 3883 }, { "epoch": 2.644194756554307, "grad_norm": 1.9252150058746338, "learning_rate": 4.931295310147457e-06, "loss": 0.4444, "mean_token_accuracy": 0.8573628067970276, "step": 3884 }, { "epoch": 2.644875723527409, "grad_norm": 1.9002150297164917, "learning_rate": 4.929042846199747e-06, "loss": 0.4312, "mean_token_accuracy": 0.857248067855835, "step": 3885 }, { "epoch": 2.6455566905005106, "grad_norm": 1.6019632816314697, "learning_rate": 4.926790396655156e-06, "loss": 0.517, "mean_token_accuracy": 0.8074398636817932, "step": 3886 }, { "epoch": 2.6462376574736126, "grad_norm": 1.6942660808563232, "learning_rate": 4.924537961970896e-06, "loss": 0.509, "mean_token_accuracy": 0.8015955984592438, "step": 3887 }, { "epoch": 2.646918624446714, "grad_norm": 1.7103142738342285, "learning_rate": 4.922285542604175e-06, "loss": 0.6881, "mean_token_accuracy": 0.7808598875999451, "step": 3888 }, { "epoch": 2.6475995914198163, "grad_norm": 1.8642045259475708, "learning_rate": 4.920033139012191e-06, "loss": 0.4267, "mean_token_accuracy": 0.861565113067627, "step": 3889 }, { "epoch": 2.648280558392918, "grad_norm": 1.6681026220321655, "learning_rate": 4.91778075165215e-06, "loss": 0.4114, "mean_token_accuracy": 0.8415514826774597, "step": 3890 }, { "epoch": 2.64896152536602, "grad_norm": 1.891650915145874, "learning_rate": 4.915528380981247e-06, "loss": 0.4304, "mean_token_accuracy": 0.8514105379581451, "step": 3891 }, { "epoch": 2.6496424923391215, "grad_norm": 1.8297301530838013, "learning_rate": 4.913276027456676e-06, "loss": 0.4886, "mean_token_accuracy": 0.8249491751194, "step": 3892 }, { "epoch": 2.650323459312223, "grad_norm": 1.7789618968963623, "learning_rate": 4.911023691535628e-06, "loss": 0.4126, "mean_token_accuracy": 0.8542689979076385, "step": 3893 }, { "epoch": 2.651004426285325, "grad_norm": 1.996366024017334, "learning_rate": 4.908771373675286e-06, "loss": 0.5203, "mean_token_accuracy": 0.8262518644332886, "step": 3894 }, { "epoch": 2.6516853932584272, "grad_norm": 1.6236470937728882, "learning_rate": 4.90651907433284e-06, "loss": 0.6431, "mean_token_accuracy": 0.7717425227165222, "step": 3895 }, { "epoch": 2.652366360231529, "grad_norm": 1.815350890159607, "learning_rate": 4.90426679396546e-06, "loss": 0.5371, "mean_token_accuracy": 0.8063397407531738, "step": 3896 }, { "epoch": 2.6530473272046304, "grad_norm": 1.8230129480361938, "learning_rate": 4.9020145330303305e-06, "loss": 0.403, "mean_token_accuracy": 0.8599833250045776, "step": 3897 }, { "epoch": 2.6537282941777325, "grad_norm": 1.7271928787231445, "learning_rate": 4.899762291984618e-06, "loss": 0.6501, "mean_token_accuracy": 0.7680227756500244, "step": 3898 }, { "epoch": 2.654409261150834, "grad_norm": 1.571820855140686, "learning_rate": 4.89751007128549e-06, "loss": 0.6988, "mean_token_accuracy": 0.7686778604984283, "step": 3899 }, { "epoch": 2.655090228123936, "grad_norm": 1.7785524129867554, "learning_rate": 4.895257871390112e-06, "loss": 0.5408, "mean_token_accuracy": 0.8190335631370544, "step": 3900 }, { "epoch": 2.6557711950970377, "grad_norm": 1.690747857093811, "learning_rate": 4.893005692755639e-06, "loss": 0.5519, "mean_token_accuracy": 0.7777336239814758, "step": 3901 }, { "epoch": 2.6564521620701393, "grad_norm": 1.817525863647461, "learning_rate": 4.890753535839231e-06, "loss": 0.4876, "mean_token_accuracy": 0.8405249118804932, "step": 3902 }, { "epoch": 2.6571331290432414, "grad_norm": 1.9709147214889526, "learning_rate": 4.8885014010980375e-06, "loss": 0.4184, "mean_token_accuracy": 0.8492234647274017, "step": 3903 }, { "epoch": 2.6578140960163434, "grad_norm": 1.706158995628357, "learning_rate": 4.886249288989199e-06, "loss": 0.6639, "mean_token_accuracy": 0.7679643034934998, "step": 3904 }, { "epoch": 2.658495062989445, "grad_norm": 1.8670127391815186, "learning_rate": 4.883997199969863e-06, "loss": 0.5307, "mean_token_accuracy": 0.8073523938655853, "step": 3905 }, { "epoch": 2.6591760299625467, "grad_norm": 1.6855868101119995, "learning_rate": 4.8817451344971616e-06, "loss": 0.5622, "mean_token_accuracy": 0.7872601747512817, "step": 3906 }, { "epoch": 2.6598569969356487, "grad_norm": 1.7051819562911987, "learning_rate": 4.879493093028231e-06, "loss": 0.5801, "mean_token_accuracy": 0.8049601018428802, "step": 3907 }, { "epoch": 2.6605379639087503, "grad_norm": 1.6054905652999878, "learning_rate": 4.877241076020194e-06, "loss": 0.6488, "mean_token_accuracy": 0.7596305906772614, "step": 3908 }, { "epoch": 2.6612189308818524, "grad_norm": 1.7604715824127197, "learning_rate": 4.874989083930172e-06, "loss": 0.5004, "mean_token_accuracy": 0.8187099397182465, "step": 3909 }, { "epoch": 2.661899897854954, "grad_norm": 1.711681604385376, "learning_rate": 4.8727371172152885e-06, "loss": 0.4612, "mean_token_accuracy": 0.8384099900722504, "step": 3910 }, { "epoch": 2.662580864828056, "grad_norm": 1.7051833868026733, "learning_rate": 4.870485176332647e-06, "loss": 0.5556, "mean_token_accuracy": 0.7832590639591217, "step": 3911 }, { "epoch": 2.6632618318011576, "grad_norm": 1.8571518659591675, "learning_rate": 4.8682332617393595e-06, "loss": 0.554, "mean_token_accuracy": 0.8118780255317688, "step": 3912 }, { "epoch": 2.6639427987742597, "grad_norm": 1.7580088376998901, "learning_rate": 4.865981373892526e-06, "loss": 0.5064, "mean_token_accuracy": 0.8235782384872437, "step": 3913 }, { "epoch": 2.6646237657473613, "grad_norm": 1.926615834236145, "learning_rate": 4.863729513249241e-06, "loss": 0.5023, "mean_token_accuracy": 0.8089651167392731, "step": 3914 }, { "epoch": 2.665304732720463, "grad_norm": 1.8925856351852417, "learning_rate": 4.8614776802665955e-06, "loss": 0.4092, "mean_token_accuracy": 0.8606904149055481, "step": 3915 }, { "epoch": 2.665985699693565, "grad_norm": 2.008470058441162, "learning_rate": 4.859225875401672e-06, "loss": 0.4043, "mean_token_accuracy": 0.864379346370697, "step": 3916 }, { "epoch": 2.6666666666666665, "grad_norm": 1.80912184715271, "learning_rate": 4.856974099111555e-06, "loss": 0.416, "mean_token_accuracy": 0.8499147891998291, "step": 3917 }, { "epoch": 2.6673476336397686, "grad_norm": 1.8473830223083496, "learning_rate": 4.8547223518533134e-06, "loss": 0.476, "mean_token_accuracy": 0.7987478077411652, "step": 3918 }, { "epoch": 2.66802860061287, "grad_norm": 1.8104387521743774, "learning_rate": 4.852470634084012e-06, "loss": 0.4722, "mean_token_accuracy": 0.832041472196579, "step": 3919 }, { "epoch": 2.668709567585972, "grad_norm": 1.7978397607803345, "learning_rate": 4.850218946260717e-06, "loss": 0.5595, "mean_token_accuracy": 0.7924724817276001, "step": 3920 }, { "epoch": 2.669390534559074, "grad_norm": 1.8018285036087036, "learning_rate": 4.847967288840481e-06, "loss": 0.5379, "mean_token_accuracy": 0.7934514880180359, "step": 3921 }, { "epoch": 2.670071501532176, "grad_norm": 1.7633957862854004, "learning_rate": 4.845715662280354e-06, "loss": 0.5002, "mean_token_accuracy": 0.8325177431106567, "step": 3922 }, { "epoch": 2.6707524685052775, "grad_norm": 1.96757972240448, "learning_rate": 4.843464067037378e-06, "loss": 0.3775, "mean_token_accuracy": 0.873424768447876, "step": 3923 }, { "epoch": 2.671433435478379, "grad_norm": 1.8505839109420776, "learning_rate": 4.841212503568588e-06, "loss": 0.5123, "mean_token_accuracy": 0.8109042942523956, "step": 3924 }, { "epoch": 2.672114402451481, "grad_norm": 1.6877586841583252, "learning_rate": 4.838960972331019e-06, "loss": 0.498, "mean_token_accuracy": 0.8331122100353241, "step": 3925 }, { "epoch": 2.672795369424583, "grad_norm": 1.8195507526397705, "learning_rate": 4.836709473781686e-06, "loss": 0.5444, "mean_token_accuracy": 0.8121912479400635, "step": 3926 }, { "epoch": 2.673476336397685, "grad_norm": 1.7931350469589233, "learning_rate": 4.834458008377613e-06, "loss": 0.4198, "mean_token_accuracy": 0.8529432117938995, "step": 3927 }, { "epoch": 2.6741573033707864, "grad_norm": 1.904209852218628, "learning_rate": 4.832206576575809e-06, "loss": 0.4855, "mean_token_accuracy": 0.8347691297531128, "step": 3928 }, { "epoch": 2.6748382703438884, "grad_norm": 1.5984604358673096, "learning_rate": 4.829955178833273e-06, "loss": 0.5348, "mean_token_accuracy": 0.7860933244228363, "step": 3929 }, { "epoch": 2.67551923731699, "grad_norm": 1.791142463684082, "learning_rate": 4.827703815607005e-06, "loss": 0.517, "mean_token_accuracy": 0.8121091723442078, "step": 3930 }, { "epoch": 2.676200204290092, "grad_norm": 1.8014389276504517, "learning_rate": 4.82545248735399e-06, "loss": 0.4916, "mean_token_accuracy": 0.8316119313240051, "step": 3931 }, { "epoch": 2.6768811712631937, "grad_norm": 1.9675073623657227, "learning_rate": 4.8232011945312164e-06, "loss": 0.5612, "mean_token_accuracy": 0.7890365719795227, "step": 3932 }, { "epoch": 2.6775621382362953, "grad_norm": 1.6799383163452148, "learning_rate": 4.820949937595657e-06, "loss": 0.5895, "mean_token_accuracy": 0.7655537724494934, "step": 3933 }, { "epoch": 2.6782431052093973, "grad_norm": 1.9108551740646362, "learning_rate": 4.8186987170042754e-06, "loss": 0.4417, "mean_token_accuracy": 0.8437982201576233, "step": 3934 }, { "epoch": 2.6789240721824994, "grad_norm": 1.879565954208374, "learning_rate": 4.816447533214037e-06, "loss": 0.5822, "mean_token_accuracy": 0.7843514084815979, "step": 3935 }, { "epoch": 2.679605039155601, "grad_norm": 1.7149592638015747, "learning_rate": 4.814196386681891e-06, "loss": 0.5825, "mean_token_accuracy": 0.7936089634895325, "step": 3936 }, { "epoch": 2.6802860061287026, "grad_norm": 1.7891401052474976, "learning_rate": 4.811945277864785e-06, "loss": 0.5132, "mean_token_accuracy": 0.8011417388916016, "step": 3937 }, { "epoch": 2.6809669731018047, "grad_norm": 1.836135983467102, "learning_rate": 4.8096942072196565e-06, "loss": 0.4418, "mean_token_accuracy": 0.8468415439128876, "step": 3938 }, { "epoch": 2.6816479400749063, "grad_norm": 1.6587809324264526, "learning_rate": 4.807443175203432e-06, "loss": 0.6134, "mean_token_accuracy": 0.7966215908527374, "step": 3939 }, { "epoch": 2.6823289070480083, "grad_norm": 1.6238572597503662, "learning_rate": 4.80519218227304e-06, "loss": 0.5675, "mean_token_accuracy": 0.7845207154750824, "step": 3940 }, { "epoch": 2.68300987402111, "grad_norm": 1.8955618143081665, "learning_rate": 4.802941228885387e-06, "loss": 0.4303, "mean_token_accuracy": 0.8410269021987915, "step": 3941 }, { "epoch": 2.6836908409942115, "grad_norm": 1.9510003328323364, "learning_rate": 4.800690315497385e-06, "loss": 0.3712, "mean_token_accuracy": 0.8625737726688385, "step": 3942 }, { "epoch": 2.6843718079673136, "grad_norm": 1.7233765125274658, "learning_rate": 4.79843944256593e-06, "loss": 0.4718, "mean_token_accuracy": 0.8373497128486633, "step": 3943 }, { "epoch": 2.6850527749404156, "grad_norm": 1.957672357559204, "learning_rate": 4.796188610547909e-06, "loss": 0.4699, "mean_token_accuracy": 0.8327323198318481, "step": 3944 }, { "epoch": 2.685733741913517, "grad_norm": 1.64125657081604, "learning_rate": 4.793937819900209e-06, "loss": 0.6288, "mean_token_accuracy": 0.7618223130702972, "step": 3945 }, { "epoch": 2.686414708886619, "grad_norm": 2.0120179653167725, "learning_rate": 4.791687071079696e-06, "loss": 0.5052, "mean_token_accuracy": 0.8251870274543762, "step": 3946 }, { "epoch": 2.687095675859721, "grad_norm": 1.7025142908096313, "learning_rate": 4.789436364543242e-06, "loss": 0.6101, "mean_token_accuracy": 0.7406959533691406, "step": 3947 }, { "epoch": 2.6877766428328225, "grad_norm": 1.889710545539856, "learning_rate": 4.787185700747699e-06, "loss": 0.4837, "mean_token_accuracy": 0.8338494598865509, "step": 3948 }, { "epoch": 2.6884576098059245, "grad_norm": 1.9580037593841553, "learning_rate": 4.784935080149912e-06, "loss": 0.4704, "mean_token_accuracy": 0.8261103630065918, "step": 3949 }, { "epoch": 2.689138576779026, "grad_norm": 1.7542685270309448, "learning_rate": 4.782684503206724e-06, "loss": 0.4869, "mean_token_accuracy": 0.8246197998523712, "step": 3950 }, { "epoch": 2.689819543752128, "grad_norm": 1.899202585220337, "learning_rate": 4.780433970374961e-06, "loss": 0.4627, "mean_token_accuracy": 0.8386542201042175, "step": 3951 }, { "epoch": 2.6905005107252298, "grad_norm": 1.803371548652649, "learning_rate": 4.778183482111448e-06, "loss": 0.4215, "mean_token_accuracy": 0.843789279460907, "step": 3952 }, { "epoch": 2.691181477698332, "grad_norm": 1.7135429382324219, "learning_rate": 4.775933038872993e-06, "loss": 0.5413, "mean_token_accuracy": 0.8048631250858307, "step": 3953 }, { "epoch": 2.6918624446714334, "grad_norm": 1.5839035511016846, "learning_rate": 4.773682641116397e-06, "loss": 0.6671, "mean_token_accuracy": 0.8062560856342316, "step": 3954 }, { "epoch": 2.692543411644535, "grad_norm": 1.7584987878799438, "learning_rate": 4.77143228929846e-06, "loss": 0.4966, "mean_token_accuracy": 0.8104604184627533, "step": 3955 }, { "epoch": 2.693224378617637, "grad_norm": 1.848128318786621, "learning_rate": 4.769181983875958e-06, "loss": 0.5375, "mean_token_accuracy": 0.7883957326412201, "step": 3956 }, { "epoch": 2.6939053455907387, "grad_norm": 1.913416862487793, "learning_rate": 4.766931725305672e-06, "loss": 0.4397, "mean_token_accuracy": 0.8490997850894928, "step": 3957 }, { "epoch": 2.6945863125638407, "grad_norm": 1.8298577070236206, "learning_rate": 4.7646815140443625e-06, "loss": 0.4827, "mean_token_accuracy": 0.8293282091617584, "step": 3958 }, { "epoch": 2.6952672795369423, "grad_norm": 1.9652197360992432, "learning_rate": 4.762431350548788e-06, "loss": 0.3976, "mean_token_accuracy": 0.8602902591228485, "step": 3959 }, { "epoch": 2.6959482465100444, "grad_norm": 1.7676998376846313, "learning_rate": 4.7601812352756935e-06, "loss": 0.5056, "mean_token_accuracy": 0.8220214247703552, "step": 3960 }, { "epoch": 2.696629213483146, "grad_norm": 1.66977858543396, "learning_rate": 4.757931168681812e-06, "loss": 0.4614, "mean_token_accuracy": 0.8280594646930695, "step": 3961 }, { "epoch": 2.697310180456248, "grad_norm": 1.8321630954742432, "learning_rate": 4.755681151223873e-06, "loss": 0.424, "mean_token_accuracy": 0.8619999885559082, "step": 3962 }, { "epoch": 2.6979911474293496, "grad_norm": 1.6794830560684204, "learning_rate": 4.753431183358592e-06, "loss": 0.6178, "mean_token_accuracy": 0.7880316078662872, "step": 3963 }, { "epoch": 2.6986721144024512, "grad_norm": 1.7702252864837646, "learning_rate": 4.751181265542674e-06, "loss": 0.6165, "mean_token_accuracy": 0.7672969698905945, "step": 3964 }, { "epoch": 2.6993530813755533, "grad_norm": 1.9330880641937256, "learning_rate": 4.7489313982328165e-06, "loss": 0.5605, "mean_token_accuracy": 0.8188602924346924, "step": 3965 }, { "epoch": 2.7000340483486553, "grad_norm": 1.7280243635177612, "learning_rate": 4.746681581885702e-06, "loss": 0.3887, "mean_token_accuracy": 0.8591579496860504, "step": 3966 }, { "epoch": 2.700715015321757, "grad_norm": 1.8408695459365845, "learning_rate": 4.7444318169580096e-06, "loss": 0.5105, "mean_token_accuracy": 0.8042375445365906, "step": 3967 }, { "epoch": 2.7013959822948586, "grad_norm": 1.7445772886276245, "learning_rate": 4.742182103906399e-06, "loss": 0.3993, "mean_token_accuracy": 0.8530567288398743, "step": 3968 }, { "epoch": 2.7020769492679606, "grad_norm": 1.9366904497146606, "learning_rate": 4.739932443187531e-06, "loss": 0.4122, "mean_token_accuracy": 0.8526709973812103, "step": 3969 }, { "epoch": 2.702757916241062, "grad_norm": 1.8917131423950195, "learning_rate": 4.737682835258046e-06, "loss": 0.3343, "mean_token_accuracy": 0.8838472366333008, "step": 3970 }, { "epoch": 2.7034388832141643, "grad_norm": 1.92247474193573, "learning_rate": 4.735433280574574e-06, "loss": 0.4243, "mean_token_accuracy": 0.8588231801986694, "step": 3971 }, { "epoch": 2.704119850187266, "grad_norm": 1.747076153755188, "learning_rate": 4.733183779593742e-06, "loss": 0.4707, "mean_token_accuracy": 0.8341951966285706, "step": 3972 }, { "epoch": 2.7048008171603675, "grad_norm": 1.812290906906128, "learning_rate": 4.730934332772158e-06, "loss": 0.6232, "mean_token_accuracy": 0.7695786356925964, "step": 3973 }, { "epoch": 2.7054817841334695, "grad_norm": 1.953823447227478, "learning_rate": 4.7286849405664255e-06, "loss": 0.4079, "mean_token_accuracy": 0.8603721261024475, "step": 3974 }, { "epoch": 2.7061627511065716, "grad_norm": 1.7743034362792969, "learning_rate": 4.72643560343313e-06, "loss": 0.6157, "mean_token_accuracy": 0.7944780588150024, "step": 3975 }, { "epoch": 2.706843718079673, "grad_norm": 1.8575267791748047, "learning_rate": 4.724186321828851e-06, "loss": 0.5562, "mean_token_accuracy": 0.7962695062160492, "step": 3976 }, { "epoch": 2.7075246850527748, "grad_norm": 1.674201250076294, "learning_rate": 4.721937096210156e-06, "loss": 0.7111, "mean_token_accuracy": 0.7703113257884979, "step": 3977 }, { "epoch": 2.708205652025877, "grad_norm": 1.9471904039382935, "learning_rate": 4.719687927033599e-06, "loss": 0.5077, "mean_token_accuracy": 0.8083834946155548, "step": 3978 }, { "epoch": 2.7088866189989784, "grad_norm": 1.7311339378356934, "learning_rate": 4.717438814755726e-06, "loss": 0.562, "mean_token_accuracy": 0.7930735647678375, "step": 3979 }, { "epoch": 2.7095675859720805, "grad_norm": 1.8351259231567383, "learning_rate": 4.715189759833068e-06, "loss": 0.4626, "mean_token_accuracy": 0.8200312554836273, "step": 3980 }, { "epoch": 2.710248552945182, "grad_norm": 1.8089195489883423, "learning_rate": 4.7129407627221435e-06, "loss": 0.5153, "mean_token_accuracy": 0.828660637140274, "step": 3981 }, { "epoch": 2.710929519918284, "grad_norm": 1.8039835691452026, "learning_rate": 4.710691823879465e-06, "loss": 0.5338, "mean_token_accuracy": 0.806421548128128, "step": 3982 }, { "epoch": 2.7116104868913857, "grad_norm": 1.8094762563705444, "learning_rate": 4.708442943761527e-06, "loss": 0.5188, "mean_token_accuracy": 0.8220811486244202, "step": 3983 }, { "epoch": 2.712291453864488, "grad_norm": 1.6591143608093262, "learning_rate": 4.7061941228248165e-06, "loss": 0.5843, "mean_token_accuracy": 0.8068007230758667, "step": 3984 }, { "epoch": 2.7129724208375894, "grad_norm": 1.767225742340088, "learning_rate": 4.703945361525808e-06, "loss": 0.4063, "mean_token_accuracy": 0.8602088987827301, "step": 3985 }, { "epoch": 2.713653387810691, "grad_norm": 1.7054654359817505, "learning_rate": 4.701696660320957e-06, "loss": 0.4881, "mean_token_accuracy": 0.83112832903862, "step": 3986 }, { "epoch": 2.714334354783793, "grad_norm": 1.8095558881759644, "learning_rate": 4.699448019666719e-06, "loss": 0.3127, "mean_token_accuracy": 0.8936652839183807, "step": 3987 }, { "epoch": 2.7150153217568946, "grad_norm": 1.7023005485534668, "learning_rate": 4.697199440019526e-06, "loss": 0.7435, "mean_token_accuracy": 0.7619481682777405, "step": 3988 }, { "epoch": 2.7156962887299967, "grad_norm": 1.8889070749282837, "learning_rate": 4.694950921835806e-06, "loss": 0.4725, "mean_token_accuracy": 0.8143036365509033, "step": 3989 }, { "epoch": 2.7163772557030983, "grad_norm": 1.8719936609268188, "learning_rate": 4.692702465571967e-06, "loss": 0.479, "mean_token_accuracy": 0.8277013003826141, "step": 3990 }, { "epoch": 2.7170582226762003, "grad_norm": 1.8536829948425293, "learning_rate": 4.690454071684408e-06, "loss": 0.4976, "mean_token_accuracy": 0.8053595125675201, "step": 3991 }, { "epoch": 2.717739189649302, "grad_norm": 1.7299089431762695, "learning_rate": 4.68820574062952e-06, "loss": 0.5551, "mean_token_accuracy": 0.7837486565113068, "step": 3992 }, { "epoch": 2.718420156622404, "grad_norm": 1.8198221921920776, "learning_rate": 4.685957472863673e-06, "loss": 0.6615, "mean_token_accuracy": 0.7894697487354279, "step": 3993 }, { "epoch": 2.7191011235955056, "grad_norm": 1.8313740491867065, "learning_rate": 4.6837092688432305e-06, "loss": 0.4648, "mean_token_accuracy": 0.8382850289344788, "step": 3994 }, { "epoch": 2.719782090568607, "grad_norm": 1.6394917964935303, "learning_rate": 4.6814611290245385e-06, "loss": 0.5811, "mean_token_accuracy": 0.7810448706150055, "step": 3995 }, { "epoch": 2.7204630575417093, "grad_norm": 1.8033404350280762, "learning_rate": 4.679213053863931e-06, "loss": 0.4601, "mean_token_accuracy": 0.8306770026683807, "step": 3996 }, { "epoch": 2.7211440245148113, "grad_norm": 1.8551971912384033, "learning_rate": 4.676965043817733e-06, "loss": 0.4689, "mean_token_accuracy": 0.8284906446933746, "step": 3997 }, { "epoch": 2.721824991487913, "grad_norm": 1.8870601654052734, "learning_rate": 4.674717099342249e-06, "loss": 0.4136, "mean_token_accuracy": 0.8535536527633667, "step": 3998 }, { "epoch": 2.7225059584610145, "grad_norm": 1.6952531337738037, "learning_rate": 4.6724692208937785e-06, "loss": 0.5318, "mean_token_accuracy": 0.8204637169837952, "step": 3999 }, { "epoch": 2.7231869254341166, "grad_norm": 1.7061105966567993, "learning_rate": 4.6702214089286026e-06, "loss": 0.6253, "mean_token_accuracy": 0.7853263914585114, "step": 4000 }, { "epoch": 2.723867892407218, "grad_norm": 1.8291926383972168, "learning_rate": 4.667973663902986e-06, "loss": 0.4472, "mean_token_accuracy": 0.8336737155914307, "step": 4001 }, { "epoch": 2.72454885938032, "grad_norm": 1.7231117486953735, "learning_rate": 4.665725986273188e-06, "loss": 0.4844, "mean_token_accuracy": 0.8209269940853119, "step": 4002 }, { "epoch": 2.725229826353422, "grad_norm": 1.8808369636535645, "learning_rate": 4.663478376495446e-06, "loss": 0.5518, "mean_token_accuracy": 0.7911727130413055, "step": 4003 }, { "epoch": 2.7259107933265234, "grad_norm": 1.8144687414169312, "learning_rate": 4.661230835025992e-06, "loss": 0.3827, "mean_token_accuracy": 0.8735456168651581, "step": 4004 }, { "epoch": 2.7265917602996255, "grad_norm": 2.101097822189331, "learning_rate": 4.658983362321035e-06, "loss": 0.5214, "mean_token_accuracy": 0.8199900388717651, "step": 4005 }, { "epoch": 2.7272727272727275, "grad_norm": 1.7977267503738403, "learning_rate": 4.656735958836775e-06, "loss": 0.4824, "mean_token_accuracy": 0.8269371092319489, "step": 4006 }, { "epoch": 2.727953694245829, "grad_norm": 1.9387791156768799, "learning_rate": 4.654488625029401e-06, "loss": 0.396, "mean_token_accuracy": 0.8636562526226044, "step": 4007 }, { "epoch": 2.7286346612189307, "grad_norm": 1.6679004430770874, "learning_rate": 4.65224136135508e-06, "loss": 0.5918, "mean_token_accuracy": 0.7876146733760834, "step": 4008 }, { "epoch": 2.7293156281920328, "grad_norm": 1.8411452770233154, "learning_rate": 4.649994168269973e-06, "loss": 0.4717, "mean_token_accuracy": 0.831364095211029, "step": 4009 }, { "epoch": 2.7299965951651344, "grad_norm": 1.7692724466323853, "learning_rate": 4.64774704623022e-06, "loss": 0.5908, "mean_token_accuracy": 0.7804015278816223, "step": 4010 }, { "epoch": 2.7306775621382364, "grad_norm": 1.6596429347991943, "learning_rate": 4.6454999956919504e-06, "loss": 0.6093, "mean_token_accuracy": 0.7879738509654999, "step": 4011 }, { "epoch": 2.731358529111338, "grad_norm": 1.8646396398544312, "learning_rate": 4.643253017111279e-06, "loss": 0.5113, "mean_token_accuracy": 0.8184686601161957, "step": 4012 }, { "epoch": 2.7320394960844396, "grad_norm": 1.8395389318466187, "learning_rate": 4.641006110944301e-06, "loss": 0.4408, "mean_token_accuracy": 0.8486378490924835, "step": 4013 }, { "epoch": 2.7327204630575417, "grad_norm": 1.8138446807861328, "learning_rate": 4.638759277647106e-06, "loss": 0.4268, "mean_token_accuracy": 0.8563978374004364, "step": 4014 }, { "epoch": 2.7334014300306437, "grad_norm": 1.9518390893936157, "learning_rate": 4.636512517675763e-06, "loss": 0.3347, "mean_token_accuracy": 0.8814257979393005, "step": 4015 }, { "epoch": 2.7340823970037453, "grad_norm": 1.752939224243164, "learning_rate": 4.634265831486322e-06, "loss": 0.5724, "mean_token_accuracy": 0.8123253583908081, "step": 4016 }, { "epoch": 2.734763363976847, "grad_norm": 1.7276015281677246, "learning_rate": 4.632019219534829e-06, "loss": 0.5513, "mean_token_accuracy": 0.7781851291656494, "step": 4017 }, { "epoch": 2.735444330949949, "grad_norm": 1.6383370161056519, "learning_rate": 4.629772682277305e-06, "loss": 0.4551, "mean_token_accuracy": 0.8286090195178986, "step": 4018 }, { "epoch": 2.7361252979230506, "grad_norm": 1.7169932126998901, "learning_rate": 4.6275262201697615e-06, "loss": 0.5957, "mean_token_accuracy": 0.777912050485611, "step": 4019 }, { "epoch": 2.7368062648961526, "grad_norm": 1.7633445262908936, "learning_rate": 4.6252798336681925e-06, "loss": 0.4835, "mean_token_accuracy": 0.8262046277523041, "step": 4020 }, { "epoch": 2.7374872318692542, "grad_norm": 1.807492733001709, "learning_rate": 4.623033523228574e-06, "loss": 0.5155, "mean_token_accuracy": 0.8184539079666138, "step": 4021 }, { "epoch": 2.7381681988423563, "grad_norm": 1.8326109647750854, "learning_rate": 4.620787289306875e-06, "loss": 0.4953, "mean_token_accuracy": 0.8323909938335419, "step": 4022 }, { "epoch": 2.738849165815458, "grad_norm": 1.7274067401885986, "learning_rate": 4.61854113235904e-06, "loss": 0.68, "mean_token_accuracy": 0.770552009344101, "step": 4023 }, { "epoch": 2.73953013278856, "grad_norm": 1.7827035188674927, "learning_rate": 4.616295052841003e-06, "loss": 0.4664, "mean_token_accuracy": 0.8314473628997803, "step": 4024 }, { "epoch": 2.7402110997616616, "grad_norm": 1.7297861576080322, "learning_rate": 4.614049051208681e-06, "loss": 0.5031, "mean_token_accuracy": 0.8090635240077972, "step": 4025 }, { "epoch": 2.740892066734763, "grad_norm": 1.76603364944458, "learning_rate": 4.6118031279179724e-06, "loss": 0.5438, "mean_token_accuracy": 0.7992757856845856, "step": 4026 }, { "epoch": 2.741573033707865, "grad_norm": 1.7658898830413818, "learning_rate": 4.609557283424765e-06, "loss": 0.4981, "mean_token_accuracy": 0.8356324434280396, "step": 4027 }, { "epoch": 2.742254000680967, "grad_norm": 1.7123719453811646, "learning_rate": 4.607311518184925e-06, "loss": 0.5297, "mean_token_accuracy": 0.8003860116004944, "step": 4028 }, { "epoch": 2.742934967654069, "grad_norm": 1.8730525970458984, "learning_rate": 4.605065832654308e-06, "loss": 0.436, "mean_token_accuracy": 0.8403580784797668, "step": 4029 }, { "epoch": 2.7436159346271705, "grad_norm": 1.879523754119873, "learning_rate": 4.602820227288752e-06, "loss": 0.4104, "mean_token_accuracy": 0.8510375916957855, "step": 4030 }, { "epoch": 2.7442969016002725, "grad_norm": 1.7487196922302246, "learning_rate": 4.600574702544072e-06, "loss": 0.5127, "mean_token_accuracy": 0.8147161900997162, "step": 4031 }, { "epoch": 2.744977868573374, "grad_norm": 2.152038812637329, "learning_rate": 4.5983292588760766e-06, "loss": 0.3582, "mean_token_accuracy": 0.8803846538066864, "step": 4032 }, { "epoch": 2.745658835546476, "grad_norm": 1.7714073657989502, "learning_rate": 4.5960838967405515e-06, "loss": 0.4659, "mean_token_accuracy": 0.8439778089523315, "step": 4033 }, { "epoch": 2.7463398025195778, "grad_norm": 1.7930183410644531, "learning_rate": 4.593838616593271e-06, "loss": 0.5416, "mean_token_accuracy": 0.8105778694152832, "step": 4034 }, { "epoch": 2.7470207694926794, "grad_norm": 1.7075939178466797, "learning_rate": 4.591593418889987e-06, "loss": 0.604, "mean_token_accuracy": 0.7858490943908691, "step": 4035 }, { "epoch": 2.7477017364657814, "grad_norm": 1.8087172508239746, "learning_rate": 4.589348304086434e-06, "loss": 0.4928, "mean_token_accuracy": 0.8176507353782654, "step": 4036 }, { "epoch": 2.7483827034388835, "grad_norm": 1.6564656496047974, "learning_rate": 4.587103272638339e-06, "loss": 0.5902, "mean_token_accuracy": 0.778593122959137, "step": 4037 }, { "epoch": 2.749063670411985, "grad_norm": 1.5536346435546875, "learning_rate": 4.584858325001402e-06, "loss": 0.6726, "mean_token_accuracy": 0.760700136423111, "step": 4038 }, { "epoch": 2.7497446373850867, "grad_norm": 1.7467725276947021, "learning_rate": 4.5826134616313135e-06, "loss": 0.5179, "mean_token_accuracy": 0.8158760070800781, "step": 4039 }, { "epoch": 2.7504256043581887, "grad_norm": 1.783423900604248, "learning_rate": 4.58036868298374e-06, "loss": 0.4091, "mean_token_accuracy": 0.8609507381916046, "step": 4040 }, { "epoch": 2.7511065713312903, "grad_norm": 1.7419980764389038, "learning_rate": 4.578123989514335e-06, "loss": 0.6191, "mean_token_accuracy": 0.8015610575675964, "step": 4041 }, { "epoch": 2.7517875383043924, "grad_norm": 1.890120029449463, "learning_rate": 4.575879381678735e-06, "loss": 0.4803, "mean_token_accuracy": 0.8105383515357971, "step": 4042 }, { "epoch": 2.752468505277494, "grad_norm": 1.8393332958221436, "learning_rate": 4.573634859932556e-06, "loss": 0.5271, "mean_token_accuracy": 0.8031743466854095, "step": 4043 }, { "epoch": 2.7531494722505956, "grad_norm": 1.820481300354004, "learning_rate": 4.571390424731401e-06, "loss": 0.4558, "mean_token_accuracy": 0.8361891210079193, "step": 4044 }, { "epoch": 2.7538304392236976, "grad_norm": 1.6500802040100098, "learning_rate": 4.569146076530854e-06, "loss": 0.6033, "mean_token_accuracy": 0.7622035145759583, "step": 4045 }, { "epoch": 2.7545114061967997, "grad_norm": 1.7439976930618286, "learning_rate": 4.566901815786474e-06, "loss": 0.5979, "mean_token_accuracy": 0.7601568102836609, "step": 4046 }, { "epoch": 2.7551923731699013, "grad_norm": 1.8134303092956543, "learning_rate": 4.564657642953815e-06, "loss": 0.4811, "mean_token_accuracy": 0.8385555446147919, "step": 4047 }, { "epoch": 2.755873340143003, "grad_norm": 1.8066333532333374, "learning_rate": 4.562413558488403e-06, "loss": 0.4584, "mean_token_accuracy": 0.8314237892627716, "step": 4048 }, { "epoch": 2.756554307116105, "grad_norm": 1.713265299797058, "learning_rate": 4.5601695628457525e-06, "loss": 0.627, "mean_token_accuracy": 0.7656295299530029, "step": 4049 }, { "epoch": 2.7572352740892065, "grad_norm": 1.7719080448150635, "learning_rate": 4.557925656481356e-06, "loss": 0.5229, "mean_token_accuracy": 0.8240451216697693, "step": 4050 }, { "epoch": 2.7579162410623086, "grad_norm": 1.8945462703704834, "learning_rate": 4.555681839850687e-06, "loss": 0.4168, "mean_token_accuracy": 0.841518223285675, "step": 4051 }, { "epoch": 2.75859720803541, "grad_norm": 1.559177041053772, "learning_rate": 4.553438113409207e-06, "loss": 0.6498, "mean_token_accuracy": 0.7651937007904053, "step": 4052 }, { "epoch": 2.7592781750085122, "grad_norm": 1.9048378467559814, "learning_rate": 4.551194477612351e-06, "loss": 0.3847, "mean_token_accuracy": 0.8628165125846863, "step": 4053 }, { "epoch": 2.759959141981614, "grad_norm": 1.8253228664398193, "learning_rate": 4.548950932915545e-06, "loss": 0.5056, "mean_token_accuracy": 0.8251682221889496, "step": 4054 }, { "epoch": 2.760640108954716, "grad_norm": 1.8157124519348145, "learning_rate": 4.546707479774186e-06, "loss": 0.5097, "mean_token_accuracy": 0.8143572807312012, "step": 4055 }, { "epoch": 2.7613210759278175, "grad_norm": 1.770563006401062, "learning_rate": 4.5444641186436586e-06, "loss": 0.4956, "mean_token_accuracy": 0.8184590339660645, "step": 4056 }, { "epoch": 2.762002042900919, "grad_norm": 1.8566818237304688, "learning_rate": 4.542220849979331e-06, "loss": 0.3571, "mean_token_accuracy": 0.8777660131454468, "step": 4057 }, { "epoch": 2.762683009874021, "grad_norm": 1.8636220693588257, "learning_rate": 4.539977674236544e-06, "loss": 0.4413, "mean_token_accuracy": 0.8522592782974243, "step": 4058 }, { "epoch": 2.7633639768471228, "grad_norm": 1.6879010200500488, "learning_rate": 4.537734591870631e-06, "loss": 0.6086, "mean_token_accuracy": 0.7903042137622833, "step": 4059 }, { "epoch": 2.764044943820225, "grad_norm": 1.7672158479690552, "learning_rate": 4.535491603336895e-06, "loss": 0.4843, "mean_token_accuracy": 0.8431772589683533, "step": 4060 }, { "epoch": 2.7647259107933264, "grad_norm": 1.8393018245697021, "learning_rate": 4.53324870909063e-06, "loss": 0.3992, "mean_token_accuracy": 0.8600220084190369, "step": 4061 }, { "epoch": 2.7654068777664285, "grad_norm": 1.6508268117904663, "learning_rate": 4.531005909587103e-06, "loss": 0.5441, "mean_token_accuracy": 0.8095800578594208, "step": 4062 }, { "epoch": 2.76608784473953, "grad_norm": 1.8697447776794434, "learning_rate": 4.528763205281565e-06, "loss": 0.432, "mean_token_accuracy": 0.8498134016990662, "step": 4063 }, { "epoch": 2.766768811712632, "grad_norm": 1.8344095945358276, "learning_rate": 4.526520596629249e-06, "loss": 0.6222, "mean_token_accuracy": 0.7696931958198547, "step": 4064 }, { "epoch": 2.7674497786857337, "grad_norm": 1.8956172466278076, "learning_rate": 4.524278084085365e-06, "loss": 0.453, "mean_token_accuracy": 0.8485289812088013, "step": 4065 }, { "epoch": 2.7681307456588353, "grad_norm": 1.7656749486923218, "learning_rate": 4.52203566810511e-06, "loss": 0.4462, "mean_token_accuracy": 0.8523737788200378, "step": 4066 }, { "epoch": 2.7688117126319374, "grad_norm": 1.836476445198059, "learning_rate": 4.519793349143654e-06, "loss": 0.4995, "mean_token_accuracy": 0.8365556299686432, "step": 4067 }, { "epoch": 2.7694926796050394, "grad_norm": 1.7243705987930298, "learning_rate": 4.51755112765615e-06, "loss": 0.4526, "mean_token_accuracy": 0.8416636288166046, "step": 4068 }, { "epoch": 2.770173646578141, "grad_norm": 1.9125250577926636, "learning_rate": 4.515309004097733e-06, "loss": 0.4773, "mean_token_accuracy": 0.8170781433582306, "step": 4069 }, { "epoch": 2.7708546135512426, "grad_norm": 1.90720534324646, "learning_rate": 4.513066978923516e-06, "loss": 0.439, "mean_token_accuracy": 0.8538961112499237, "step": 4070 }, { "epoch": 2.7715355805243447, "grad_norm": 1.7914063930511475, "learning_rate": 4.510825052588594e-06, "loss": 0.6926, "mean_token_accuracy": 0.779641717672348, "step": 4071 }, { "epoch": 2.7722165474974463, "grad_norm": 1.8190771341323853, "learning_rate": 4.508583225548039e-06, "loss": 0.5209, "mean_token_accuracy": 0.8073728680610657, "step": 4072 }, { "epoch": 2.7728975144705483, "grad_norm": 1.9126510620117188, "learning_rate": 4.506341498256903e-06, "loss": 0.3396, "mean_token_accuracy": 0.8874619007110596, "step": 4073 }, { "epoch": 2.77357848144365, "grad_norm": 1.9387547969818115, "learning_rate": 4.504099871170225e-06, "loss": 0.4855, "mean_token_accuracy": 0.8079585134983063, "step": 4074 }, { "epoch": 2.7742594484167515, "grad_norm": 1.7980915307998657, "learning_rate": 4.501858344743012e-06, "loss": 0.436, "mean_token_accuracy": 0.8261218070983887, "step": 4075 }, { "epoch": 2.7749404153898536, "grad_norm": 1.7315934896469116, "learning_rate": 4.499616919430261e-06, "loss": 0.5459, "mean_token_accuracy": 0.7968368530273438, "step": 4076 }, { "epoch": 2.7756213823629556, "grad_norm": 2.0118491649627686, "learning_rate": 4.497375595686942e-06, "loss": 0.6282, "mean_token_accuracy": 0.7863713204860687, "step": 4077 }, { "epoch": 2.7763023493360572, "grad_norm": 1.8346518278121948, "learning_rate": 4.495134373968005e-06, "loss": 0.4177, "mean_token_accuracy": 0.8270801305770874, "step": 4078 }, { "epoch": 2.776983316309159, "grad_norm": 1.8147757053375244, "learning_rate": 4.492893254728383e-06, "loss": 0.483, "mean_token_accuracy": 0.8384182155132294, "step": 4079 }, { "epoch": 2.777664283282261, "grad_norm": 1.8728629350662231, "learning_rate": 4.490652238422984e-06, "loss": 0.4863, "mean_token_accuracy": 0.8230553269386292, "step": 4080 }, { "epoch": 2.7783452502553625, "grad_norm": 1.8988354206085205, "learning_rate": 4.488411325506699e-06, "loss": 0.4021, "mean_token_accuracy": 0.8503860831260681, "step": 4081 }, { "epoch": 2.7790262172284645, "grad_norm": 1.6952149868011475, "learning_rate": 4.486170516434396e-06, "loss": 0.502, "mean_token_accuracy": 0.8101128339767456, "step": 4082 }, { "epoch": 2.779707184201566, "grad_norm": 1.9685863256454468, "learning_rate": 4.483929811660919e-06, "loss": 0.42, "mean_token_accuracy": 0.8590014576911926, "step": 4083 }, { "epoch": 2.7803881511746678, "grad_norm": 1.6537184715270996, "learning_rate": 4.481689211641098e-06, "loss": 0.455, "mean_token_accuracy": 0.8445829153060913, "step": 4084 }, { "epoch": 2.78106911814777, "grad_norm": 1.9282530546188354, "learning_rate": 4.479448716829733e-06, "loss": 0.406, "mean_token_accuracy": 0.8508086800575256, "step": 4085 }, { "epoch": 2.781750085120872, "grad_norm": 1.6141513586044312, "learning_rate": 4.477208327681611e-06, "loss": 0.6658, "mean_token_accuracy": 0.7727101147174835, "step": 4086 }, { "epoch": 2.7824310520939735, "grad_norm": 1.4784250259399414, "learning_rate": 4.474968044651491e-06, "loss": 0.7245, "mean_token_accuracy": 0.7462041079998016, "step": 4087 }, { "epoch": 2.783112019067075, "grad_norm": 1.764312505722046, "learning_rate": 4.472727868194113e-06, "loss": 0.5927, "mean_token_accuracy": 0.7862542569637299, "step": 4088 }, { "epoch": 2.783792986040177, "grad_norm": 1.8596752882003784, "learning_rate": 4.470487798764199e-06, "loss": 0.5134, "mean_token_accuracy": 0.8196371793746948, "step": 4089 }, { "epoch": 2.7844739530132787, "grad_norm": 1.7250088453292847, "learning_rate": 4.468247836816442e-06, "loss": 0.5653, "mean_token_accuracy": 0.8113293051719666, "step": 4090 }, { "epoch": 2.7851549199863808, "grad_norm": 1.6664680242538452, "learning_rate": 4.4660079828055195e-06, "loss": 0.5452, "mean_token_accuracy": 0.8101063072681427, "step": 4091 }, { "epoch": 2.7858358869594824, "grad_norm": 1.8175610303878784, "learning_rate": 4.463768237186083e-06, "loss": 0.4283, "mean_token_accuracy": 0.8477451801300049, "step": 4092 }, { "epoch": 2.7865168539325844, "grad_norm": 1.8652026653289795, "learning_rate": 4.461528600412764e-06, "loss": 0.4628, "mean_token_accuracy": 0.8374438583850861, "step": 4093 }, { "epoch": 2.787197820905686, "grad_norm": 1.8206607103347778, "learning_rate": 4.459289072940172e-06, "loss": 0.5404, "mean_token_accuracy": 0.8239739239215851, "step": 4094 }, { "epoch": 2.787878787878788, "grad_norm": 1.6353833675384521, "learning_rate": 4.457049655222892e-06, "loss": 0.6674, "mean_token_accuracy": 0.7768183052539825, "step": 4095 }, { "epoch": 2.7885597548518897, "grad_norm": 1.7315068244934082, "learning_rate": 4.45481034771549e-06, "loss": 0.6542, "mean_token_accuracy": 0.7673913538455963, "step": 4096 }, { "epoch": 2.7892407218249913, "grad_norm": 1.638028621673584, "learning_rate": 4.452571150872509e-06, "loss": 0.6056, "mean_token_accuracy": 0.7781320810317993, "step": 4097 }, { "epoch": 2.7899216887980933, "grad_norm": 1.9342471361160278, "learning_rate": 4.450332065148467e-06, "loss": 0.4698, "mean_token_accuracy": 0.8516576290130615, "step": 4098 }, { "epoch": 2.7906026557711954, "grad_norm": 1.9050692319869995, "learning_rate": 4.448093090997862e-06, "loss": 0.418, "mean_token_accuracy": 0.8473788499832153, "step": 4099 }, { "epoch": 2.791283622744297, "grad_norm": 1.8329311609268188, "learning_rate": 4.4458542288751685e-06, "loss": 0.4459, "mean_token_accuracy": 0.834801197052002, "step": 4100 }, { "epoch": 2.7919645897173986, "grad_norm": 1.782348394393921, "learning_rate": 4.443615479234838e-06, "loss": 0.4536, "mean_token_accuracy": 0.8396697640419006, "step": 4101 }, { "epoch": 2.7926455566905006, "grad_norm": 1.890098214149475, "learning_rate": 4.441376842531299e-06, "loss": 0.4485, "mean_token_accuracy": 0.8312306702136993, "step": 4102 }, { "epoch": 2.7933265236636022, "grad_norm": 1.9979770183563232, "learning_rate": 4.4391383192189565e-06, "loss": 0.4935, "mean_token_accuracy": 0.8357146382331848, "step": 4103 }, { "epoch": 2.7940074906367043, "grad_norm": 1.6333616971969604, "learning_rate": 4.436899909752196e-06, "loss": 0.5552, "mean_token_accuracy": 0.8079958558082581, "step": 4104 }, { "epoch": 2.794688457609806, "grad_norm": 1.8932276964187622, "learning_rate": 4.434661614585375e-06, "loss": 0.4506, "mean_token_accuracy": 0.8528128862380981, "step": 4105 }, { "epoch": 2.7953694245829075, "grad_norm": 1.7361540794372559, "learning_rate": 4.432423434172833e-06, "loss": 0.6106, "mean_token_accuracy": 0.8088373839855194, "step": 4106 }, { "epoch": 2.7960503915560095, "grad_norm": 1.6718330383300781, "learning_rate": 4.430185368968881e-06, "loss": 0.4555, "mean_token_accuracy": 0.8209189474582672, "step": 4107 }, { "epoch": 2.7967313585291116, "grad_norm": 1.7155650854110718, "learning_rate": 4.427947419427809e-06, "loss": 0.4459, "mean_token_accuracy": 0.8553904592990875, "step": 4108 }, { "epoch": 2.797412325502213, "grad_norm": 1.7872196435928345, "learning_rate": 4.425709586003884e-06, "loss": 0.4283, "mean_token_accuracy": 0.8476147651672363, "step": 4109 }, { "epoch": 2.798093292475315, "grad_norm": 1.8055775165557861, "learning_rate": 4.423471869151348e-06, "loss": 0.517, "mean_token_accuracy": 0.8060789108276367, "step": 4110 }, { "epoch": 2.798774259448417, "grad_norm": 1.8905699253082275, "learning_rate": 4.421234269324423e-06, "loss": 0.5191, "mean_token_accuracy": 0.8071549832820892, "step": 4111 }, { "epoch": 2.7994552264215185, "grad_norm": 1.631226658821106, "learning_rate": 4.418996786977303e-06, "loss": 0.6673, "mean_token_accuracy": 0.771437793970108, "step": 4112 }, { "epoch": 2.8001361933946205, "grad_norm": 1.9170641899108887, "learning_rate": 4.416759422564157e-06, "loss": 0.4274, "mean_token_accuracy": 0.8385221362113953, "step": 4113 }, { "epoch": 2.800817160367722, "grad_norm": 1.7301523685455322, "learning_rate": 4.414522176539137e-06, "loss": 0.5279, "mean_token_accuracy": 0.7907504439353943, "step": 4114 }, { "epoch": 2.8014981273408237, "grad_norm": 1.744117259979248, "learning_rate": 4.412285049356364e-06, "loss": 0.5212, "mean_token_accuracy": 0.7993615865707397, "step": 4115 }, { "epoch": 2.8021790943139258, "grad_norm": 1.8195645809173584, "learning_rate": 4.410048041469938e-06, "loss": 0.5052, "mean_token_accuracy": 0.8180617094039917, "step": 4116 }, { "epoch": 2.802860061287028, "grad_norm": 1.8961495161056519, "learning_rate": 4.4078111533339365e-06, "loss": 0.5433, "mean_token_accuracy": 0.8053431808948517, "step": 4117 }, { "epoch": 2.8035410282601294, "grad_norm": 1.6586028337478638, "learning_rate": 4.405574385402405e-06, "loss": 0.6178, "mean_token_accuracy": 0.7721599340438843, "step": 4118 }, { "epoch": 2.804221995233231, "grad_norm": 1.7474985122680664, "learning_rate": 4.403337738129378e-06, "loss": 0.7222, "mean_token_accuracy": 0.7622912228107452, "step": 4119 }, { "epoch": 2.804902962206333, "grad_norm": 1.9574124813079834, "learning_rate": 4.40110121196885e-06, "loss": 0.456, "mean_token_accuracy": 0.8526291251182556, "step": 4120 }, { "epoch": 2.8055839291794347, "grad_norm": 1.7607680559158325, "learning_rate": 4.398864807374804e-06, "loss": 0.4987, "mean_token_accuracy": 0.8193449378013611, "step": 4121 }, { "epoch": 2.8062648961525367, "grad_norm": 1.696608543395996, "learning_rate": 4.396628524801192e-06, "loss": 0.5876, "mean_token_accuracy": 0.8037387132644653, "step": 4122 }, { "epoch": 2.8069458631256383, "grad_norm": 1.871781587600708, "learning_rate": 4.394392364701938e-06, "loss": 0.4247, "mean_token_accuracy": 0.8502293527126312, "step": 4123 }, { "epoch": 2.8076268300987404, "grad_norm": 1.7638705968856812, "learning_rate": 4.392156327530951e-06, "loss": 0.5653, "mean_token_accuracy": 0.7883833944797516, "step": 4124 }, { "epoch": 2.808307797071842, "grad_norm": 1.545868992805481, "learning_rate": 4.389920413742104e-06, "loss": 0.6069, "mean_token_accuracy": 0.7796165347099304, "step": 4125 }, { "epoch": 2.808988764044944, "grad_norm": 1.7669727802276611, "learning_rate": 4.3876846237892545e-06, "loss": 0.5306, "mean_token_accuracy": 0.8094373643398285, "step": 4126 }, { "epoch": 2.8096697310180456, "grad_norm": 1.7494205236434937, "learning_rate": 4.385448958126229e-06, "loss": 0.5676, "mean_token_accuracy": 0.7994155287742615, "step": 4127 }, { "epoch": 2.8103506979911472, "grad_norm": 1.7908951044082642, "learning_rate": 4.38321341720683e-06, "loss": 0.5207, "mean_token_accuracy": 0.8194534182548523, "step": 4128 }, { "epoch": 2.8110316649642493, "grad_norm": 1.6970624923706055, "learning_rate": 4.380978001484836e-06, "loss": 0.4592, "mean_token_accuracy": 0.8521476686000824, "step": 4129 }, { "epoch": 2.811712631937351, "grad_norm": 1.7028311491012573, "learning_rate": 4.378742711413998e-06, "loss": 0.5732, "mean_token_accuracy": 0.7971564531326294, "step": 4130 }, { "epoch": 2.812393598910453, "grad_norm": 1.9252166748046875, "learning_rate": 4.376507547448044e-06, "loss": 0.3047, "mean_token_accuracy": 0.8941620886325836, "step": 4131 }, { "epoch": 2.8130745658835545, "grad_norm": 1.924245834350586, "learning_rate": 4.374272510040675e-06, "loss": 0.5013, "mean_token_accuracy": 0.8055794537067413, "step": 4132 }, { "epoch": 2.8137555328566566, "grad_norm": 2.0187599658966064, "learning_rate": 4.372037599645562e-06, "loss": 0.4348, "mean_token_accuracy": 0.8476530015468597, "step": 4133 }, { "epoch": 2.814436499829758, "grad_norm": 2.0351438522338867, "learning_rate": 4.369802816716362e-06, "loss": 0.4992, "mean_token_accuracy": 0.8345335423946381, "step": 4134 }, { "epoch": 2.8151174668028602, "grad_norm": 1.9578043222427368, "learning_rate": 4.3675681617066935e-06, "loss": 0.4178, "mean_token_accuracy": 0.8419565260410309, "step": 4135 }, { "epoch": 2.815798433775962, "grad_norm": 1.8181085586547852, "learning_rate": 4.365333635070157e-06, "loss": 0.5235, "mean_token_accuracy": 0.8138409554958344, "step": 4136 }, { "epoch": 2.8164794007490634, "grad_norm": 1.6348092555999756, "learning_rate": 4.363099237260322e-06, "loss": 0.6052, "mean_token_accuracy": 0.791106790304184, "step": 4137 }, { "epoch": 2.8171603677221655, "grad_norm": 1.7952957153320312, "learning_rate": 4.360864968730735e-06, "loss": 0.5328, "mean_token_accuracy": 0.8058796525001526, "step": 4138 }, { "epoch": 2.8178413346952675, "grad_norm": 1.8314354419708252, "learning_rate": 4.358630829934917e-06, "loss": 0.5048, "mean_token_accuracy": 0.8070311546325684, "step": 4139 }, { "epoch": 2.818522301668369, "grad_norm": 1.8290075063705444, "learning_rate": 4.3563968213263556e-06, "loss": 0.4354, "mean_token_accuracy": 0.8528355360031128, "step": 4140 }, { "epoch": 2.8192032686414707, "grad_norm": 1.6251635551452637, "learning_rate": 4.354162943358524e-06, "loss": 0.5953, "mean_token_accuracy": 0.7825496196746826, "step": 4141 }, { "epoch": 2.819884235614573, "grad_norm": 1.7543619871139526, "learning_rate": 4.35192919648486e-06, "loss": 0.5452, "mean_token_accuracy": 0.8015954196453094, "step": 4142 }, { "epoch": 2.8205652025876744, "grad_norm": 1.8610485792160034, "learning_rate": 4.349695581158774e-06, "loss": 0.5044, "mean_token_accuracy": 0.8272472620010376, "step": 4143 }, { "epoch": 2.8212461695607765, "grad_norm": 1.82586669921875, "learning_rate": 4.347462097833656e-06, "loss": 0.4813, "mean_token_accuracy": 0.823549896478653, "step": 4144 }, { "epoch": 2.821927136533878, "grad_norm": 1.7665457725524902, "learning_rate": 4.345228746962865e-06, "loss": 0.3676, "mean_token_accuracy": 0.847511500120163, "step": 4145 }, { "epoch": 2.8226081035069797, "grad_norm": 1.760433316230774, "learning_rate": 4.342995528999734e-06, "loss": 0.552, "mean_token_accuracy": 0.8130869269371033, "step": 4146 }, { "epoch": 2.8232890704800817, "grad_norm": 1.6546013355255127, "learning_rate": 4.340762444397569e-06, "loss": 0.5985, "mean_token_accuracy": 0.7834160029888153, "step": 4147 }, { "epoch": 2.8239700374531838, "grad_norm": 1.7916358709335327, "learning_rate": 4.338529493609647e-06, "loss": 0.4098, "mean_token_accuracy": 0.8599533140659332, "step": 4148 }, { "epoch": 2.8246510044262854, "grad_norm": 1.7323201894760132, "learning_rate": 4.336296677089225e-06, "loss": 0.4411, "mean_token_accuracy": 0.8379731774330139, "step": 4149 }, { "epoch": 2.825331971399387, "grad_norm": 1.8223905563354492, "learning_rate": 4.3340639952895215e-06, "loss": 0.4752, "mean_token_accuracy": 0.8274110853672028, "step": 4150 }, { "epoch": 2.826012938372489, "grad_norm": 1.8006091117858887, "learning_rate": 4.331831448663739e-06, "loss": 0.3506, "mean_token_accuracy": 0.8821897506713867, "step": 4151 }, { "epoch": 2.8266939053455906, "grad_norm": 1.7095630168914795, "learning_rate": 4.329599037665044e-06, "loss": 0.7312, "mean_token_accuracy": 0.746852308511734, "step": 4152 }, { "epoch": 2.8273748723186927, "grad_norm": 1.9111988544464111, "learning_rate": 4.32736676274658e-06, "loss": 0.4183, "mean_token_accuracy": 0.8650788962841034, "step": 4153 }, { "epoch": 2.8280558392917943, "grad_norm": 1.957846999168396, "learning_rate": 4.325134624361461e-06, "loss": 0.598, "mean_token_accuracy": 0.7929465174674988, "step": 4154 }, { "epoch": 2.828736806264896, "grad_norm": 1.7548216581344604, "learning_rate": 4.322902622962772e-06, "loss": 0.5199, "mean_token_accuracy": 0.7725228071212769, "step": 4155 }, { "epoch": 2.829417773237998, "grad_norm": 1.923699140548706, "learning_rate": 4.3206707590035765e-06, "loss": 0.4635, "mean_token_accuracy": 0.8415305912494659, "step": 4156 }, { "epoch": 2.8300987402111, "grad_norm": 1.7533644437789917, "learning_rate": 4.318439032936903e-06, "loss": 0.5417, "mean_token_accuracy": 0.7917515933513641, "step": 4157 }, { "epoch": 2.8307797071842016, "grad_norm": 1.8578639030456543, "learning_rate": 4.316207445215756e-06, "loss": 0.4501, "mean_token_accuracy": 0.8381292223930359, "step": 4158 }, { "epoch": 2.831460674157303, "grad_norm": 1.728368878364563, "learning_rate": 4.31397599629311e-06, "loss": 0.663, "mean_token_accuracy": 0.755296379327774, "step": 4159 }, { "epoch": 2.8321416411304052, "grad_norm": 1.7240941524505615, "learning_rate": 4.3117446866219105e-06, "loss": 0.5237, "mean_token_accuracy": 0.8105067610740662, "step": 4160 }, { "epoch": 2.832822608103507, "grad_norm": 1.942743182182312, "learning_rate": 4.309513516655079e-06, "loss": 0.4642, "mean_token_accuracy": 0.8345344662666321, "step": 4161 }, { "epoch": 2.833503575076609, "grad_norm": 1.8477838039398193, "learning_rate": 4.307282486845502e-06, "loss": 0.3628, "mean_token_accuracy": 0.876635879278183, "step": 4162 }, { "epoch": 2.8341845420497105, "grad_norm": 1.8072881698608398, "learning_rate": 4.305051597646046e-06, "loss": 0.6412, "mean_token_accuracy": 0.7605000138282776, "step": 4163 }, { "epoch": 2.8348655090228125, "grad_norm": 1.8815871477127075, "learning_rate": 4.302820849509542e-06, "loss": 0.4561, "mean_token_accuracy": 0.8358868658542633, "step": 4164 }, { "epoch": 2.835546475995914, "grad_norm": 1.8747116327285767, "learning_rate": 4.300590242888794e-06, "loss": 0.5618, "mean_token_accuracy": 0.8020979762077332, "step": 4165 }, { "epoch": 2.836227442969016, "grad_norm": 1.8278900384902954, "learning_rate": 4.29835977823658e-06, "loss": 0.5019, "mean_token_accuracy": 0.8275561630725861, "step": 4166 }, { "epoch": 2.836908409942118, "grad_norm": 1.7116316556930542, "learning_rate": 4.296129456005645e-06, "loss": 0.5168, "mean_token_accuracy": 0.8339020013809204, "step": 4167 }, { "epoch": 2.8375893769152194, "grad_norm": 1.816969633102417, "learning_rate": 4.2938992766487095e-06, "loss": 0.5466, "mean_token_accuracy": 0.8150254786014557, "step": 4168 }, { "epoch": 2.8382703438883214, "grad_norm": 1.7218894958496094, "learning_rate": 4.291669240618461e-06, "loss": 0.5927, "mean_token_accuracy": 0.8053378164768219, "step": 4169 }, { "epoch": 2.8389513108614235, "grad_norm": 1.9048880338668823, "learning_rate": 4.289439348367559e-06, "loss": 0.4272, "mean_token_accuracy": 0.8439496457576752, "step": 4170 }, { "epoch": 2.839632277834525, "grad_norm": 1.9827202558517456, "learning_rate": 4.287209600348637e-06, "loss": 0.5053, "mean_token_accuracy": 0.771272599697113, "step": 4171 }, { "epoch": 2.8403132448076267, "grad_norm": 1.9304410219192505, "learning_rate": 4.284979997014294e-06, "loss": 0.5704, "mean_token_accuracy": 0.806671679019928, "step": 4172 }, { "epoch": 2.8409942117807288, "grad_norm": 1.9468516111373901, "learning_rate": 4.282750538817106e-06, "loss": 0.4323, "mean_token_accuracy": 0.8502166867256165, "step": 4173 }, { "epoch": 2.8416751787538304, "grad_norm": 1.8535219430923462, "learning_rate": 4.280521226209614e-06, "loss": 0.4187, "mean_token_accuracy": 0.848837822675705, "step": 4174 }, { "epoch": 2.8423561457269324, "grad_norm": 1.7671935558319092, "learning_rate": 4.27829205964433e-06, "loss": 0.5111, "mean_token_accuracy": 0.8048777878284454, "step": 4175 }, { "epoch": 2.843037112700034, "grad_norm": 1.548001766204834, "learning_rate": 4.2760630395737385e-06, "loss": 0.6709, "mean_token_accuracy": 0.7693336308002472, "step": 4176 }, { "epoch": 2.8437180796731356, "grad_norm": 1.7947759628295898, "learning_rate": 4.273834166450292e-06, "loss": 0.5246, "mean_token_accuracy": 0.8077475726604462, "step": 4177 }, { "epoch": 2.8443990466462377, "grad_norm": 1.6447950601577759, "learning_rate": 4.271605440726418e-06, "loss": 0.6653, "mean_token_accuracy": 0.7712818384170532, "step": 4178 }, { "epoch": 2.8450800136193397, "grad_norm": 1.8028513193130493, "learning_rate": 4.26937686285451e-06, "loss": 0.4295, "mean_token_accuracy": 0.8546448051929474, "step": 4179 }, { "epoch": 2.8457609805924413, "grad_norm": 1.8667936325073242, "learning_rate": 4.26714843328693e-06, "loss": 0.4826, "mean_token_accuracy": 0.8254448771476746, "step": 4180 }, { "epoch": 2.846441947565543, "grad_norm": 1.7897074222564697, "learning_rate": 4.264920152476015e-06, "loss": 0.5824, "mean_token_accuracy": 0.7864391803741455, "step": 4181 }, { "epoch": 2.847122914538645, "grad_norm": 1.6894559860229492, "learning_rate": 4.2626920208740644e-06, "loss": 0.6753, "mean_token_accuracy": 0.7588403522968292, "step": 4182 }, { "epoch": 2.8478038815117466, "grad_norm": 1.885414719581604, "learning_rate": 4.260464038933356e-06, "loss": 0.4937, "mean_token_accuracy": 0.8232785761356354, "step": 4183 }, { "epoch": 2.8484848484848486, "grad_norm": 1.8357234001159668, "learning_rate": 4.258236207106132e-06, "loss": 0.4123, "mean_token_accuracy": 0.8587775230407715, "step": 4184 }, { "epoch": 2.8491658154579502, "grad_norm": 1.8576886653900146, "learning_rate": 4.256008525844603e-06, "loss": 0.5758, "mean_token_accuracy": 0.7901118397712708, "step": 4185 }, { "epoch": 2.849846782431052, "grad_norm": 1.831027626991272, "learning_rate": 4.253780995600954e-06, "loss": 0.4677, "mean_token_accuracy": 0.8430272340774536, "step": 4186 }, { "epoch": 2.850527749404154, "grad_norm": 1.6860144138336182, "learning_rate": 4.251553616827336e-06, "loss": 0.5853, "mean_token_accuracy": 0.8088220059871674, "step": 4187 }, { "epoch": 2.851208716377256, "grad_norm": 1.8507416248321533, "learning_rate": 4.24932638997587e-06, "loss": 0.4884, "mean_token_accuracy": 0.8272199928760529, "step": 4188 }, { "epoch": 2.8518896833503575, "grad_norm": 1.6810672283172607, "learning_rate": 4.247099315498645e-06, "loss": 0.5885, "mean_token_accuracy": 0.7900225520133972, "step": 4189 }, { "epoch": 2.852570650323459, "grad_norm": 1.864790439605713, "learning_rate": 4.244872393847719e-06, "loss": 0.4366, "mean_token_accuracy": 0.8554016947746277, "step": 4190 }, { "epoch": 2.853251617296561, "grad_norm": 1.933195948600769, "learning_rate": 4.2426456254751224e-06, "loss": 0.6145, "mean_token_accuracy": 0.7922805547714233, "step": 4191 }, { "epoch": 2.853932584269663, "grad_norm": 1.785265326499939, "learning_rate": 4.240419010832849e-06, "loss": 0.4924, "mean_token_accuracy": 0.8069719970226288, "step": 4192 }, { "epoch": 2.854613551242765, "grad_norm": 1.9420619010925293, "learning_rate": 4.238192550372868e-06, "loss": 0.4208, "mean_token_accuracy": 0.847986251115799, "step": 4193 }, { "epoch": 2.8552945182158664, "grad_norm": 1.825312852859497, "learning_rate": 4.2359662445471135e-06, "loss": 0.4366, "mean_token_accuracy": 0.8487889468669891, "step": 4194 }, { "epoch": 2.8559754851889685, "grad_norm": 1.8553811311721802, "learning_rate": 4.233740093807485e-06, "loss": 0.5214, "mean_token_accuracy": 0.8095127940177917, "step": 4195 }, { "epoch": 2.85665645216207, "grad_norm": 1.6050833463668823, "learning_rate": 4.231514098605858e-06, "loss": 0.5417, "mean_token_accuracy": 0.7898389995098114, "step": 4196 }, { "epoch": 2.857337419135172, "grad_norm": 1.9033551216125488, "learning_rate": 4.2292882593940696e-06, "loss": 0.4565, "mean_token_accuracy": 0.8383890390396118, "step": 4197 }, { "epoch": 2.8580183861082737, "grad_norm": 1.7423880100250244, "learning_rate": 4.227062576623931e-06, "loss": 0.4493, "mean_token_accuracy": 0.8102967143058777, "step": 4198 }, { "epoch": 2.8586993530813753, "grad_norm": 1.6596667766571045, "learning_rate": 4.224837050747216e-06, "loss": 0.6331, "mean_token_accuracy": 0.7779891192913055, "step": 4199 }, { "epoch": 2.8593803200544774, "grad_norm": 1.976667881011963, "learning_rate": 4.22261168221567e-06, "loss": 0.4748, "mean_token_accuracy": 0.8374482095241547, "step": 4200 }, { "epoch": 2.860061287027579, "grad_norm": 1.7734335660934448, "learning_rate": 4.220386471481006e-06, "loss": 0.5379, "mean_token_accuracy": 0.8198884427547455, "step": 4201 }, { "epoch": 2.860742254000681, "grad_norm": 1.8069589138031006, "learning_rate": 4.218161418994905e-06, "loss": 0.4911, "mean_token_accuracy": 0.7940426766872406, "step": 4202 }, { "epoch": 2.8614232209737827, "grad_norm": 1.8036112785339355, "learning_rate": 4.2159365252090155e-06, "loss": 0.5333, "mean_token_accuracy": 0.8069603443145752, "step": 4203 }, { "epoch": 2.8621041879468847, "grad_norm": 1.8370689153671265, "learning_rate": 4.213711790574954e-06, "loss": 0.5352, "mean_token_accuracy": 0.815561980009079, "step": 4204 }, { "epoch": 2.8627851549199863, "grad_norm": 1.898891806602478, "learning_rate": 4.2114872155443035e-06, "loss": 0.3562, "mean_token_accuracy": 0.8774846196174622, "step": 4205 }, { "epoch": 2.8634661218930884, "grad_norm": 1.8157821893692017, "learning_rate": 4.209262800568618e-06, "loss": 0.4221, "mean_token_accuracy": 0.8544164001941681, "step": 4206 }, { "epoch": 2.86414708886619, "grad_norm": 1.81473708152771, "learning_rate": 4.207038546099412e-06, "loss": 0.5291, "mean_token_accuracy": 0.8180319368839264, "step": 4207 }, { "epoch": 2.8648280558392916, "grad_norm": 1.8034145832061768, "learning_rate": 4.204814452588178e-06, "loss": 0.4559, "mean_token_accuracy": 0.8421880602836609, "step": 4208 }, { "epoch": 2.8655090228123936, "grad_norm": 1.9170043468475342, "learning_rate": 4.202590520486368e-06, "loss": 0.5443, "mean_token_accuracy": 0.8157675862312317, "step": 4209 }, { "epoch": 2.8661899897854957, "grad_norm": 1.798509120941162, "learning_rate": 4.2003667502454e-06, "loss": 0.462, "mean_token_accuracy": 0.8293232321739197, "step": 4210 }, { "epoch": 2.8668709567585973, "grad_norm": 1.965796947479248, "learning_rate": 4.198143142316667e-06, "loss": 0.4538, "mean_token_accuracy": 0.8328068852424622, "step": 4211 }, { "epoch": 2.867551923731699, "grad_norm": 1.7375755310058594, "learning_rate": 4.19591969715152e-06, "loss": 0.5318, "mean_token_accuracy": 0.8223453760147095, "step": 4212 }, { "epoch": 2.868232890704801, "grad_norm": 1.7523311376571655, "learning_rate": 4.193696415201286e-06, "loss": 0.5214, "mean_token_accuracy": 0.8098921775817871, "step": 4213 }, { "epoch": 2.8689138576779025, "grad_norm": 1.7877167463302612, "learning_rate": 4.191473296917251e-06, "loss": 0.5259, "mean_token_accuracy": 0.8038459420204163, "step": 4214 }, { "epoch": 2.8695948246510046, "grad_norm": 1.736160159111023, "learning_rate": 4.189250342750669e-06, "loss": 0.4614, "mean_token_accuracy": 0.8306402862071991, "step": 4215 }, { "epoch": 2.870275791624106, "grad_norm": 1.6606780290603638, "learning_rate": 4.1870275531527685e-06, "loss": 0.5638, "mean_token_accuracy": 0.8019594252109528, "step": 4216 }, { "epoch": 2.870956758597208, "grad_norm": 1.8160747289657593, "learning_rate": 4.184804928574734e-06, "loss": 0.4963, "mean_token_accuracy": 0.8307451009750366, "step": 4217 }, { "epoch": 2.87163772557031, "grad_norm": 1.700251579284668, "learning_rate": 4.182582469467723e-06, "loss": 0.6304, "mean_token_accuracy": 0.7699013948440552, "step": 4218 }, { "epoch": 2.872318692543412, "grad_norm": 1.8243403434753418, "learning_rate": 4.180360176282859e-06, "loss": 0.4308, "mean_token_accuracy": 0.8457906246185303, "step": 4219 }, { "epoch": 2.8729996595165135, "grad_norm": 1.7983736991882324, "learning_rate": 4.178138049471228e-06, "loss": 0.433, "mean_token_accuracy": 0.8331515491008759, "step": 4220 }, { "epoch": 2.873680626489615, "grad_norm": 1.6987881660461426, "learning_rate": 4.1759160894838866e-06, "loss": 0.613, "mean_token_accuracy": 0.7843351066112518, "step": 4221 }, { "epoch": 2.874361593462717, "grad_norm": 1.909280776977539, "learning_rate": 4.173694296771853e-06, "loss": 0.4602, "mean_token_accuracy": 0.8398928344249725, "step": 4222 }, { "epoch": 2.8750425604358187, "grad_norm": 1.7985267639160156, "learning_rate": 4.171472671786118e-06, "loss": 0.459, "mean_token_accuracy": 0.8423614203929901, "step": 4223 }, { "epoch": 2.875723527408921, "grad_norm": 1.6573536396026611, "learning_rate": 4.169251214977632e-06, "loss": 0.588, "mean_token_accuracy": 0.7951467335224152, "step": 4224 }, { "epoch": 2.8764044943820224, "grad_norm": 1.7850966453552246, "learning_rate": 4.167029926797313e-06, "loss": 0.4672, "mean_token_accuracy": 0.8271413445472717, "step": 4225 }, { "epoch": 2.8770854613551244, "grad_norm": 1.7965419292449951, "learning_rate": 4.164808807696049e-06, "loss": 0.5289, "mean_token_accuracy": 0.8142670691013336, "step": 4226 }, { "epoch": 2.877766428328226, "grad_norm": 1.700174331665039, "learning_rate": 4.162587858124686e-06, "loss": 0.458, "mean_token_accuracy": 0.8280965387821198, "step": 4227 }, { "epoch": 2.878447395301328, "grad_norm": 1.7215266227722168, "learning_rate": 4.1603670785340435e-06, "loss": 0.56, "mean_token_accuracy": 0.8032138049602509, "step": 4228 }, { "epoch": 2.8791283622744297, "grad_norm": 1.8856459856033325, "learning_rate": 4.158146469374901e-06, "loss": 0.4469, "mean_token_accuracy": 0.8188730180263519, "step": 4229 }, { "epoch": 2.8798093292475313, "grad_norm": 1.924331784248352, "learning_rate": 4.155926031098002e-06, "loss": 0.418, "mean_token_accuracy": 0.8426766693592072, "step": 4230 }, { "epoch": 2.8804902962206334, "grad_norm": 2.0094053745269775, "learning_rate": 4.153705764154064e-06, "loss": 0.4535, "mean_token_accuracy": 0.8478063642978668, "step": 4231 }, { "epoch": 2.881171263193735, "grad_norm": 1.7069286108016968, "learning_rate": 4.151485668993762e-06, "loss": 0.6312, "mean_token_accuracy": 0.7798939347267151, "step": 4232 }, { "epoch": 2.881852230166837, "grad_norm": 1.9513858556747437, "learning_rate": 4.1492657460677375e-06, "loss": 0.5028, "mean_token_accuracy": 0.8138614594936371, "step": 4233 }, { "epoch": 2.8825331971399386, "grad_norm": 1.8367968797683716, "learning_rate": 4.1470459958265995e-06, "loss": 0.5334, "mean_token_accuracy": 0.7954498827457428, "step": 4234 }, { "epoch": 2.8832141641130407, "grad_norm": 1.712620496749878, "learning_rate": 4.144826418720919e-06, "loss": 0.4856, "mean_token_accuracy": 0.8186257481575012, "step": 4235 }, { "epoch": 2.8838951310861423, "grad_norm": 2.0190396308898926, "learning_rate": 4.1426070152012335e-06, "loss": 0.4663, "mean_token_accuracy": 0.8483730852603912, "step": 4236 }, { "epoch": 2.8845760980592443, "grad_norm": 1.7124433517456055, "learning_rate": 4.1403877857180445e-06, "loss": 0.6192, "mean_token_accuracy": 0.7766081094741821, "step": 4237 }, { "epoch": 2.885257065032346, "grad_norm": 1.6963036060333252, "learning_rate": 4.13816873072182e-06, "loss": 0.4684, "mean_token_accuracy": 0.8187283873558044, "step": 4238 }, { "epoch": 2.8859380320054475, "grad_norm": 1.7227321863174438, "learning_rate": 4.13594985066299e-06, "loss": 0.5324, "mean_token_accuracy": 0.8223837316036224, "step": 4239 }, { "epoch": 2.8866189989785496, "grad_norm": 1.7959167957305908, "learning_rate": 4.133731145991951e-06, "loss": 0.4981, "mean_token_accuracy": 0.8273895382881165, "step": 4240 }, { "epoch": 2.8872999659516516, "grad_norm": 1.8382227420806885, "learning_rate": 4.131512617159062e-06, "loss": 0.4508, "mean_token_accuracy": 0.8316396176815033, "step": 4241 }, { "epoch": 2.887980932924753, "grad_norm": 1.8495080471038818, "learning_rate": 4.1292942646146486e-06, "loss": 0.5145, "mean_token_accuracy": 0.8160418272018433, "step": 4242 }, { "epoch": 2.888661899897855, "grad_norm": 1.9742517471313477, "learning_rate": 4.127076088809e-06, "loss": 0.3741, "mean_token_accuracy": 0.8679704368114471, "step": 4243 }, { "epoch": 2.889342866870957, "grad_norm": 1.7982630729675293, "learning_rate": 4.1248580901923674e-06, "loss": 0.4454, "mean_token_accuracy": 0.8510293364524841, "step": 4244 }, { "epoch": 2.8900238338440585, "grad_norm": 1.7055981159210205, "learning_rate": 4.1226402692149655e-06, "loss": 0.5998, "mean_token_accuracy": 0.7700488567352295, "step": 4245 }, { "epoch": 2.8907048008171605, "grad_norm": 1.9223978519439697, "learning_rate": 4.120422626326979e-06, "loss": 0.4679, "mean_token_accuracy": 0.8251345753669739, "step": 4246 }, { "epoch": 2.891385767790262, "grad_norm": 1.8246005773544312, "learning_rate": 4.118205161978552e-06, "loss": 0.5112, "mean_token_accuracy": 0.7983720898628235, "step": 4247 }, { "epoch": 2.8920667347633637, "grad_norm": 1.840940237045288, "learning_rate": 4.115987876619792e-06, "loss": 0.4538, "mean_token_accuracy": 0.8336587846279144, "step": 4248 }, { "epoch": 2.892747701736466, "grad_norm": 1.9911842346191406, "learning_rate": 4.1137707707007715e-06, "loss": 0.4752, "mean_token_accuracy": 0.8430758416652679, "step": 4249 }, { "epoch": 2.893428668709568, "grad_norm": 1.910124659538269, "learning_rate": 4.111553844671524e-06, "loss": 0.4997, "mean_token_accuracy": 0.8367056846618652, "step": 4250 }, { "epoch": 2.8941096356826694, "grad_norm": 1.8473777770996094, "learning_rate": 4.10933709898205e-06, "loss": 0.3962, "mean_token_accuracy": 0.8714910447597504, "step": 4251 }, { "epoch": 2.894790602655771, "grad_norm": 1.8518849611282349, "learning_rate": 4.1071205340823115e-06, "loss": 0.4641, "mean_token_accuracy": 0.8240261673927307, "step": 4252 }, { "epoch": 2.895471569628873, "grad_norm": 1.6539890766143799, "learning_rate": 4.104904150422236e-06, "loss": 0.6413, "mean_token_accuracy": 0.7828200161457062, "step": 4253 }, { "epoch": 2.8961525366019747, "grad_norm": 1.585254192352295, "learning_rate": 4.102687948451711e-06, "loss": 0.6748, "mean_token_accuracy": 0.768392950296402, "step": 4254 }, { "epoch": 2.8968335035750767, "grad_norm": 1.869718074798584, "learning_rate": 4.100471928620589e-06, "loss": 0.5168, "mean_token_accuracy": 0.8240197002887726, "step": 4255 }, { "epoch": 2.8975144705481783, "grad_norm": 1.7076194286346436, "learning_rate": 4.098256091378684e-06, "loss": 0.5525, "mean_token_accuracy": 0.8267101347446442, "step": 4256 }, { "epoch": 2.89819543752128, "grad_norm": 1.8440693616867065, "learning_rate": 4.096040437175776e-06, "loss": 0.5449, "mean_token_accuracy": 0.808507889509201, "step": 4257 }, { "epoch": 2.898876404494382, "grad_norm": 1.8049652576446533, "learning_rate": 4.093824966461605e-06, "loss": 0.5189, "mean_token_accuracy": 0.8276647627353668, "step": 4258 }, { "epoch": 2.899557371467484, "grad_norm": 1.791693925857544, "learning_rate": 4.091609679685872e-06, "loss": 0.4743, "mean_token_accuracy": 0.8110148012638092, "step": 4259 }, { "epoch": 2.9002383384405857, "grad_norm": 1.8140355348587036, "learning_rate": 4.089394577298248e-06, "loss": 0.3311, "mean_token_accuracy": 0.8896600008010864, "step": 4260 }, { "epoch": 2.9009193054136873, "grad_norm": 1.9020270109176636, "learning_rate": 4.087179659748361e-06, "loss": 0.4401, "mean_token_accuracy": 0.843631237745285, "step": 4261 }, { "epoch": 2.9016002723867893, "grad_norm": 1.9207417964935303, "learning_rate": 4.084964927485799e-06, "loss": 0.4497, "mean_token_accuracy": 0.8430925011634827, "step": 4262 }, { "epoch": 2.902281239359891, "grad_norm": 1.974886178970337, "learning_rate": 4.08275038096012e-06, "loss": 0.4758, "mean_token_accuracy": 0.8389378488063812, "step": 4263 }, { "epoch": 2.902962206332993, "grad_norm": 1.6664589643478394, "learning_rate": 4.080536020620835e-06, "loss": 0.5924, "mean_token_accuracy": 0.7882455587387085, "step": 4264 }, { "epoch": 2.9036431733060946, "grad_norm": 1.6679320335388184, "learning_rate": 4.078321846917428e-06, "loss": 0.5932, "mean_token_accuracy": 0.7890698611736298, "step": 4265 }, { "epoch": 2.9043241402791966, "grad_norm": 1.776602029800415, "learning_rate": 4.076107860299336e-06, "loss": 0.4595, "mean_token_accuracy": 0.8498111665248871, "step": 4266 }, { "epoch": 2.905005107252298, "grad_norm": 1.8215088844299316, "learning_rate": 4.073894061215961e-06, "loss": 0.4857, "mean_token_accuracy": 0.8341466784477234, "step": 4267 }, { "epoch": 2.9056860742254003, "grad_norm": 1.8593121767044067, "learning_rate": 4.07168045011667e-06, "loss": 0.4873, "mean_token_accuracy": 0.8313123285770416, "step": 4268 }, { "epoch": 2.906367041198502, "grad_norm": 1.7450860738754272, "learning_rate": 4.0694670274507855e-06, "loss": 0.5935, "mean_token_accuracy": 0.8013095557689667, "step": 4269 }, { "epoch": 2.9070480081716035, "grad_norm": 1.720666527748108, "learning_rate": 4.0672537936676e-06, "loss": 0.6981, "mean_token_accuracy": 0.7546899914741516, "step": 4270 }, { "epoch": 2.9077289751447055, "grad_norm": 1.892319679260254, "learning_rate": 4.065040749216359e-06, "loss": 0.3745, "mean_token_accuracy": 0.8724323809146881, "step": 4271 }, { "epoch": 2.908409942117807, "grad_norm": 1.6938608884811401, "learning_rate": 4.062827894546275e-06, "loss": 0.6003, "mean_token_accuracy": 0.7757946252822876, "step": 4272 }, { "epoch": 2.909090909090909, "grad_norm": 1.8513832092285156, "learning_rate": 4.060615230106522e-06, "loss": 0.4858, "mean_token_accuracy": 0.826555609703064, "step": 4273 }, { "epoch": 2.9097718760640108, "grad_norm": 1.8370981216430664, "learning_rate": 4.058402756346232e-06, "loss": 0.5044, "mean_token_accuracy": 0.7936377823352814, "step": 4274 }, { "epoch": 2.910452843037113, "grad_norm": 1.6862258911132812, "learning_rate": 4.056190473714502e-06, "loss": 0.5237, "mean_token_accuracy": 0.8125124871730804, "step": 4275 }, { "epoch": 2.9111338100102144, "grad_norm": 1.8483190536499023, "learning_rate": 4.053978382660388e-06, "loss": 0.4243, "mean_token_accuracy": 0.8548761606216431, "step": 4276 }, { "epoch": 2.9118147769833165, "grad_norm": 1.7479592561721802, "learning_rate": 4.051766483632906e-06, "loss": 0.541, "mean_token_accuracy": 0.8012154698371887, "step": 4277 }, { "epoch": 2.912495743956418, "grad_norm": 1.8266876935958862, "learning_rate": 4.049554777081038e-06, "loss": 0.5374, "mean_token_accuracy": 0.8013880550861359, "step": 4278 }, { "epoch": 2.9131767109295197, "grad_norm": 1.9700167179107666, "learning_rate": 4.047343263453721e-06, "loss": 0.3645, "mean_token_accuracy": 0.8682436347007751, "step": 4279 }, { "epoch": 2.9138576779026217, "grad_norm": 1.937796711921692, "learning_rate": 4.0451319431998565e-06, "loss": 0.4495, "mean_token_accuracy": 0.8416050672531128, "step": 4280 }, { "epoch": 2.914538644875724, "grad_norm": 1.924264907836914, "learning_rate": 4.0429208167683055e-06, "loss": 0.4709, "mean_token_accuracy": 0.8022502064704895, "step": 4281 }, { "epoch": 2.9152196118488254, "grad_norm": 1.8400758504867554, "learning_rate": 4.0407098846078876e-06, "loss": 0.5216, "mean_token_accuracy": 0.8135822713375092, "step": 4282 }, { "epoch": 2.915900578821927, "grad_norm": 1.7041207551956177, "learning_rate": 4.03849914716739e-06, "loss": 0.5346, "mean_token_accuracy": 0.7971119284629822, "step": 4283 }, { "epoch": 2.916581545795029, "grad_norm": 1.853523850440979, "learning_rate": 4.0362886048955505e-06, "loss": 0.5554, "mean_token_accuracy": 0.800065815448761, "step": 4284 }, { "epoch": 2.9172625127681306, "grad_norm": 1.727640151977539, "learning_rate": 4.034078258241077e-06, "loss": 0.5714, "mean_token_accuracy": 0.8036634922027588, "step": 4285 }, { "epoch": 2.9179434797412327, "grad_norm": 2.0240540504455566, "learning_rate": 4.03186810765263e-06, "loss": 0.3941, "mean_token_accuracy": 0.8535207509994507, "step": 4286 }, { "epoch": 2.9186244467143343, "grad_norm": 1.8466461896896362, "learning_rate": 4.029658153578833e-06, "loss": 0.6321, "mean_token_accuracy": 0.758667379617691, "step": 4287 }, { "epoch": 2.919305413687436, "grad_norm": 1.6573731899261475, "learning_rate": 4.027448396468273e-06, "loss": 0.4845, "mean_token_accuracy": 0.8302209675312042, "step": 4288 }, { "epoch": 2.919986380660538, "grad_norm": 1.703972339630127, "learning_rate": 4.0252388367694895e-06, "loss": 0.5313, "mean_token_accuracy": 0.821701318025589, "step": 4289 }, { "epoch": 2.92066734763364, "grad_norm": 1.64605712890625, "learning_rate": 4.02302947493099e-06, "loss": 0.6424, "mean_token_accuracy": 0.7567314803600311, "step": 4290 }, { "epoch": 2.9213483146067416, "grad_norm": 1.6711187362670898, "learning_rate": 4.020820311401238e-06, "loss": 0.514, "mean_token_accuracy": 0.8154551386833191, "step": 4291 }, { "epoch": 2.922029281579843, "grad_norm": 1.783675193786621, "learning_rate": 4.018611346628654e-06, "loss": 0.5012, "mean_token_accuracy": 0.8230370879173279, "step": 4292 }, { "epoch": 2.9227102485529453, "grad_norm": 1.7991238832473755, "learning_rate": 4.0164025810616235e-06, "loss": 0.5636, "mean_token_accuracy": 0.7968301475048065, "step": 4293 }, { "epoch": 2.923391215526047, "grad_norm": 1.843888521194458, "learning_rate": 4.014194015148488e-06, "loss": 0.4395, "mean_token_accuracy": 0.834936112165451, "step": 4294 }, { "epoch": 2.924072182499149, "grad_norm": 1.990447759628296, "learning_rate": 4.01198564933755e-06, "loss": 0.4246, "mean_token_accuracy": 0.8564800024032593, "step": 4295 }, { "epoch": 2.9247531494722505, "grad_norm": 2.025158166885376, "learning_rate": 4.009777484077072e-06, "loss": 0.3937, "mean_token_accuracy": 0.8656134307384491, "step": 4296 }, { "epoch": 2.9254341164453526, "grad_norm": 1.6855300664901733, "learning_rate": 4.007569519815271e-06, "loss": 0.5759, "mean_token_accuracy": 0.8019188344478607, "step": 4297 }, { "epoch": 2.926115083418454, "grad_norm": 1.7537213563919067, "learning_rate": 4.005361757000333e-06, "loss": 0.5738, "mean_token_accuracy": 0.7809726893901825, "step": 4298 }, { "epoch": 2.926796050391556, "grad_norm": 1.8627188205718994, "learning_rate": 4.003154196080391e-06, "loss": 0.5472, "mean_token_accuracy": 0.7988318800926208, "step": 4299 }, { "epoch": 2.927477017364658, "grad_norm": 1.8316338062286377, "learning_rate": 4.000946837503549e-06, "loss": 0.4493, "mean_token_accuracy": 0.8481617867946625, "step": 4300 }, { "epoch": 2.9281579843377594, "grad_norm": 1.8386238813400269, "learning_rate": 3.998739681717859e-06, "loss": 0.5135, "mean_token_accuracy": 0.8061385452747345, "step": 4301 }, { "epoch": 2.9288389513108615, "grad_norm": 1.7927242517471313, "learning_rate": 3.99653272917134e-06, "loss": 0.3463, "mean_token_accuracy": 0.8725826740264893, "step": 4302 }, { "epoch": 2.929519918283963, "grad_norm": 1.6176739931106567, "learning_rate": 3.994325980311965e-06, "loss": 0.6365, "mean_token_accuracy": 0.7803708910942078, "step": 4303 }, { "epoch": 2.930200885257065, "grad_norm": 1.8446574211120605, "learning_rate": 3.992119435587666e-06, "loss": 0.4775, "mean_token_accuracy": 0.8097345232963562, "step": 4304 }, { "epoch": 2.9308818522301667, "grad_norm": 1.7932169437408447, "learning_rate": 3.989913095446338e-06, "loss": 0.5276, "mean_token_accuracy": 0.8008947670459747, "step": 4305 }, { "epoch": 2.9315628192032688, "grad_norm": 1.8464210033416748, "learning_rate": 3.9877069603358306e-06, "loss": 0.4095, "mean_token_accuracy": 0.8526546657085419, "step": 4306 }, { "epoch": 2.9322437861763704, "grad_norm": 1.8086328506469727, "learning_rate": 3.98550103070395e-06, "loss": 0.4124, "mean_token_accuracy": 0.8614432215690613, "step": 4307 }, { "epoch": 2.9329247531494724, "grad_norm": 1.5702863931655884, "learning_rate": 3.983295306998467e-06, "loss": 0.6944, "mean_token_accuracy": 0.7462605834007263, "step": 4308 }, { "epoch": 2.933605720122574, "grad_norm": 1.7063840627670288, "learning_rate": 3.981089789667102e-06, "loss": 0.5587, "mean_token_accuracy": 0.817766010761261, "step": 4309 }, { "epoch": 2.9342866870956756, "grad_norm": 1.762518286705017, "learning_rate": 3.978884479157543e-06, "loss": 0.5072, "mean_token_accuracy": 0.8385766744613647, "step": 4310 }, { "epoch": 2.9349676540687777, "grad_norm": 1.8472208976745605, "learning_rate": 3.9766793759174305e-06, "loss": 0.569, "mean_token_accuracy": 0.7941735088825226, "step": 4311 }, { "epoch": 2.9356486210418797, "grad_norm": 2.0079665184020996, "learning_rate": 3.9744744803943595e-06, "loss": 0.5405, "mean_token_accuracy": 0.8236522674560547, "step": 4312 }, { "epoch": 2.9363295880149813, "grad_norm": 1.8070951700210571, "learning_rate": 3.972269793035891e-06, "loss": 0.5858, "mean_token_accuracy": 0.7989346086978912, "step": 4313 }, { "epoch": 2.937010554988083, "grad_norm": 1.7500931024551392, "learning_rate": 3.970065314289539e-06, "loss": 0.3681, "mean_token_accuracy": 0.8725044429302216, "step": 4314 }, { "epoch": 2.937691521961185, "grad_norm": 1.8528331518173218, "learning_rate": 3.967861044602777e-06, "loss": 0.4507, "mean_token_accuracy": 0.8319549262523651, "step": 4315 }, { "epoch": 2.9383724889342866, "grad_norm": 1.794579029083252, "learning_rate": 3.9656569844230335e-06, "loss": 0.5347, "mean_token_accuracy": 0.8302786946296692, "step": 4316 }, { "epoch": 2.9390534559073886, "grad_norm": 1.7537637948989868, "learning_rate": 3.963453134197696e-06, "loss": 0.5389, "mean_token_accuracy": 0.801173061132431, "step": 4317 }, { "epoch": 2.9397344228804902, "grad_norm": 1.7067651748657227, "learning_rate": 3.9612494943741095e-06, "loss": 0.6464, "mean_token_accuracy": 0.7501866221427917, "step": 4318 }, { "epoch": 2.940415389853592, "grad_norm": 1.823914885520935, "learning_rate": 3.959046065399575e-06, "loss": 0.506, "mean_token_accuracy": 0.8129211962223053, "step": 4319 }, { "epoch": 2.941096356826694, "grad_norm": 1.7934231758117676, "learning_rate": 3.956842847721357e-06, "loss": 0.3536, "mean_token_accuracy": 0.8787241876125336, "step": 4320 }, { "epoch": 2.941777323799796, "grad_norm": 1.7828806638717651, "learning_rate": 3.954639841786667e-06, "loss": 0.4942, "mean_token_accuracy": 0.8019444048404694, "step": 4321 }, { "epoch": 2.9424582907728976, "grad_norm": 1.7462096214294434, "learning_rate": 3.952437048042679e-06, "loss": 0.6747, "mean_token_accuracy": 0.7584452629089355, "step": 4322 }, { "epoch": 2.943139257745999, "grad_norm": 1.8677386045455933, "learning_rate": 3.9502344669365256e-06, "loss": 0.5, "mean_token_accuracy": 0.8192739486694336, "step": 4323 }, { "epoch": 2.943820224719101, "grad_norm": 1.7657502889633179, "learning_rate": 3.948032098915291e-06, "loss": 0.5655, "mean_token_accuracy": 0.7869642078876495, "step": 4324 }, { "epoch": 2.944501191692203, "grad_norm": 1.8956341743469238, "learning_rate": 3.945829944426023e-06, "loss": 0.3617, "mean_token_accuracy": 0.8805360496044159, "step": 4325 }, { "epoch": 2.945182158665305, "grad_norm": 1.6673200130462646, "learning_rate": 3.94362800391572e-06, "loss": 0.5415, "mean_token_accuracy": 0.8110515177249908, "step": 4326 }, { "epoch": 2.9458631256384065, "grad_norm": 1.7338987588882446, "learning_rate": 3.941426277831338e-06, "loss": 0.476, "mean_token_accuracy": 0.8313222825527191, "step": 4327 }, { "epoch": 2.946544092611508, "grad_norm": 1.794656753540039, "learning_rate": 3.939224766619793e-06, "loss": 0.5762, "mean_token_accuracy": 0.8049722611904144, "step": 4328 }, { "epoch": 2.94722505958461, "grad_norm": 1.9159817695617676, "learning_rate": 3.937023470727955e-06, "loss": 0.4974, "mean_token_accuracy": 0.8270091116428375, "step": 4329 }, { "epoch": 2.947906026557712, "grad_norm": 1.7555806636810303, "learning_rate": 3.9348223906026505e-06, "loss": 0.5548, "mean_token_accuracy": 0.7851535677909851, "step": 4330 }, { "epoch": 2.9485869935308138, "grad_norm": 1.8152483701705933, "learning_rate": 3.932621526690661e-06, "loss": 0.4638, "mean_token_accuracy": 0.7957888245582581, "step": 4331 }, { "epoch": 2.9492679605039154, "grad_norm": 1.8441697359085083, "learning_rate": 3.9304208794387255e-06, "loss": 0.3389, "mean_token_accuracy": 0.8860127329826355, "step": 4332 }, { "epoch": 2.9499489274770174, "grad_norm": 1.6735668182373047, "learning_rate": 3.928220449293539e-06, "loss": 0.4607, "mean_token_accuracy": 0.8408998847007751, "step": 4333 }, { "epoch": 2.950629894450119, "grad_norm": 1.899572730064392, "learning_rate": 3.926020236701751e-06, "loss": 0.4385, "mean_token_accuracy": 0.8577788770198822, "step": 4334 }, { "epoch": 2.951310861423221, "grad_norm": 1.6964030265808105, "learning_rate": 3.92382024210997e-06, "loss": 0.5181, "mean_token_accuracy": 0.8117527663707733, "step": 4335 }, { "epoch": 2.9519918283963227, "grad_norm": 1.661312460899353, "learning_rate": 3.9216204659647585e-06, "loss": 0.5958, "mean_token_accuracy": 0.7973164021968842, "step": 4336 }, { "epoch": 2.9526727953694247, "grad_norm": 1.7235058546066284, "learning_rate": 3.919420908712631e-06, "loss": 0.4389, "mean_token_accuracy": 0.8414698541164398, "step": 4337 }, { "epoch": 2.9533537623425263, "grad_norm": 1.78335702419281, "learning_rate": 3.9172215708000655e-06, "loss": 0.5122, "mean_token_accuracy": 0.8079060912132263, "step": 4338 }, { "epoch": 2.9540347293156284, "grad_norm": 1.661738634109497, "learning_rate": 3.915022452673486e-06, "loss": 0.6239, "mean_token_accuracy": 0.7826867997646332, "step": 4339 }, { "epoch": 2.95471569628873, "grad_norm": 1.7976279258728027, "learning_rate": 3.912823554779284e-06, "loss": 0.677, "mean_token_accuracy": 0.7622909247875214, "step": 4340 }, { "epoch": 2.9553966632618316, "grad_norm": 1.9401600360870361, "learning_rate": 3.9106248775637926e-06, "loss": 0.4708, "mean_token_accuracy": 0.8257976472377777, "step": 4341 }, { "epoch": 2.9560776302349336, "grad_norm": 1.9607009887695312, "learning_rate": 3.9084264214733065e-06, "loss": 0.4067, "mean_token_accuracy": 0.8566299974918365, "step": 4342 }, { "epoch": 2.9567585972080352, "grad_norm": 1.695130705833435, "learning_rate": 3.9062281869540805e-06, "loss": 0.5266, "mean_token_accuracy": 0.814051479101181, "step": 4343 }, { "epoch": 2.9574395641811373, "grad_norm": 1.8518218994140625, "learning_rate": 3.904030174452315e-06, "loss": 0.5257, "mean_token_accuracy": 0.8078315854072571, "step": 4344 }, { "epoch": 2.958120531154239, "grad_norm": 1.6893078088760376, "learning_rate": 3.901832384414175e-06, "loss": 0.5833, "mean_token_accuracy": 0.8143641352653503, "step": 4345 }, { "epoch": 2.958801498127341, "grad_norm": 1.8204418420791626, "learning_rate": 3.8996348172857714e-06, "loss": 0.5422, "mean_token_accuracy": 0.8023016452789307, "step": 4346 }, { "epoch": 2.9594824651004425, "grad_norm": 1.886850357055664, "learning_rate": 3.897437473513174e-06, "loss": 0.5534, "mean_token_accuracy": 0.7968329191207886, "step": 4347 }, { "epoch": 2.9601634320735446, "grad_norm": 1.7956072092056274, "learning_rate": 3.8952403535424074e-06, "loss": 0.4936, "mean_token_accuracy": 0.8204759061336517, "step": 4348 }, { "epoch": 2.960844399046646, "grad_norm": 1.7391043901443481, "learning_rate": 3.8930434578194496e-06, "loss": 0.4354, "mean_token_accuracy": 0.8547083139419556, "step": 4349 }, { "epoch": 2.961525366019748, "grad_norm": 1.682998538017273, "learning_rate": 3.890846786790237e-06, "loss": 0.694, "mean_token_accuracy": 0.7566873729228973, "step": 4350 }, { "epoch": 2.96220633299285, "grad_norm": 1.7541495561599731, "learning_rate": 3.888650340900655e-06, "loss": 0.5299, "mean_token_accuracy": 0.7939137816429138, "step": 4351 }, { "epoch": 2.962887299965952, "grad_norm": 1.7450562715530396, "learning_rate": 3.886454120596544e-06, "loss": 0.4311, "mean_token_accuracy": 0.856687992811203, "step": 4352 }, { "epoch": 2.9635682669390535, "grad_norm": 1.8090530633926392, "learning_rate": 3.884258126323703e-06, "loss": 0.4472, "mean_token_accuracy": 0.8220162391662598, "step": 4353 }, { "epoch": 2.964249233912155, "grad_norm": 1.8807796239852905, "learning_rate": 3.882062358527879e-06, "loss": 0.4925, "mean_token_accuracy": 0.832300454378128, "step": 4354 }, { "epoch": 2.964930200885257, "grad_norm": 1.7798881530761719, "learning_rate": 3.879866817654781e-06, "loss": 0.4767, "mean_token_accuracy": 0.8059563934803009, "step": 4355 }, { "epoch": 2.9656111678583588, "grad_norm": 1.7522653341293335, "learning_rate": 3.877671504150065e-06, "loss": 0.5078, "mean_token_accuracy": 0.7830926477909088, "step": 4356 }, { "epoch": 2.966292134831461, "grad_norm": 1.6069170236587524, "learning_rate": 3.875476418459339e-06, "loss": 0.6375, "mean_token_accuracy": 0.7732480466365814, "step": 4357 }, { "epoch": 2.9669731018045624, "grad_norm": 1.7287918329238892, "learning_rate": 3.873281561028175e-06, "loss": 0.5465, "mean_token_accuracy": 0.7926326394081116, "step": 4358 }, { "epoch": 2.967654068777664, "grad_norm": 1.8891868591308594, "learning_rate": 3.87108693230209e-06, "loss": 0.4648, "mean_token_accuracy": 0.8326667547225952, "step": 4359 }, { "epoch": 2.968335035750766, "grad_norm": 1.902045726776123, "learning_rate": 3.868892532726556e-06, "loss": 0.475, "mean_token_accuracy": 0.8124721348285675, "step": 4360 }, { "epoch": 2.969016002723868, "grad_norm": 1.783096194267273, "learning_rate": 3.8666983627470004e-06, "loss": 0.4556, "mean_token_accuracy": 0.8488315045833588, "step": 4361 }, { "epoch": 2.9696969696969697, "grad_norm": 1.818708896636963, "learning_rate": 3.864504422808806e-06, "loss": 0.5599, "mean_token_accuracy": 0.7668633162975311, "step": 4362 }, { "epoch": 2.9703779366700713, "grad_norm": 2.0064187049865723, "learning_rate": 3.862310713357302e-06, "loss": 0.3656, "mean_token_accuracy": 0.8748612403869629, "step": 4363 }, { "epoch": 2.9710589036431734, "grad_norm": 1.8536696434020996, "learning_rate": 3.860117234837774e-06, "loss": 0.6496, "mean_token_accuracy": 0.7858555018901825, "step": 4364 }, { "epoch": 2.971739870616275, "grad_norm": 1.6229946613311768, "learning_rate": 3.857923987695466e-06, "loss": 0.6412, "mean_token_accuracy": 0.7888422310352325, "step": 4365 }, { "epoch": 2.972420837589377, "grad_norm": 1.7528440952301025, "learning_rate": 3.855730972375569e-06, "loss": 0.5399, "mean_token_accuracy": 0.8014926314353943, "step": 4366 }, { "epoch": 2.9731018045624786, "grad_norm": 1.7128384113311768, "learning_rate": 3.853538189323229e-06, "loss": 0.5175, "mean_token_accuracy": 0.8111095428466797, "step": 4367 }, { "epoch": 2.9737827715355807, "grad_norm": 1.9224621057510376, "learning_rate": 3.851345638983542e-06, "loss": 0.5065, "mean_token_accuracy": 0.8159014284610748, "step": 4368 }, { "epoch": 2.9744637385086823, "grad_norm": 1.8050507307052612, "learning_rate": 3.84915332180156e-06, "loss": 0.4472, "mean_token_accuracy": 0.8310812413692474, "step": 4369 }, { "epoch": 2.9751447054817843, "grad_norm": 1.7327497005462646, "learning_rate": 3.84696123822229e-06, "loss": 0.5582, "mean_token_accuracy": 0.8093438744544983, "step": 4370 }, { "epoch": 2.975825672454886, "grad_norm": 1.7255167961120605, "learning_rate": 3.844769388690684e-06, "loss": 0.6915, "mean_token_accuracy": 0.7860522866249084, "step": 4371 }, { "epoch": 2.9765066394279875, "grad_norm": 1.954118251800537, "learning_rate": 3.842577773651655e-06, "loss": 0.5445, "mean_token_accuracy": 0.7791869342327118, "step": 4372 }, { "epoch": 2.9771876064010896, "grad_norm": 1.927512288093567, "learning_rate": 3.840386393550062e-06, "loss": 0.3941, "mean_token_accuracy": 0.8653305768966675, "step": 4373 }, { "epoch": 2.977868573374191, "grad_norm": 1.7277034521102905, "learning_rate": 3.838195248830719e-06, "loss": 0.6549, "mean_token_accuracy": 0.7624233663082123, "step": 4374 }, { "epoch": 2.9785495403472932, "grad_norm": 1.9097590446472168, "learning_rate": 3.836004339938391e-06, "loss": 0.4892, "mean_token_accuracy": 0.8267151713371277, "step": 4375 }, { "epoch": 2.979230507320395, "grad_norm": 1.6505380868911743, "learning_rate": 3.833813667317798e-06, "loss": 0.6183, "mean_token_accuracy": 0.7825332581996918, "step": 4376 }, { "epoch": 2.979911474293497, "grad_norm": 1.7754390239715576, "learning_rate": 3.8316232314136095e-06, "loss": 0.4246, "mean_token_accuracy": 0.8611710965633392, "step": 4377 }, { "epoch": 2.9805924412665985, "grad_norm": 1.8696634769439697, "learning_rate": 3.8294330326704474e-06, "loss": 0.471, "mean_token_accuracy": 0.845815509557724, "step": 4378 }, { "epoch": 2.9812734082397006, "grad_norm": 1.9244575500488281, "learning_rate": 3.827243071532883e-06, "loss": 0.6063, "mean_token_accuracy": 0.7981785833835602, "step": 4379 }, { "epoch": 2.981954375212802, "grad_norm": 1.7963922023773193, "learning_rate": 3.825053348445447e-06, "loss": 0.5609, "mean_token_accuracy": 0.8094695508480072, "step": 4380 }, { "epoch": 2.9826353421859038, "grad_norm": 1.7898422479629517, "learning_rate": 3.822863863852612e-06, "loss": 0.3487, "mean_token_accuracy": 0.8796209096908569, "step": 4381 }, { "epoch": 2.983316309159006, "grad_norm": 1.874700665473938, "learning_rate": 3.82067461819881e-06, "loss": 0.5214, "mean_token_accuracy": 0.8105354011058807, "step": 4382 }, { "epoch": 2.983997276132108, "grad_norm": 1.7981653213500977, "learning_rate": 3.8184856119284205e-06, "loss": 0.5861, "mean_token_accuracy": 0.8025340735912323, "step": 4383 }, { "epoch": 2.9846782431052095, "grad_norm": 1.7945847511291504, "learning_rate": 3.816296845485773e-06, "loss": 0.4371, "mean_token_accuracy": 0.8299611508846283, "step": 4384 }, { "epoch": 2.985359210078311, "grad_norm": 1.7698657512664795, "learning_rate": 3.814108319315156e-06, "loss": 0.5288, "mean_token_accuracy": 0.804036021232605, "step": 4385 }, { "epoch": 2.986040177051413, "grad_norm": 1.9576576948165894, "learning_rate": 3.811920033860797e-06, "loss": 0.4549, "mean_token_accuracy": 0.8305718004703522, "step": 4386 }, { "epoch": 2.9867211440245147, "grad_norm": 1.7859166860580444, "learning_rate": 3.809731989566887e-06, "loss": 0.5269, "mean_token_accuracy": 0.8031294345855713, "step": 4387 }, { "epoch": 2.9874021109976168, "grad_norm": 1.9769809246063232, "learning_rate": 3.8075441868775612e-06, "loss": 0.5238, "mean_token_accuracy": 0.8122938871383667, "step": 4388 }, { "epoch": 2.9880830779707184, "grad_norm": 1.7485040426254272, "learning_rate": 3.805356626236906e-06, "loss": 0.575, "mean_token_accuracy": 0.788591742515564, "step": 4389 }, { "epoch": 2.98876404494382, "grad_norm": 1.9738788604736328, "learning_rate": 3.80316930808896e-06, "loss": 0.5311, "mean_token_accuracy": 0.8169296681880951, "step": 4390 }, { "epoch": 2.989445011916922, "grad_norm": 1.8271225690841675, "learning_rate": 3.8009822328777112e-06, "loss": 0.443, "mean_token_accuracy": 0.8490128815174103, "step": 4391 }, { "epoch": 2.990125978890024, "grad_norm": 1.52913236618042, "learning_rate": 3.7987954010471045e-06, "loss": 0.7006, "mean_token_accuracy": 0.7469435632228851, "step": 4392 }, { "epoch": 2.9908069458631257, "grad_norm": 1.7786153554916382, "learning_rate": 3.796608813041025e-06, "loss": 0.5623, "mean_token_accuracy": 0.7683692872524261, "step": 4393 }, { "epoch": 2.9914879128362273, "grad_norm": 1.7441359758377075, "learning_rate": 3.7944224693033137e-06, "loss": 0.5491, "mean_token_accuracy": 0.7896264791488647, "step": 4394 }, { "epoch": 2.9921688798093293, "grad_norm": 1.7617781162261963, "learning_rate": 3.7922363702777666e-06, "loss": 0.5562, "mean_token_accuracy": 0.79617840051651, "step": 4395 }, { "epoch": 2.992849846782431, "grad_norm": 1.8617125749588013, "learning_rate": 3.7900505164081204e-06, "loss": 0.4685, "mean_token_accuracy": 0.8423455953598022, "step": 4396 }, { "epoch": 2.993530813755533, "grad_norm": 1.7984544038772583, "learning_rate": 3.78786490813807e-06, "loss": 0.4425, "mean_token_accuracy": 0.8302551507949829, "step": 4397 }, { "epoch": 2.9942117807286346, "grad_norm": 1.8314207792282104, "learning_rate": 3.7856795459112574e-06, "loss": 0.5209, "mean_token_accuracy": 0.8186399340629578, "step": 4398 }, { "epoch": 2.994892747701736, "grad_norm": 1.8116627931594849, "learning_rate": 3.7834944301712715e-06, "loss": 0.4495, "mean_token_accuracy": 0.836386501789093, "step": 4399 }, { "epoch": 2.9955737146748382, "grad_norm": 1.8718091249465942, "learning_rate": 3.78130956136166e-06, "loss": 0.583, "mean_token_accuracy": 0.7935819923877716, "step": 4400 }, { "epoch": 2.9962546816479403, "grad_norm": 1.6879358291625977, "learning_rate": 3.7791249399259085e-06, "loss": 0.6073, "mean_token_accuracy": 0.7817090153694153, "step": 4401 }, { "epoch": 2.996935648621042, "grad_norm": 1.966092824935913, "learning_rate": 3.7769405663074643e-06, "loss": 0.4476, "mean_token_accuracy": 0.8343396484851837, "step": 4402 }, { "epoch": 2.9976166155941435, "grad_norm": 1.8380736112594604, "learning_rate": 3.774756440949716e-06, "loss": 0.4404, "mean_token_accuracy": 0.8229202032089233, "step": 4403 }, { "epoch": 2.9982975825672455, "grad_norm": 1.7239617109298706, "learning_rate": 3.7725725642960047e-06, "loss": 0.5757, "mean_token_accuracy": 0.7998368442058563, "step": 4404 }, { "epoch": 2.998978549540347, "grad_norm": 1.7984180450439453, "learning_rate": 3.7703889367896224e-06, "loss": 0.5685, "mean_token_accuracy": 0.7976301312446594, "step": 4405 }, { "epoch": 2.999659516513449, "grad_norm": 1.848793864250183, "learning_rate": 3.7682055588738066e-06, "loss": 0.4688, "mean_token_accuracy": 0.8298115730285645, "step": 4406 }, { "epoch": 3.0, "grad_norm": 2.5986766815185547, "learning_rate": 3.766022430991751e-06, "loss": 0.3482, "mean_token_accuracy": 0.8284061551094055, "step": 4407 } ], "logging_steps": 1.0, "max_steps": 7340, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.460844222785782e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }