{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 4810, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010395010395010396, "grad_norm": 16.988855361938477, "learning_rate": 0.0, "loss": 2.202, "step": 1 }, { "epoch": 0.002079002079002079, "grad_norm": 32.9708137512207, "learning_rate": 2.0790020790020793e-08, "loss": 2.4915, "step": 2 }, { "epoch": 0.0031185031185031187, "grad_norm": 19.217676162719727, "learning_rate": 4.158004158004159e-08, "loss": 2.2768, "step": 3 }, { "epoch": 0.004158004158004158, "grad_norm": 28.554401397705078, "learning_rate": 6.237006237006238e-08, "loss": 2.7587, "step": 4 }, { "epoch": 0.005197505197505198, "grad_norm": 16.13688087463379, "learning_rate": 8.316008316008317e-08, "loss": 2.0513, "step": 5 }, { "epoch": 0.006237006237006237, "grad_norm": 12.732123374938965, "learning_rate": 1.0395010395010397e-07, "loss": 1.8028, "step": 6 }, { "epoch": 0.007276507276507277, "grad_norm": 9.309935569763184, "learning_rate": 1.2474012474012475e-07, "loss": 1.9257, "step": 7 }, { "epoch": 0.008316008316008316, "grad_norm": 64.55175018310547, "learning_rate": 1.4553014553014554e-07, "loss": 2.121, "step": 8 }, { "epoch": 0.009355509355509356, "grad_norm": 21.101022720336914, "learning_rate": 1.6632016632016635e-07, "loss": 2.04, "step": 9 }, { "epoch": 0.010395010395010396, "grad_norm": 16.773162841796875, "learning_rate": 1.8711018711018713e-07, "loss": 2.189, "step": 10 }, { "epoch": 0.011434511434511435, "grad_norm": 18.621702194213867, "learning_rate": 2.0790020790020794e-07, "loss": 1.6861, "step": 11 }, { "epoch": 0.012474012474012475, "grad_norm": 19.770313262939453, "learning_rate": 2.2869022869022872e-07, "loss": 2.0745, "step": 12 }, { "epoch": 0.013513513513513514, "grad_norm": 9.97078800201416, "learning_rate": 2.494802494802495e-07, "loss": 1.7891, "step": 13 }, { "epoch": 
0.014553014553014554, "grad_norm": 20.687963485717773, "learning_rate": 2.702702702702703e-07, "loss": 1.6521, "step": 14 }, { "epoch": 0.015592515592515593, "grad_norm": 11.27476692199707, "learning_rate": 2.910602910602911e-07, "loss": 1.6738, "step": 15 }, { "epoch": 0.016632016632016633, "grad_norm": 26.307353973388672, "learning_rate": 3.118503118503119e-07, "loss": 2.2328, "step": 16 }, { "epoch": 0.017671517671517672, "grad_norm": 15.594326972961426, "learning_rate": 3.326403326403327e-07, "loss": 2.0531, "step": 17 }, { "epoch": 0.018711018711018712, "grad_norm": 18.295095443725586, "learning_rate": 3.534303534303535e-07, "loss": 2.1244, "step": 18 }, { "epoch": 0.01975051975051975, "grad_norm": 30.941444396972656, "learning_rate": 3.7422037422037426e-07, "loss": 2.0255, "step": 19 }, { "epoch": 0.02079002079002079, "grad_norm": 28.534343719482422, "learning_rate": 3.9501039501039504e-07, "loss": 2.1367, "step": 20 }, { "epoch": 0.02182952182952183, "grad_norm": 105.21300506591797, "learning_rate": 4.158004158004159e-07, "loss": 2.2866, "step": 21 }, { "epoch": 0.02286902286902287, "grad_norm": 16.125764846801758, "learning_rate": 4.3659043659043666e-07, "loss": 2.4157, "step": 22 }, { "epoch": 0.02390852390852391, "grad_norm": 16.123016357421875, "learning_rate": 4.5738045738045745e-07, "loss": 2.5256, "step": 23 }, { "epoch": 0.02494802494802495, "grad_norm": 15.839056968688965, "learning_rate": 4.781704781704783e-07, "loss": 1.7972, "step": 24 }, { "epoch": 0.02598752598752599, "grad_norm": 21.773963928222656, "learning_rate": 4.98960498960499e-07, "loss": 2.2897, "step": 25 }, { "epoch": 0.02702702702702703, "grad_norm": 16.242046356201172, "learning_rate": 5.197505197505199e-07, "loss": 2.0965, "step": 26 }, { "epoch": 0.028066528066528068, "grad_norm": 12.057516098022461, "learning_rate": 5.405405405405406e-07, "loss": 1.9348, "step": 27 }, { "epoch": 0.029106029106029108, "grad_norm": 48.11043930053711, "learning_rate": 5.613305613305614e-07, "loss": 
1.8615, "step": 28 }, { "epoch": 0.030145530145530147, "grad_norm": 28.504892349243164, "learning_rate": 5.821205821205821e-07, "loss": 2.5755, "step": 29 }, { "epoch": 0.031185031185031187, "grad_norm": 12.763325691223145, "learning_rate": 6.02910602910603e-07, "loss": 1.9018, "step": 30 }, { "epoch": 0.032224532224532226, "grad_norm": 13.687501907348633, "learning_rate": 6.237006237006238e-07, "loss": 2.444, "step": 31 }, { "epoch": 0.033264033264033266, "grad_norm": 8.045001029968262, "learning_rate": 6.444906444906446e-07, "loss": 1.7589, "step": 32 }, { "epoch": 0.034303534303534305, "grad_norm": 19.066606521606445, "learning_rate": 6.652806652806654e-07, "loss": 2.3628, "step": 33 }, { "epoch": 0.035343035343035345, "grad_norm": 13.532938003540039, "learning_rate": 6.860706860706861e-07, "loss": 2.3399, "step": 34 }, { "epoch": 0.036382536382536385, "grad_norm": 14.592718124389648, "learning_rate": 7.06860706860707e-07, "loss": 1.8444, "step": 35 }, { "epoch": 0.037422037422037424, "grad_norm": 19.344562530517578, "learning_rate": 7.276507276507278e-07, "loss": 2.4186, "step": 36 }, { "epoch": 0.038461538461538464, "grad_norm": 23.072294235229492, "learning_rate": 7.484407484407485e-07, "loss": 2.3203, "step": 37 }, { "epoch": 0.0395010395010395, "grad_norm": 20.00788116455078, "learning_rate": 7.692307692307694e-07, "loss": 1.9721, "step": 38 }, { "epoch": 0.04054054054054054, "grad_norm": 13.704610824584961, "learning_rate": 7.900207900207901e-07, "loss": 1.9419, "step": 39 }, { "epoch": 0.04158004158004158, "grad_norm": 35.170108795166016, "learning_rate": 8.108108108108109e-07, "loss": 2.1189, "step": 40 }, { "epoch": 0.04261954261954262, "grad_norm": 13.251538276672363, "learning_rate": 8.316008316008318e-07, "loss": 2.2046, "step": 41 }, { "epoch": 0.04365904365904366, "grad_norm": 37.792728424072266, "learning_rate": 8.523908523908525e-07, "loss": 2.3078, "step": 42 }, { "epoch": 0.0446985446985447, "grad_norm": 19.703136444091797, "learning_rate": 
8.731808731808733e-07, "loss": 2.1895, "step": 43 }, { "epoch": 0.04573804573804574, "grad_norm": 25.02670669555664, "learning_rate": 8.939708939708941e-07, "loss": 2.209, "step": 44 }, { "epoch": 0.04677754677754678, "grad_norm": 11.132152557373047, "learning_rate": 9.147609147609149e-07, "loss": 1.8703, "step": 45 }, { "epoch": 0.04781704781704782, "grad_norm": 10.692375183105469, "learning_rate": 9.355509355509356e-07, "loss": 1.967, "step": 46 }, { "epoch": 0.04885654885654886, "grad_norm": 18.32746696472168, "learning_rate": 9.563409563409566e-07, "loss": 2.353, "step": 47 }, { "epoch": 0.0498960498960499, "grad_norm": 23.04522705078125, "learning_rate": 9.771309771309773e-07, "loss": 2.1668, "step": 48 }, { "epoch": 0.05093555093555094, "grad_norm": 13.668973922729492, "learning_rate": 9.97920997920998e-07, "loss": 1.7651, "step": 49 }, { "epoch": 0.05197505197505198, "grad_norm": 11.134869575500488, "learning_rate": 1.0187110187110188e-06, "loss": 1.8238, "step": 50 }, { "epoch": 0.05301455301455302, "grad_norm": 24.748937606811523, "learning_rate": 1.0395010395010397e-06, "loss": 2.0508, "step": 51 }, { "epoch": 0.05405405405405406, "grad_norm": 21.49311065673828, "learning_rate": 1.0602910602910604e-06, "loss": 1.9374, "step": 52 }, { "epoch": 0.0550935550935551, "grad_norm": 31.98406410217285, "learning_rate": 1.0810810810810812e-06, "loss": 2.3183, "step": 53 }, { "epoch": 0.056133056133056136, "grad_norm": 14.93075180053711, "learning_rate": 1.1018711018711021e-06, "loss": 1.918, "step": 54 }, { "epoch": 0.057172557172557176, "grad_norm": 22.455198287963867, "learning_rate": 1.1226611226611228e-06, "loss": 1.9401, "step": 55 }, { "epoch": 0.058212058212058215, "grad_norm": 11.494815826416016, "learning_rate": 1.1434511434511436e-06, "loss": 1.9454, "step": 56 }, { "epoch": 0.059251559251559255, "grad_norm": 15.472561836242676, "learning_rate": 1.1642411642411643e-06, "loss": 2.1118, "step": 57 }, { "epoch": 0.060291060291060294, "grad_norm": 
15.340880393981934, "learning_rate": 1.1850311850311852e-06, "loss": 2.1039, "step": 58 }, { "epoch": 0.061330561330561334, "grad_norm": 53.742252349853516, "learning_rate": 1.205821205821206e-06, "loss": 2.7519, "step": 59 }, { "epoch": 0.062370062370062374, "grad_norm": 21.00668716430664, "learning_rate": 1.2266112266112267e-06, "loss": 2.4038, "step": 60 }, { "epoch": 0.06340956340956341, "grad_norm": 32.80131149291992, "learning_rate": 1.2474012474012476e-06, "loss": 2.5428, "step": 61 }, { "epoch": 0.06444906444906445, "grad_norm": 15.214197158813477, "learning_rate": 1.2681912681912684e-06, "loss": 2.0914, "step": 62 }, { "epoch": 0.06548856548856549, "grad_norm": 26.892454147338867, "learning_rate": 1.288981288981289e-06, "loss": 1.6108, "step": 63 }, { "epoch": 0.06652806652806653, "grad_norm": 13.933700561523438, "learning_rate": 1.30977130977131e-06, "loss": 1.6882, "step": 64 }, { "epoch": 0.06756756756756757, "grad_norm": 10.220669746398926, "learning_rate": 1.3305613305613308e-06, "loss": 1.798, "step": 65 }, { "epoch": 0.06860706860706861, "grad_norm": 21.804941177368164, "learning_rate": 1.3513513513513515e-06, "loss": 1.8424, "step": 66 }, { "epoch": 0.06964656964656965, "grad_norm": 23.250431060791016, "learning_rate": 1.3721413721413722e-06, "loss": 2.4271, "step": 67 }, { "epoch": 0.07068607068607069, "grad_norm": 11.683350563049316, "learning_rate": 1.3929313929313932e-06, "loss": 1.937, "step": 68 }, { "epoch": 0.07172557172557173, "grad_norm": 37.01130294799805, "learning_rate": 1.413721413721414e-06, "loss": 2.0699, "step": 69 }, { "epoch": 0.07276507276507277, "grad_norm": 12.468998908996582, "learning_rate": 1.4345114345114346e-06, "loss": 2.1341, "step": 70 }, { "epoch": 0.07380457380457381, "grad_norm": 15.133217811584473, "learning_rate": 1.4553014553014556e-06, "loss": 2.2831, "step": 71 }, { "epoch": 0.07484407484407485, "grad_norm": 16.24321174621582, "learning_rate": 1.4760914760914763e-06, "loss": 2.013, "step": 72 }, { "epoch": 
0.07588357588357589, "grad_norm": 16.070463180541992, "learning_rate": 1.496881496881497e-06, "loss": 2.3446, "step": 73 }, { "epoch": 0.07692307692307693, "grad_norm": 16.41120719909668, "learning_rate": 1.5176715176715178e-06, "loss": 2.0695, "step": 74 }, { "epoch": 0.07796257796257797, "grad_norm": 14.749588012695312, "learning_rate": 1.5384615384615387e-06, "loss": 1.9838, "step": 75 }, { "epoch": 0.079002079002079, "grad_norm": 12.910445213317871, "learning_rate": 1.5592515592515594e-06, "loss": 1.9391, "step": 76 }, { "epoch": 0.08004158004158005, "grad_norm": 19.614336013793945, "learning_rate": 1.5800415800415802e-06, "loss": 1.9071, "step": 77 }, { "epoch": 0.08108108108108109, "grad_norm": 31.232622146606445, "learning_rate": 1.6008316008316011e-06, "loss": 1.905, "step": 78 }, { "epoch": 0.08212058212058213, "grad_norm": 12.483946800231934, "learning_rate": 1.6216216216216219e-06, "loss": 2.0306, "step": 79 }, { "epoch": 0.08316008316008316, "grad_norm": 12.690123558044434, "learning_rate": 1.6424116424116426e-06, "loss": 1.9655, "step": 80 }, { "epoch": 0.0841995841995842, "grad_norm": 13.518976211547852, "learning_rate": 1.6632016632016635e-06, "loss": 2.1174, "step": 81 }, { "epoch": 0.08523908523908524, "grad_norm": 19.20243263244629, "learning_rate": 1.6839916839916843e-06, "loss": 2.306, "step": 82 }, { "epoch": 0.08627858627858628, "grad_norm": 10.600049018859863, "learning_rate": 1.704781704781705e-06, "loss": 2.0379, "step": 83 }, { "epoch": 0.08731808731808732, "grad_norm": 6.616295337677002, "learning_rate": 1.7255717255717257e-06, "loss": 1.8567, "step": 84 }, { "epoch": 0.08835758835758836, "grad_norm": 9.49284553527832, "learning_rate": 1.7463617463617467e-06, "loss": 1.844, "step": 85 }, { "epoch": 0.0893970893970894, "grad_norm": 13.307085037231445, "learning_rate": 1.7671517671517674e-06, "loss": 1.9826, "step": 86 }, { "epoch": 0.09043659043659044, "grad_norm": 13.201308250427246, "learning_rate": 1.7879417879417881e-06, "loss": 
2.1031, "step": 87 }, { "epoch": 0.09147609147609148, "grad_norm": 13.714674949645996, "learning_rate": 1.808731808731809e-06, "loss": 1.5715, "step": 88 }, { "epoch": 0.09251559251559252, "grad_norm": 14.892278671264648, "learning_rate": 1.8295218295218298e-06, "loss": 1.7279, "step": 89 }, { "epoch": 0.09355509355509356, "grad_norm": 7.037039279937744, "learning_rate": 1.8503118503118505e-06, "loss": 1.9976, "step": 90 }, { "epoch": 0.0945945945945946, "grad_norm": 14.051565170288086, "learning_rate": 1.8711018711018713e-06, "loss": 2.0268, "step": 91 }, { "epoch": 0.09563409563409564, "grad_norm": 7.723928928375244, "learning_rate": 1.8918918918918922e-06, "loss": 2.0009, "step": 92 }, { "epoch": 0.09667359667359668, "grad_norm": 14.955673217773438, "learning_rate": 1.912681912681913e-06, "loss": 1.9555, "step": 93 }, { "epoch": 0.09771309771309772, "grad_norm": 10.495637893676758, "learning_rate": 1.9334719334719337e-06, "loss": 2.0791, "step": 94 }, { "epoch": 0.09875259875259876, "grad_norm": 9.99687385559082, "learning_rate": 1.9542619542619546e-06, "loss": 1.8874, "step": 95 }, { "epoch": 0.0997920997920998, "grad_norm": 13.424286842346191, "learning_rate": 1.975051975051975e-06, "loss": 1.916, "step": 96 }, { "epoch": 0.10083160083160084, "grad_norm": 28.673757553100586, "learning_rate": 1.995841995841996e-06, "loss": 2.0923, "step": 97 }, { "epoch": 0.10187110187110188, "grad_norm": 12.696794509887695, "learning_rate": 2.016632016632017e-06, "loss": 2.1206, "step": 98 }, { "epoch": 0.10291060291060292, "grad_norm": 14.118012428283691, "learning_rate": 2.0374220374220375e-06, "loss": 1.9878, "step": 99 }, { "epoch": 0.10395010395010396, "grad_norm": 9.848310470581055, "learning_rate": 2.0582120582120585e-06, "loss": 1.8487, "step": 100 }, { "epoch": 0.104989604989605, "grad_norm": 10.640583038330078, "learning_rate": 2.0790020790020794e-06, "loss": 2.2879, "step": 101 }, { "epoch": 0.10602910602910603, "grad_norm": 13.701424598693848, "learning_rate": 
2.0997920997921e-06, "loss": 1.8333, "step": 102 }, { "epoch": 0.10706860706860707, "grad_norm": 14.907086372375488, "learning_rate": 2.120582120582121e-06, "loss": 1.9786, "step": 103 }, { "epoch": 0.10810810810810811, "grad_norm": 11.484697341918945, "learning_rate": 2.141372141372142e-06, "loss": 1.9187, "step": 104 }, { "epoch": 0.10914760914760915, "grad_norm": 14.736360549926758, "learning_rate": 2.1621621621621623e-06, "loss": 1.8816, "step": 105 }, { "epoch": 0.1101871101871102, "grad_norm": 9.888904571533203, "learning_rate": 2.1829521829521833e-06, "loss": 1.9339, "step": 106 }, { "epoch": 0.11122661122661123, "grad_norm": 18.789710998535156, "learning_rate": 2.2037422037422042e-06, "loss": 1.8726, "step": 107 }, { "epoch": 0.11226611226611227, "grad_norm": 18.137393951416016, "learning_rate": 2.2245322245322247e-06, "loss": 2.1816, "step": 108 }, { "epoch": 0.11330561330561331, "grad_norm": 11.418164253234863, "learning_rate": 2.2453222453222457e-06, "loss": 1.8386, "step": 109 }, { "epoch": 0.11434511434511435, "grad_norm": 12.0963716506958, "learning_rate": 2.2661122661122666e-06, "loss": 1.4687, "step": 110 }, { "epoch": 0.11538461538461539, "grad_norm": 8.479762077331543, "learning_rate": 2.286902286902287e-06, "loss": 1.8498, "step": 111 }, { "epoch": 0.11642411642411643, "grad_norm": 15.996792793273926, "learning_rate": 2.307692307692308e-06, "loss": 2.0195, "step": 112 }, { "epoch": 0.11746361746361747, "grad_norm": 17.23140525817871, "learning_rate": 2.3284823284823286e-06, "loss": 1.8239, "step": 113 }, { "epoch": 0.11850311850311851, "grad_norm": 12.582114219665527, "learning_rate": 2.3492723492723495e-06, "loss": 1.7674, "step": 114 }, { "epoch": 0.11954261954261955, "grad_norm": 15.431148529052734, "learning_rate": 2.3700623700623705e-06, "loss": 1.9072, "step": 115 }, { "epoch": 0.12058212058212059, "grad_norm": 17.711864471435547, "learning_rate": 2.390852390852391e-06, "loss": 2.1458, "step": 116 }, { "epoch": 0.12162162162162163, 
"grad_norm": 12.348552703857422, "learning_rate": 2.411642411642412e-06, "loss": 1.706, "step": 117 }, { "epoch": 0.12266112266112267, "grad_norm": 20.33272361755371, "learning_rate": 2.432432432432433e-06, "loss": 1.6389, "step": 118 }, { "epoch": 0.12370062370062371, "grad_norm": 12.650636672973633, "learning_rate": 2.4532224532224534e-06, "loss": 2.1807, "step": 119 }, { "epoch": 0.12474012474012475, "grad_norm": 16.649805068969727, "learning_rate": 2.4740124740124743e-06, "loss": 2.1561, "step": 120 }, { "epoch": 0.1257796257796258, "grad_norm": 13.328505516052246, "learning_rate": 2.4948024948024953e-06, "loss": 1.7857, "step": 121 }, { "epoch": 0.12681912681912683, "grad_norm": 16.647127151489258, "learning_rate": 2.515592515592516e-06, "loss": 2.2226, "step": 122 }, { "epoch": 0.12785862785862787, "grad_norm": 13.633663177490234, "learning_rate": 2.5363825363825367e-06, "loss": 1.9883, "step": 123 }, { "epoch": 0.1288981288981289, "grad_norm": 11.40990161895752, "learning_rate": 2.5571725571725577e-06, "loss": 1.9498, "step": 124 }, { "epoch": 0.12993762993762994, "grad_norm": 11.874930381774902, "learning_rate": 2.577962577962578e-06, "loss": 1.7595, "step": 125 }, { "epoch": 0.13097713097713098, "grad_norm": 11.909989356994629, "learning_rate": 2.598752598752599e-06, "loss": 1.7103, "step": 126 }, { "epoch": 0.13201663201663202, "grad_norm": 9.262993812561035, "learning_rate": 2.61954261954262e-06, "loss": 1.8453, "step": 127 }, { "epoch": 0.13305613305613306, "grad_norm": 12.807122230529785, "learning_rate": 2.6403326403326406e-06, "loss": 1.8424, "step": 128 }, { "epoch": 0.1340956340956341, "grad_norm": 7.935920715332031, "learning_rate": 2.6611226611226616e-06, "loss": 1.6972, "step": 129 }, { "epoch": 0.13513513513513514, "grad_norm": 11.945698738098145, "learning_rate": 2.681912681912682e-06, "loss": 1.8897, "step": 130 }, { "epoch": 0.13617463617463618, "grad_norm": 12.887810707092285, "learning_rate": 2.702702702702703e-06, "loss": 1.9529, "step": 
131 }, { "epoch": 0.13721413721413722, "grad_norm": 10.941514015197754, "learning_rate": 2.723492723492724e-06, "loss": 1.8406, "step": 132 }, { "epoch": 0.13825363825363826, "grad_norm": 10.435567855834961, "learning_rate": 2.7442827442827445e-06, "loss": 1.5383, "step": 133 }, { "epoch": 0.1392931392931393, "grad_norm": 9.996288299560547, "learning_rate": 2.7650727650727654e-06, "loss": 1.8487, "step": 134 }, { "epoch": 0.14033264033264034, "grad_norm": 10.20373249053955, "learning_rate": 2.7858627858627864e-06, "loss": 1.6566, "step": 135 }, { "epoch": 0.14137214137214138, "grad_norm": 13.138648986816406, "learning_rate": 2.806652806652807e-06, "loss": 1.6186, "step": 136 }, { "epoch": 0.14241164241164242, "grad_norm": 11.178861618041992, "learning_rate": 2.827442827442828e-06, "loss": 1.8976, "step": 137 }, { "epoch": 0.14345114345114346, "grad_norm": 7.92245626449585, "learning_rate": 2.8482328482328488e-06, "loss": 1.5972, "step": 138 }, { "epoch": 0.1444906444906445, "grad_norm": 7.310725212097168, "learning_rate": 2.8690228690228693e-06, "loss": 1.6067, "step": 139 }, { "epoch": 0.14553014553014554, "grad_norm": 11.522294998168945, "learning_rate": 2.8898128898128902e-06, "loss": 1.8074, "step": 140 }, { "epoch": 0.14656964656964658, "grad_norm": 11.914559364318848, "learning_rate": 2.910602910602911e-06, "loss": 1.9732, "step": 141 }, { "epoch": 0.14760914760914762, "grad_norm": 12.07341194152832, "learning_rate": 2.9313929313929317e-06, "loss": 1.7112, "step": 142 }, { "epoch": 0.14864864864864866, "grad_norm": 10.485044479370117, "learning_rate": 2.9521829521829526e-06, "loss": 1.844, "step": 143 }, { "epoch": 0.1496881496881497, "grad_norm": 7.178315162658691, "learning_rate": 2.9729729729729736e-06, "loss": 1.7565, "step": 144 }, { "epoch": 0.15072765072765074, "grad_norm": 9.397273063659668, "learning_rate": 2.993762993762994e-06, "loss": 1.8364, "step": 145 }, { "epoch": 0.15176715176715178, "grad_norm": 10.049165725708008, "learning_rate": 
3.014553014553015e-06, "loss": 1.8335, "step": 146 }, { "epoch": 0.15280665280665282, "grad_norm": 7.683914661407471, "learning_rate": 3.0353430353430356e-06, "loss": 1.73, "step": 147 }, { "epoch": 0.15384615384615385, "grad_norm": 8.30390739440918, "learning_rate": 3.0561330561330565e-06, "loss": 1.843, "step": 148 }, { "epoch": 0.1548856548856549, "grad_norm": 8.528340339660645, "learning_rate": 3.0769230769230774e-06, "loss": 1.9296, "step": 149 }, { "epoch": 0.15592515592515593, "grad_norm": 11.0097017288208, "learning_rate": 3.097713097713098e-06, "loss": 2.0037, "step": 150 }, { "epoch": 0.15696465696465697, "grad_norm": 7.787356376647949, "learning_rate": 3.118503118503119e-06, "loss": 1.6986, "step": 151 }, { "epoch": 0.158004158004158, "grad_norm": 15.426385879516602, "learning_rate": 3.13929313929314e-06, "loss": 1.8589, "step": 152 }, { "epoch": 0.15904365904365905, "grad_norm": 9.391960144042969, "learning_rate": 3.1600831600831604e-06, "loss": 1.6876, "step": 153 }, { "epoch": 0.1600831600831601, "grad_norm": 12.089409828186035, "learning_rate": 3.1808731808731813e-06, "loss": 1.4551, "step": 154 }, { "epoch": 0.16112266112266113, "grad_norm": 7.866195201873779, "learning_rate": 3.2016632016632022e-06, "loss": 1.8804, "step": 155 }, { "epoch": 0.16216216216216217, "grad_norm": 9.8470458984375, "learning_rate": 3.2224532224532228e-06, "loss": 1.6656, "step": 156 }, { "epoch": 0.1632016632016632, "grad_norm": 7.687832355499268, "learning_rate": 3.2432432432432437e-06, "loss": 1.4208, "step": 157 }, { "epoch": 0.16424116424116425, "grad_norm": 12.187580108642578, "learning_rate": 3.2640332640332646e-06, "loss": 1.8139, "step": 158 }, { "epoch": 0.1652806652806653, "grad_norm": 7.861310958862305, "learning_rate": 3.284823284823285e-06, "loss": 1.6931, "step": 159 }, { "epoch": 0.16632016632016633, "grad_norm": 9.621845245361328, "learning_rate": 3.305613305613306e-06, "loss": 1.6181, "step": 160 }, { "epoch": 0.16735966735966737, "grad_norm": 
9.091207504272461, "learning_rate": 3.326403326403327e-06, "loss": 1.6692, "step": 161 }, { "epoch": 0.1683991683991684, "grad_norm": 25.73870849609375, "learning_rate": 3.3471933471933476e-06, "loss": 1.9534, "step": 162 }, { "epoch": 0.16943866943866945, "grad_norm": 17.834957122802734, "learning_rate": 3.3679833679833685e-06, "loss": 1.8596, "step": 163 }, { "epoch": 0.1704781704781705, "grad_norm": 9.130378723144531, "learning_rate": 3.388773388773389e-06, "loss": 1.6516, "step": 164 }, { "epoch": 0.17151767151767153, "grad_norm": 11.076170921325684, "learning_rate": 3.40956340956341e-06, "loss": 2.0877, "step": 165 }, { "epoch": 0.17255717255717257, "grad_norm": 8.953692436218262, "learning_rate": 3.430353430353431e-06, "loss": 2.0112, "step": 166 }, { "epoch": 0.1735966735966736, "grad_norm": 9.42551326751709, "learning_rate": 3.4511434511434514e-06, "loss": 1.8069, "step": 167 }, { "epoch": 0.17463617463617465, "grad_norm": 7.4458794593811035, "learning_rate": 3.4719334719334724e-06, "loss": 1.5709, "step": 168 }, { "epoch": 0.17567567567567569, "grad_norm": 8.580041885375977, "learning_rate": 3.4927234927234933e-06, "loss": 1.912, "step": 169 }, { "epoch": 0.17671517671517672, "grad_norm": 9.437491416931152, "learning_rate": 3.513513513513514e-06, "loss": 1.6681, "step": 170 }, { "epoch": 0.17775467775467776, "grad_norm": 11.995328903198242, "learning_rate": 3.5343035343035348e-06, "loss": 1.9467, "step": 171 }, { "epoch": 0.1787941787941788, "grad_norm": 7.549195766448975, "learning_rate": 3.5550935550935557e-06, "loss": 1.7225, "step": 172 }, { "epoch": 0.17983367983367984, "grad_norm": 9.546483039855957, "learning_rate": 3.5758835758835762e-06, "loss": 1.8062, "step": 173 }, { "epoch": 0.18087318087318088, "grad_norm": 11.021419525146484, "learning_rate": 3.596673596673597e-06, "loss": 1.858, "step": 174 }, { "epoch": 0.18191268191268192, "grad_norm": 17.999780654907227, "learning_rate": 3.617463617463618e-06, "loss": 1.723, "step": 175 }, { "epoch": 
0.18295218295218296, "grad_norm": 12.470070838928223, "learning_rate": 3.6382536382536386e-06, "loss": 1.6866, "step": 176 }, { "epoch": 0.183991683991684, "grad_norm": 12.866351127624512, "learning_rate": 3.6590436590436596e-06, "loss": 1.6734, "step": 177 }, { "epoch": 0.18503118503118504, "grad_norm": 7.818451404571533, "learning_rate": 3.6798336798336805e-06, "loss": 1.6018, "step": 178 }, { "epoch": 0.18607068607068608, "grad_norm": 10.40690803527832, "learning_rate": 3.700623700623701e-06, "loss": 1.6845, "step": 179 }, { "epoch": 0.18711018711018712, "grad_norm": 9.794620513916016, "learning_rate": 3.721413721413722e-06, "loss": 1.8355, "step": 180 }, { "epoch": 0.18814968814968816, "grad_norm": 8.024982452392578, "learning_rate": 3.7422037422037425e-06, "loss": 1.7645, "step": 181 }, { "epoch": 0.1891891891891892, "grad_norm": 7.918184280395508, "learning_rate": 3.7629937629937634e-06, "loss": 1.5322, "step": 182 }, { "epoch": 0.19022869022869024, "grad_norm": 8.833913803100586, "learning_rate": 3.7837837837837844e-06, "loss": 1.6939, "step": 183 }, { "epoch": 0.19126819126819128, "grad_norm": 9.3923978805542, "learning_rate": 3.804573804573805e-06, "loss": 1.453, "step": 184 }, { "epoch": 0.19230769230769232, "grad_norm": 10.067290306091309, "learning_rate": 3.825363825363826e-06, "loss": 1.759, "step": 185 }, { "epoch": 0.19334719334719336, "grad_norm": 6.426559925079346, "learning_rate": 3.846153846153847e-06, "loss": 1.627, "step": 186 }, { "epoch": 0.1943866943866944, "grad_norm": 8.969478607177734, "learning_rate": 3.866943866943867e-06, "loss": 1.6313, "step": 187 }, { "epoch": 0.19542619542619544, "grad_norm": 10.542095184326172, "learning_rate": 3.887733887733889e-06, "loss": 1.6337, "step": 188 }, { "epoch": 0.19646569646569648, "grad_norm": 8.737053871154785, "learning_rate": 3.908523908523909e-06, "loss": 1.5676, "step": 189 }, { "epoch": 0.19750519750519752, "grad_norm": 10.72559928894043, "learning_rate": 3.92931392931393e-06, "loss": 1.6693, 
"step": 190 }, { "epoch": 0.19854469854469856, "grad_norm": 8.796841621398926, "learning_rate": 3.95010395010395e-06, "loss": 1.6421, "step": 191 }, { "epoch": 0.1995841995841996, "grad_norm": 25.390539169311523, "learning_rate": 3.970893970893972e-06, "loss": 1.5609, "step": 192 }, { "epoch": 0.20062370062370063, "grad_norm": 9.860316276550293, "learning_rate": 3.991683991683992e-06, "loss": 1.8664, "step": 193 }, { "epoch": 0.20166320166320167, "grad_norm": 8.280574798583984, "learning_rate": 4.012474012474013e-06, "loss": 1.5831, "step": 194 }, { "epoch": 0.20270270270270271, "grad_norm": 7.029718399047852, "learning_rate": 4.033264033264034e-06, "loss": 1.6864, "step": 195 }, { "epoch": 0.20374220374220375, "grad_norm": 8.269538879394531, "learning_rate": 4.0540540540540545e-06, "loss": 1.5362, "step": 196 }, { "epoch": 0.2047817047817048, "grad_norm": 7.9130940437316895, "learning_rate": 4.074844074844075e-06, "loss": 1.7531, "step": 197 }, { "epoch": 0.20582120582120583, "grad_norm": 9.139872550964355, "learning_rate": 4.095634095634096e-06, "loss": 1.5782, "step": 198 }, { "epoch": 0.20686070686070687, "grad_norm": 8.888442039489746, "learning_rate": 4.116424116424117e-06, "loss": 1.531, "step": 199 }, { "epoch": 0.2079002079002079, "grad_norm": 9.606545448303223, "learning_rate": 4.1372141372141374e-06, "loss": 1.9533, "step": 200 }, { "epoch": 0.20893970893970895, "grad_norm": 7.717611789703369, "learning_rate": 4.158004158004159e-06, "loss": 1.6154, "step": 201 }, { "epoch": 0.20997920997921, "grad_norm": 10.033646583557129, "learning_rate": 4.178794178794179e-06, "loss": 1.5977, "step": 202 }, { "epoch": 0.21101871101871103, "grad_norm": 8.350449562072754, "learning_rate": 4.1995841995842e-06, "loss": 1.6015, "step": 203 }, { "epoch": 0.21205821205821207, "grad_norm": 11.233882904052734, "learning_rate": 4.220374220374221e-06, "loss": 1.9587, "step": 204 }, { "epoch": 0.2130977130977131, "grad_norm": 8.711499214172363, "learning_rate": 
4.241164241164242e-06, "loss": 1.7817, "step": 205 }, { "epoch": 0.21413721413721415, "grad_norm": 12.727667808532715, "learning_rate": 4.261954261954262e-06, "loss": 1.9023, "step": 206 }, { "epoch": 0.2151767151767152, "grad_norm": 6.942087650299072, "learning_rate": 4.282744282744284e-06, "loss": 1.5327, "step": 207 }, { "epoch": 0.21621621621621623, "grad_norm": 8.110414505004883, "learning_rate": 4.303534303534304e-06, "loss": 1.6922, "step": 208 }, { "epoch": 0.21725571725571727, "grad_norm": 8.410806655883789, "learning_rate": 4.324324324324325e-06, "loss": 1.6059, "step": 209 }, { "epoch": 0.2182952182952183, "grad_norm": 10.630267143249512, "learning_rate": 4.345114345114346e-06, "loss": 1.5802, "step": 210 }, { "epoch": 0.21933471933471935, "grad_norm": 11.829726219177246, "learning_rate": 4.3659043659043665e-06, "loss": 1.856, "step": 211 }, { "epoch": 0.2203742203742204, "grad_norm": 8.306093215942383, "learning_rate": 4.386694386694387e-06, "loss": 1.494, "step": 212 }, { "epoch": 0.22141372141372143, "grad_norm": 9.279668807983398, "learning_rate": 4.4074844074844084e-06, "loss": 1.5234, "step": 213 }, { "epoch": 0.22245322245322247, "grad_norm": 18.21111297607422, "learning_rate": 4.428274428274429e-06, "loss": 2.0073, "step": 214 }, { "epoch": 0.2234927234927235, "grad_norm": 14.216645240783691, "learning_rate": 4.4490644490644495e-06, "loss": 1.334, "step": 215 }, { "epoch": 0.22453222453222454, "grad_norm": 8.353694915771484, "learning_rate": 4.469854469854471e-06, "loss": 1.7931, "step": 216 }, { "epoch": 0.22557172557172558, "grad_norm": 12.420600891113281, "learning_rate": 4.490644490644491e-06, "loss": 1.8834, "step": 217 }, { "epoch": 0.22661122661122662, "grad_norm": 7.537139892578125, "learning_rate": 4.511434511434512e-06, "loss": 1.5505, "step": 218 }, { "epoch": 0.22765072765072766, "grad_norm": 8.061205863952637, "learning_rate": 4.532224532224533e-06, "loss": 1.7889, "step": 219 }, { "epoch": 0.2286902286902287, "grad_norm": 
7.974506378173828, "learning_rate": 4.553014553014554e-06, "loss": 1.5875, "step": 220 }, { "epoch": 0.22972972972972974, "grad_norm": 8.587332725524902, "learning_rate": 4.573804573804574e-06, "loss": 1.1923, "step": 221 }, { "epoch": 0.23076923076923078, "grad_norm": 7.924431800842285, "learning_rate": 4.594594594594596e-06, "loss": 1.1502, "step": 222 }, { "epoch": 0.23180873180873182, "grad_norm": 11.359457015991211, "learning_rate": 4.615384615384616e-06, "loss": 1.6597, "step": 223 }, { "epoch": 0.23284823284823286, "grad_norm": 11.588187217712402, "learning_rate": 4.636174636174637e-06, "loss": 1.9984, "step": 224 }, { "epoch": 0.2338877338877339, "grad_norm": 9.252629280090332, "learning_rate": 4.656964656964657e-06, "loss": 1.8034, "step": 225 }, { "epoch": 0.23492723492723494, "grad_norm": 8.870009422302246, "learning_rate": 4.6777546777546786e-06, "loss": 1.7183, "step": 226 }, { "epoch": 0.23596673596673598, "grad_norm": 6.966862678527832, "learning_rate": 4.698544698544699e-06, "loss": 1.6209, "step": 227 }, { "epoch": 0.23700623700623702, "grad_norm": 6.796176433563232, "learning_rate": 4.71933471933472e-06, "loss": 1.9011, "step": 228 }, { "epoch": 0.23804573804573806, "grad_norm": 8.598974227905273, "learning_rate": 4.740124740124741e-06, "loss": 1.6843, "step": 229 }, { "epoch": 0.2390852390852391, "grad_norm": 8.327461242675781, "learning_rate": 4.7609147609147615e-06, "loss": 1.737, "step": 230 }, { "epoch": 0.24012474012474014, "grad_norm": 9.38319206237793, "learning_rate": 4.781704781704782e-06, "loss": 1.6075, "step": 231 }, { "epoch": 0.24116424116424118, "grad_norm": 5.437087059020996, "learning_rate": 4.802494802494803e-06, "loss": 1.1498, "step": 232 }, { "epoch": 0.24220374220374222, "grad_norm": 8.21915054321289, "learning_rate": 4.823284823284824e-06, "loss": 1.6603, "step": 233 }, { "epoch": 0.24324324324324326, "grad_norm": 8.151537895202637, "learning_rate": 4.844074844074844e-06, "loss": 1.4857, "step": 234 }, { "epoch": 
0.2442827442827443, "grad_norm": 7.525641918182373, "learning_rate": 4.864864864864866e-06, "loss": 1.5105, "step": 235 }, { "epoch": 0.24532224532224534, "grad_norm": 7.916234970092773, "learning_rate": 4.885654885654886e-06, "loss": 1.5264, "step": 236 }, { "epoch": 0.24636174636174638, "grad_norm": 9.898544311523438, "learning_rate": 4.906444906444907e-06, "loss": 1.782, "step": 237 }, { "epoch": 0.24740124740124741, "grad_norm": 9.978106498718262, "learning_rate": 4.927234927234928e-06, "loss": 1.5498, "step": 238 }, { "epoch": 0.24844074844074845, "grad_norm": 5.927715301513672, "learning_rate": 4.948024948024949e-06, "loss": 1.1964, "step": 239 }, { "epoch": 0.2494802494802495, "grad_norm": 8.933344841003418, "learning_rate": 4.968814968814969e-06, "loss": 1.5373, "step": 240 }, { "epoch": 0.2505197505197505, "grad_norm": 18.588769912719727, "learning_rate": 4.9896049896049906e-06, "loss": 1.8564, "step": 241 }, { "epoch": 0.2515592515592516, "grad_norm": 8.533500671386719, "learning_rate": 5.01039501039501e-06, "loss": 1.7192, "step": 242 }, { "epoch": 0.2525987525987526, "grad_norm": 9.05661392211914, "learning_rate": 5.031185031185032e-06, "loss": 1.7023, "step": 243 }, { "epoch": 0.25363825363825365, "grad_norm": 7.37769079208374, "learning_rate": 5.051975051975052e-06, "loss": 1.5155, "step": 244 }, { "epoch": 0.25467775467775466, "grad_norm": 8.445049285888672, "learning_rate": 5.0727650727650735e-06, "loss": 1.4876, "step": 245 }, { "epoch": 0.25571725571725573, "grad_norm": 10.22825813293457, "learning_rate": 5.093555093555094e-06, "loss": 1.658, "step": 246 }, { "epoch": 0.25675675675675674, "grad_norm": 8.475323677062988, "learning_rate": 5.114345114345115e-06, "loss": 1.9086, "step": 247 }, { "epoch": 0.2577962577962578, "grad_norm": 9.246431350708008, "learning_rate": 5.135135135135135e-06, "loss": 1.4162, "step": 248 }, { "epoch": 0.2588357588357588, "grad_norm": 11.256653785705566, "learning_rate": 5.155925155925156e-06, "loss": 1.6352, "step": 
249 }, { "epoch": 0.2598752598752599, "grad_norm": 8.728947639465332, "learning_rate": 5.176715176715177e-06, "loss": 1.4578, "step": 250 }, { "epoch": 0.2609147609147609, "grad_norm": 10.811807632446289, "learning_rate": 5.197505197505198e-06, "loss": 1.5422, "step": 251 }, { "epoch": 0.26195426195426197, "grad_norm": 8.760584831237793, "learning_rate": 5.218295218295219e-06, "loss": 1.6168, "step": 252 }, { "epoch": 0.262993762993763, "grad_norm": 11.621368408203125, "learning_rate": 5.23908523908524e-06, "loss": 1.7856, "step": 253 }, { "epoch": 0.26403326403326405, "grad_norm": 9.020323753356934, "learning_rate": 5.25987525987526e-06, "loss": 1.2882, "step": 254 }, { "epoch": 0.26507276507276506, "grad_norm": 9.391657829284668, "learning_rate": 5.280665280665281e-06, "loss": 1.7343, "step": 255 }, { "epoch": 0.2661122661122661, "grad_norm": 9.435623168945312, "learning_rate": 5.301455301455302e-06, "loss": 1.4503, "step": 256 }, { "epoch": 0.26715176715176714, "grad_norm": 9.689167022705078, "learning_rate": 5.322245322245323e-06, "loss": 1.8933, "step": 257 }, { "epoch": 0.2681912681912682, "grad_norm": 10.120408058166504, "learning_rate": 5.343035343035344e-06, "loss": 1.6382, "step": 258 }, { "epoch": 0.2692307692307692, "grad_norm": 8.142711639404297, "learning_rate": 5.363825363825364e-06, "loss": 1.5524, "step": 259 }, { "epoch": 0.2702702702702703, "grad_norm": 7.626992702484131, "learning_rate": 5.384615384615385e-06, "loss": 1.1425, "step": 260 }, { "epoch": 0.2713097713097713, "grad_norm": 11.374751091003418, "learning_rate": 5.405405405405406e-06, "loss": 1.36, "step": 261 }, { "epoch": 0.27234927234927236, "grad_norm": 8.555777549743652, "learning_rate": 5.4261954261954265e-06, "loss": 1.4073, "step": 262 }, { "epoch": 0.2733887733887734, "grad_norm": 8.723259925842285, "learning_rate": 5.446985446985448e-06, "loss": 1.2645, "step": 263 }, { "epoch": 0.27442827442827444, "grad_norm": 12.614055633544922, "learning_rate": 5.467775467775468e-06, 
"loss": 1.5528, "step": 264 }, { "epoch": 0.27546777546777546, "grad_norm": 13.636921882629395, "learning_rate": 5.488565488565489e-06, "loss": 1.841, "step": 265 }, { "epoch": 0.2765072765072765, "grad_norm": 11.47486686706543, "learning_rate": 5.5093555093555095e-06, "loss": 1.7953, "step": 266 }, { "epoch": 0.27754677754677753, "grad_norm": 9.173731803894043, "learning_rate": 5.530145530145531e-06, "loss": 1.2862, "step": 267 }, { "epoch": 0.2785862785862786, "grad_norm": 12.41637897491455, "learning_rate": 5.550935550935551e-06, "loss": 1.5515, "step": 268 }, { "epoch": 0.2796257796257796, "grad_norm": 11.649749755859375, "learning_rate": 5.571725571725573e-06, "loss": 1.8494, "step": 269 }, { "epoch": 0.2806652806652807, "grad_norm": 12.394052505493164, "learning_rate": 5.592515592515592e-06, "loss": 1.5484, "step": 270 }, { "epoch": 0.2817047817047817, "grad_norm": 9.108813285827637, "learning_rate": 5.613305613305614e-06, "loss": 1.348, "step": 271 }, { "epoch": 0.28274428274428276, "grad_norm": 10.887696266174316, "learning_rate": 5.634095634095634e-06, "loss": 1.3352, "step": 272 }, { "epoch": 0.28378378378378377, "grad_norm": 13.775224685668945, "learning_rate": 5.654885654885656e-06, "loss": 1.3866, "step": 273 }, { "epoch": 0.28482328482328484, "grad_norm": 12.727477073669434, "learning_rate": 5.675675675675676e-06, "loss": 1.1752, "step": 274 }, { "epoch": 0.28586278586278585, "grad_norm": 14.530426979064941, "learning_rate": 5.6964656964656975e-06, "loss": 1.6287, "step": 275 }, { "epoch": 0.2869022869022869, "grad_norm": 12.435869216918945, "learning_rate": 5.717255717255717e-06, "loss": 1.9149, "step": 276 }, { "epoch": 0.28794178794178793, "grad_norm": 9.951990127563477, "learning_rate": 5.7380457380457386e-06, "loss": 1.3319, "step": 277 }, { "epoch": 0.288981288981289, "grad_norm": 9.536114692687988, "learning_rate": 5.758835758835759e-06, "loss": 1.2908, "step": 278 }, { "epoch": 0.29002079002079, "grad_norm": 10.469128608703613, 
"learning_rate": 5.7796257796257805e-06, "loss": 1.3363, "step": 279 }, { "epoch": 0.2910602910602911, "grad_norm": 9.150140762329102, "learning_rate": 5.800415800415801e-06, "loss": 1.376, "step": 280 }, { "epoch": 0.2920997920997921, "grad_norm": 9.672096252441406, "learning_rate": 5.821205821205822e-06, "loss": 1.4166, "step": 281 }, { "epoch": 0.29313929313929316, "grad_norm": 7.898129940032959, "learning_rate": 5.841995841995842e-06, "loss": 1.1823, "step": 282 }, { "epoch": 0.29417879417879417, "grad_norm": 12.452738761901855, "learning_rate": 5.862785862785863e-06, "loss": 1.5592, "step": 283 }, { "epoch": 0.29521829521829523, "grad_norm": 8.472002029418945, "learning_rate": 5.883575883575884e-06, "loss": 1.2292, "step": 284 }, { "epoch": 0.29625779625779625, "grad_norm": 10.24071979522705, "learning_rate": 5.904365904365905e-06, "loss": 1.4567, "step": 285 }, { "epoch": 0.2972972972972973, "grad_norm": 13.903212547302246, "learning_rate": 5.925155925155926e-06, "loss": 1.7304, "step": 286 }, { "epoch": 0.2983367983367983, "grad_norm": 13.026294708251953, "learning_rate": 5.945945945945947e-06, "loss": 1.3533, "step": 287 }, { "epoch": 0.2993762993762994, "grad_norm": 8.777679443359375, "learning_rate": 5.966735966735967e-06, "loss": 1.3465, "step": 288 }, { "epoch": 0.3004158004158004, "grad_norm": 8.598836898803711, "learning_rate": 5.987525987525988e-06, "loss": 1.5693, "step": 289 }, { "epoch": 0.30145530145530147, "grad_norm": 12.184210777282715, "learning_rate": 6.008316008316009e-06, "loss": 1.3762, "step": 290 }, { "epoch": 0.3024948024948025, "grad_norm": 10.718639373779297, "learning_rate": 6.02910602910603e-06, "loss": 1.7274, "step": 291 }, { "epoch": 0.30353430353430355, "grad_norm": 7.593832492828369, "learning_rate": 6.049896049896051e-06, "loss": 1.625, "step": 292 }, { "epoch": 0.30457380457380456, "grad_norm": 11.308578491210938, "learning_rate": 6.070686070686071e-06, "loss": 1.3614, "step": 293 }, { "epoch": 0.30561330561330563, 
"grad_norm": 8.035221099853516, "learning_rate": 6.091476091476092e-06, "loss": 1.2422, "step": 294 }, { "epoch": 0.30665280665280664, "grad_norm": 8.732400894165039, "learning_rate": 6.112266112266113e-06, "loss": 1.6587, "step": 295 }, { "epoch": 0.3076923076923077, "grad_norm": 8.046781539916992, "learning_rate": 6.1330561330561335e-06, "loss": 1.4965, "step": 296 }, { "epoch": 0.3087318087318087, "grad_norm": 7.799867153167725, "learning_rate": 6.153846153846155e-06, "loss": 1.2193, "step": 297 }, { "epoch": 0.3097713097713098, "grad_norm": 10.253771781921387, "learning_rate": 6.1746361746361745e-06, "loss": 1.7422, "step": 298 }, { "epoch": 0.3108108108108108, "grad_norm": 11.052000999450684, "learning_rate": 6.195426195426196e-06, "loss": 1.1786, "step": 299 }, { "epoch": 0.31185031185031187, "grad_norm": 13.860453605651855, "learning_rate": 6.2162162162162164e-06, "loss": 1.4553, "step": 300 }, { "epoch": 0.3128898128898129, "grad_norm": 10.874279022216797, "learning_rate": 6.237006237006238e-06, "loss": 1.2194, "step": 301 }, { "epoch": 0.31392931392931395, "grad_norm": 8.688791275024414, "learning_rate": 6.257796257796258e-06, "loss": 1.3588, "step": 302 }, { "epoch": 0.31496881496881496, "grad_norm": 12.43751335144043, "learning_rate": 6.27858627858628e-06, "loss": 1.1144, "step": 303 }, { "epoch": 0.316008316008316, "grad_norm": 11.306876182556152, "learning_rate": 6.299376299376299e-06, "loss": 1.4284, "step": 304 }, { "epoch": 0.31704781704781704, "grad_norm": 11.36539077758789, "learning_rate": 6.320166320166321e-06, "loss": 1.4414, "step": 305 }, { "epoch": 0.3180873180873181, "grad_norm": 11.967350006103516, "learning_rate": 6.340956340956341e-06, "loss": 1.43, "step": 306 }, { "epoch": 0.3191268191268191, "grad_norm": 8.277191162109375, "learning_rate": 6.361746361746363e-06, "loss": 1.0795, "step": 307 }, { "epoch": 0.3201663201663202, "grad_norm": 13.161870002746582, "learning_rate": 6.382536382536383e-06, "loss": 1.4126, "step": 308 }, { 
"epoch": 0.3212058212058212, "grad_norm": 8.901982307434082, "learning_rate": 6.4033264033264045e-06, "loss": 1.0459, "step": 309 }, { "epoch": 0.32224532224532226, "grad_norm": 9.05133056640625, "learning_rate": 6.424116424116424e-06, "loss": 0.8949, "step": 310 }, { "epoch": 0.3232848232848233, "grad_norm": 11.73928165435791, "learning_rate": 6.4449064449064455e-06, "loss": 1.1428, "step": 311 }, { "epoch": 0.32432432432432434, "grad_norm": 11.88116455078125, "learning_rate": 6.465696465696466e-06, "loss": 1.2491, "step": 312 }, { "epoch": 0.32536382536382535, "grad_norm": 8.605231285095215, "learning_rate": 6.486486486486487e-06, "loss": 1.0301, "step": 313 }, { "epoch": 0.3264033264033264, "grad_norm": 12.880175590515137, "learning_rate": 6.507276507276508e-06, "loss": 1.3983, "step": 314 }, { "epoch": 0.32744282744282743, "grad_norm": 8.701872825622559, "learning_rate": 6.528066528066529e-06, "loss": 1.5113, "step": 315 }, { "epoch": 0.3284823284823285, "grad_norm": 8.418421745300293, "learning_rate": 6.548856548856549e-06, "loss": 1.1952, "step": 316 }, { "epoch": 0.3295218295218295, "grad_norm": 12.062923431396484, "learning_rate": 6.56964656964657e-06, "loss": 1.3726, "step": 317 }, { "epoch": 0.3305613305613306, "grad_norm": 10.043243408203125, "learning_rate": 6.590436590436591e-06, "loss": 1.295, "step": 318 }, { "epoch": 0.3316008316008316, "grad_norm": 11.718816757202148, "learning_rate": 6.611226611226612e-06, "loss": 0.8623, "step": 319 }, { "epoch": 0.33264033264033266, "grad_norm": 11.69416332244873, "learning_rate": 6.632016632016633e-06, "loss": 0.9903, "step": 320 }, { "epoch": 0.33367983367983367, "grad_norm": 12.75631046295166, "learning_rate": 6.652806652806654e-06, "loss": 0.8313, "step": 321 }, { "epoch": 0.33471933471933474, "grad_norm": 8.642829895019531, "learning_rate": 6.673596673596674e-06, "loss": 0.8052, "step": 322 }, { "epoch": 0.33575883575883575, "grad_norm": 7.5102152824401855, "learning_rate": 6.694386694386695e-06, "loss": 
0.812, "step": 323 }, { "epoch": 0.3367983367983368, "grad_norm": 15.602274894714355, "learning_rate": 6.715176715176716e-06, "loss": 0.8896, "step": 324 }, { "epoch": 0.33783783783783783, "grad_norm": 19.65711784362793, "learning_rate": 6.735966735966737e-06, "loss": 1.3914, "step": 325 }, { "epoch": 0.3388773388773389, "grad_norm": 8.210640907287598, "learning_rate": 6.7567567567567575e-06, "loss": 1.0448, "step": 326 }, { "epoch": 0.3399168399168399, "grad_norm": 15.147570610046387, "learning_rate": 6.777546777546778e-06, "loss": 1.1706, "step": 327 }, { "epoch": 0.340956340956341, "grad_norm": 29.13080406188965, "learning_rate": 6.7983367983367986e-06, "loss": 1.3041, "step": 328 }, { "epoch": 0.341995841995842, "grad_norm": 16.381385803222656, "learning_rate": 6.81912681912682e-06, "loss": 1.0187, "step": 329 }, { "epoch": 0.34303534303534305, "grad_norm": 13.570858001708984, "learning_rate": 6.8399168399168405e-06, "loss": 1.3256, "step": 330 }, { "epoch": 0.34407484407484407, "grad_norm": 15.603581428527832, "learning_rate": 6.860706860706862e-06, "loss": 1.0822, "step": 331 }, { "epoch": 0.34511434511434513, "grad_norm": 21.508617401123047, "learning_rate": 6.8814968814968815e-06, "loss": 1.5841, "step": 332 }, { "epoch": 0.34615384615384615, "grad_norm": 9.773581504821777, "learning_rate": 6.902286902286903e-06, "loss": 0.7203, "step": 333 }, { "epoch": 0.3471933471933472, "grad_norm": 17.405637741088867, "learning_rate": 6.923076923076923e-06, "loss": 0.9002, "step": 334 }, { "epoch": 0.3482328482328482, "grad_norm": 16.036029815673828, "learning_rate": 6.943866943866945e-06, "loss": 1.1254, "step": 335 }, { "epoch": 0.3492723492723493, "grad_norm": 8.478259086608887, "learning_rate": 6.964656964656965e-06, "loss": 0.7698, "step": 336 }, { "epoch": 0.3503118503118503, "grad_norm": 11.398589134216309, "learning_rate": 6.985446985446987e-06, "loss": 0.8371, "step": 337 }, { "epoch": 0.35135135135135137, "grad_norm": 10.781030654907227, "learning_rate": 
7.006237006237006e-06, "loss": 1.2204, "step": 338 }, { "epoch": 0.3523908523908524, "grad_norm": 13.479138374328613, "learning_rate": 7.027027027027028e-06, "loss": 1.3343, "step": 339 }, { "epoch": 0.35343035343035345, "grad_norm": 10.525655746459961, "learning_rate": 7.047817047817048e-06, "loss": 1.4545, "step": 340 }, { "epoch": 0.35446985446985446, "grad_norm": 11.65915298461914, "learning_rate": 7.0686070686070696e-06, "loss": 0.7949, "step": 341 }, { "epoch": 0.35550935550935553, "grad_norm": 20.972801208496094, "learning_rate": 7.08939708939709e-06, "loss": 1.2358, "step": 342 }, { "epoch": 0.35654885654885654, "grad_norm": 12.074674606323242, "learning_rate": 7.1101871101871114e-06, "loss": 0.8076, "step": 343 }, { "epoch": 0.3575883575883576, "grad_norm": 12.374785423278809, "learning_rate": 7.130977130977131e-06, "loss": 1.2175, "step": 344 }, { "epoch": 0.3586278586278586, "grad_norm": 12.02391242980957, "learning_rate": 7.1517671517671525e-06, "loss": 0.6736, "step": 345 }, { "epoch": 0.3596673596673597, "grad_norm": 14.542448043823242, "learning_rate": 7.172557172557173e-06, "loss": 0.9352, "step": 346 }, { "epoch": 0.3607068607068607, "grad_norm": 25.29374122619629, "learning_rate": 7.193347193347194e-06, "loss": 1.2332, "step": 347 }, { "epoch": 0.36174636174636177, "grad_norm": 10.751561164855957, "learning_rate": 7.214137214137215e-06, "loss": 1.0363, "step": 348 }, { "epoch": 0.3627858627858628, "grad_norm": 6.808762550354004, "learning_rate": 7.234927234927236e-06, "loss": 0.5515, "step": 349 }, { "epoch": 0.36382536382536385, "grad_norm": 15.856203079223633, "learning_rate": 7.255717255717256e-06, "loss": 0.9221, "step": 350 }, { "epoch": 0.36486486486486486, "grad_norm": 9.114971160888672, "learning_rate": 7.276507276507277e-06, "loss": 0.8534, "step": 351 }, { "epoch": 0.3659043659043659, "grad_norm": 14.44951057434082, "learning_rate": 7.297297297297298e-06, "loss": 1.0257, "step": 352 }, { "epoch": 0.36694386694386694, "grad_norm": 
10.981499671936035, "learning_rate": 7.318087318087319e-06, "loss": 1.1816, "step": 353 }, { "epoch": 0.367983367983368, "grad_norm": 11.531734466552734, "learning_rate": 7.33887733887734e-06, "loss": 1.1302, "step": 354 }, { "epoch": 0.369022869022869, "grad_norm": 14.686858177185059, "learning_rate": 7.359667359667361e-06, "loss": 1.2317, "step": 355 }, { "epoch": 0.3700623700623701, "grad_norm": 6.359785079956055, "learning_rate": 7.380457380457381e-06, "loss": 0.6505, "step": 356 }, { "epoch": 0.3711018711018711, "grad_norm": 9.21711540222168, "learning_rate": 7.401247401247402e-06, "loss": 0.902, "step": 357 }, { "epoch": 0.37214137214137216, "grad_norm": 7.829357147216797, "learning_rate": 7.422037422037423e-06, "loss": 0.7133, "step": 358 }, { "epoch": 0.3731808731808732, "grad_norm": 14.824617385864258, "learning_rate": 7.442827442827444e-06, "loss": 1.5177, "step": 359 }, { "epoch": 0.37422037422037424, "grad_norm": 13.023225784301758, "learning_rate": 7.4636174636174645e-06, "loss": 1.1179, "step": 360 }, { "epoch": 0.37525987525987525, "grad_norm": 11.518128395080566, "learning_rate": 7.484407484407485e-06, "loss": 0.9857, "step": 361 }, { "epoch": 0.3762993762993763, "grad_norm": 9.43225383758545, "learning_rate": 7.5051975051975055e-06, "loss": 0.9234, "step": 362 }, { "epoch": 0.37733887733887733, "grad_norm": 10.689541816711426, "learning_rate": 7.525987525987527e-06, "loss": 0.9689, "step": 363 }, { "epoch": 0.3783783783783784, "grad_norm": 12.340954780578613, "learning_rate": 7.546777546777547e-06, "loss": 0.887, "step": 364 }, { "epoch": 0.3794178794178794, "grad_norm": 7.20005464553833, "learning_rate": 7.567567567567569e-06, "loss": 0.5674, "step": 365 }, { "epoch": 0.3804573804573805, "grad_norm": 10.14612865447998, "learning_rate": 7.5883575883575885e-06, "loss": 0.7206, "step": 366 }, { "epoch": 0.3814968814968815, "grad_norm": 19.117307662963867, "learning_rate": 7.60914760914761e-06, "loss": 1.5382, "step": 367 }, { "epoch": 
0.38253638253638256, "grad_norm": 13.663631439208984, "learning_rate": 7.629937629937631e-06, "loss": 1.1351, "step": 368 }, { "epoch": 0.38357588357588357, "grad_norm": 10.310964584350586, "learning_rate": 7.650727650727653e-06, "loss": 1.2692, "step": 369 }, { "epoch": 0.38461538461538464, "grad_norm": 7.663285255432129, "learning_rate": 7.671517671517672e-06, "loss": 0.4554, "step": 370 }, { "epoch": 0.38565488565488565, "grad_norm": 10.936558723449707, "learning_rate": 7.692307692307694e-06, "loss": 0.8166, "step": 371 }, { "epoch": 0.3866943866943867, "grad_norm": 6.133451461791992, "learning_rate": 7.713097713097713e-06, "loss": 0.6886, "step": 372 }, { "epoch": 0.3877338877338877, "grad_norm": 9.430103302001953, "learning_rate": 7.733887733887735e-06, "loss": 1.0831, "step": 373 }, { "epoch": 0.3887733887733888, "grad_norm": 6.969508647918701, "learning_rate": 7.754677754677756e-06, "loss": 0.65, "step": 374 }, { "epoch": 0.3898128898128898, "grad_norm": 8.69268798828125, "learning_rate": 7.775467775467777e-06, "loss": 1.1201, "step": 375 }, { "epoch": 0.3908523908523909, "grad_norm": 6.384047031402588, "learning_rate": 7.796257796257797e-06, "loss": 0.5161, "step": 376 }, { "epoch": 0.3918918918918919, "grad_norm": 8.49499797821045, "learning_rate": 7.817047817047818e-06, "loss": 0.6475, "step": 377 }, { "epoch": 0.39293139293139295, "grad_norm": 7.770305633544922, "learning_rate": 7.837837837837838e-06, "loss": 0.597, "step": 378 }, { "epoch": 0.39397089397089397, "grad_norm": 10.950555801391602, "learning_rate": 7.85862785862786e-06, "loss": 1.07, "step": 379 }, { "epoch": 0.39501039501039503, "grad_norm": 9.929052352905273, "learning_rate": 7.879417879417879e-06, "loss": 0.9038, "step": 380 }, { "epoch": 0.39604989604989604, "grad_norm": 10.960211753845215, "learning_rate": 7.9002079002079e-06, "loss": 1.0922, "step": 381 }, { "epoch": 0.3970893970893971, "grad_norm": 13.324213027954102, "learning_rate": 7.920997920997922e-06, "loss": 1.3425, "step": 382 
}, { "epoch": 0.3981288981288981, "grad_norm": 11.701374053955078, "learning_rate": 7.941787941787943e-06, "loss": 0.9576, "step": 383 }, { "epoch": 0.3991683991683992, "grad_norm": 11.246499061584473, "learning_rate": 7.962577962577963e-06, "loss": 1.0604, "step": 384 }, { "epoch": 0.4002079002079002, "grad_norm": 6.898358345031738, "learning_rate": 7.983367983367984e-06, "loss": 0.4263, "step": 385 }, { "epoch": 0.40124740124740127, "grad_norm": 7.588034152984619, "learning_rate": 8.004158004158004e-06, "loss": 0.6515, "step": 386 }, { "epoch": 0.4022869022869023, "grad_norm": 8.791478157043457, "learning_rate": 8.024948024948025e-06, "loss": 0.5638, "step": 387 }, { "epoch": 0.40332640332640335, "grad_norm": 12.265979766845703, "learning_rate": 8.045738045738047e-06, "loss": 0.9777, "step": 388 }, { "epoch": 0.40436590436590436, "grad_norm": 12.479912757873535, "learning_rate": 8.066528066528068e-06, "loss": 1.2196, "step": 389 }, { "epoch": 0.40540540540540543, "grad_norm": 10.367191314697266, "learning_rate": 8.087318087318088e-06, "loss": 1.1416, "step": 390 }, { "epoch": 0.40644490644490644, "grad_norm": 6.253833770751953, "learning_rate": 8.108108108108109e-06, "loss": 0.4802, "step": 391 }, { "epoch": 0.4074844074844075, "grad_norm": 14.616800308227539, "learning_rate": 8.128898128898129e-06, "loss": 1.2278, "step": 392 }, { "epoch": 0.4085239085239085, "grad_norm": 12.594634056091309, "learning_rate": 8.14968814968815e-06, "loss": 1.0137, "step": 393 }, { "epoch": 0.4095634095634096, "grad_norm": 11.438323020935059, "learning_rate": 8.170478170478171e-06, "loss": 1.019, "step": 394 }, { "epoch": 0.4106029106029106, "grad_norm": 13.236550331115723, "learning_rate": 8.191268191268193e-06, "loss": 0.7806, "step": 395 }, { "epoch": 0.41164241164241167, "grad_norm": 4.43021297454834, "learning_rate": 8.212058212058212e-06, "loss": 0.2711, "step": 396 }, { "epoch": 0.4126819126819127, "grad_norm": 7.386447429656982, "learning_rate": 8.232848232848234e-06, 
"loss": 0.5482, "step": 397 }, { "epoch": 0.41372141372141374, "grad_norm": 12.167444229125977, "learning_rate": 8.253638253638254e-06, "loss": 1.1778, "step": 398 }, { "epoch": 0.41476091476091476, "grad_norm": 12.794252395629883, "learning_rate": 8.274428274428275e-06, "loss": 0.8015, "step": 399 }, { "epoch": 0.4158004158004158, "grad_norm": 10.782106399536133, "learning_rate": 8.295218295218296e-06, "loss": 1.0166, "step": 400 }, { "epoch": 0.41683991683991684, "grad_norm": 9.652565956115723, "learning_rate": 8.316008316008318e-06, "loss": 0.6135, "step": 401 }, { "epoch": 0.4178794178794179, "grad_norm": 10.046876907348633, "learning_rate": 8.336798336798337e-06, "loss": 0.6397, "step": 402 }, { "epoch": 0.4189189189189189, "grad_norm": 3.3654232025146484, "learning_rate": 8.357588357588359e-06, "loss": 0.1272, "step": 403 }, { "epoch": 0.41995841995842, "grad_norm": 8.895442008972168, "learning_rate": 8.378378378378378e-06, "loss": 0.3683, "step": 404 }, { "epoch": 0.420997920997921, "grad_norm": 8.768726348876953, "learning_rate": 8.3991683991684e-06, "loss": 0.4138, "step": 405 }, { "epoch": 0.42203742203742206, "grad_norm": 8.897137641906738, "learning_rate": 8.419958419958421e-06, "loss": 0.5625, "step": 406 }, { "epoch": 0.4230769230769231, "grad_norm": 9.527857780456543, "learning_rate": 8.440748440748442e-06, "loss": 0.5531, "step": 407 }, { "epoch": 0.42411642411642414, "grad_norm": 21.51705551147461, "learning_rate": 8.461538461538462e-06, "loss": 1.2941, "step": 408 }, { "epoch": 0.42515592515592515, "grad_norm": 22.675201416015625, "learning_rate": 8.482328482328483e-06, "loss": 1.3033, "step": 409 }, { "epoch": 0.4261954261954262, "grad_norm": 7.007496356964111, "learning_rate": 8.503118503118503e-06, "loss": 0.2945, "step": 410 }, { "epoch": 0.42723492723492723, "grad_norm": 9.356853485107422, "learning_rate": 8.523908523908525e-06, "loss": 0.4295, "step": 411 }, { "epoch": 0.4282744282744283, "grad_norm": 12.781023025512695, "learning_rate": 
8.544698544698546e-06, "loss": 0.8006, "step": 412 }, { "epoch": 0.4293139293139293, "grad_norm": 15.798759460449219, "learning_rate": 8.565488565488567e-06, "loss": 0.871, "step": 413 }, { "epoch": 0.4303534303534304, "grad_norm": 12.773930549621582, "learning_rate": 8.586278586278587e-06, "loss": 0.5821, "step": 414 }, { "epoch": 0.4313929313929314, "grad_norm": 10.541467666625977, "learning_rate": 8.607068607068608e-06, "loss": 0.6077, "step": 415 }, { "epoch": 0.43243243243243246, "grad_norm": 9.462098121643066, "learning_rate": 8.627858627858628e-06, "loss": 0.3352, "step": 416 }, { "epoch": 0.43347193347193347, "grad_norm": 10.985498428344727, "learning_rate": 8.64864864864865e-06, "loss": 0.4576, "step": 417 }, { "epoch": 0.43451143451143454, "grad_norm": 15.112650871276855, "learning_rate": 8.66943866943867e-06, "loss": 0.8339, "step": 418 }, { "epoch": 0.43555093555093555, "grad_norm": 13.53800106048584, "learning_rate": 8.690228690228692e-06, "loss": 0.7046, "step": 419 }, { "epoch": 0.4365904365904366, "grad_norm": 15.45318603515625, "learning_rate": 8.711018711018712e-06, "loss": 0.6397, "step": 420 }, { "epoch": 0.4376299376299376, "grad_norm": 1.3829221725463867, "learning_rate": 8.731808731808733e-06, "loss": 0.0236, "step": 421 }, { "epoch": 0.4386694386694387, "grad_norm": 5.065361976623535, "learning_rate": 8.752598752598753e-06, "loss": 0.161, "step": 422 }, { "epoch": 0.4397089397089397, "grad_norm": 6.4144368171691895, "learning_rate": 8.773388773388774e-06, "loss": 0.1571, "step": 423 }, { "epoch": 0.4407484407484408, "grad_norm": 11.337523460388184, "learning_rate": 8.794178794178795e-06, "loss": 0.4098, "step": 424 }, { "epoch": 0.4417879417879418, "grad_norm": 10.023104667663574, "learning_rate": 8.814968814968817e-06, "loss": 0.6923, "step": 425 }, { "epoch": 0.44282744282744285, "grad_norm": 3.946394920349121, "learning_rate": 8.835758835758837e-06, "loss": 0.0945, "step": 426 }, { "epoch": 0.44386694386694386, "grad_norm": 
17.10584259033203, "learning_rate": 8.856548856548858e-06, "loss": 0.8002, "step": 427 }, { "epoch": 0.44490644490644493, "grad_norm": 11.162398338317871, "learning_rate": 8.877338877338878e-06, "loss": 0.3048, "step": 428 }, { "epoch": 0.44594594594594594, "grad_norm": 9.242036819458008, "learning_rate": 8.898128898128899e-06, "loss": 0.1708, "step": 429 }, { "epoch": 0.446985446985447, "grad_norm": 1.562695860862732, "learning_rate": 8.91891891891892e-06, "loss": 0.0244, "step": 430 }, { "epoch": 0.448024948024948, "grad_norm": 9.712590217590332, "learning_rate": 8.939708939708942e-06, "loss": 0.1978, "step": 431 }, { "epoch": 0.4490644490644491, "grad_norm": 12.932109832763672, "learning_rate": 8.960498960498961e-06, "loss": 0.3598, "step": 432 }, { "epoch": 0.4501039501039501, "grad_norm": 7.306901454925537, "learning_rate": 8.981288981288983e-06, "loss": 0.1072, "step": 433 }, { "epoch": 0.45114345114345117, "grad_norm": 18.84465980529785, "learning_rate": 9.002079002079002e-06, "loss": 0.3701, "step": 434 }, { "epoch": 0.4521829521829522, "grad_norm": 13.76640796661377, "learning_rate": 9.022869022869024e-06, "loss": 0.3655, "step": 435 }, { "epoch": 0.45322245322245325, "grad_norm": 11.665372848510742, "learning_rate": 9.043659043659045e-06, "loss": 0.5584, "step": 436 }, { "epoch": 0.45426195426195426, "grad_norm": 15.73073959350586, "learning_rate": 9.064449064449066e-06, "loss": 0.7605, "step": 437 }, { "epoch": 0.4553014553014553, "grad_norm": 11.39462661743164, "learning_rate": 9.085239085239086e-06, "loss": 0.1828, "step": 438 }, { "epoch": 0.45634095634095634, "grad_norm": 13.786553382873535, "learning_rate": 9.106029106029107e-06, "loss": 0.4993, "step": 439 }, { "epoch": 0.4573804573804574, "grad_norm": 19.499631881713867, "learning_rate": 9.126819126819127e-06, "loss": 1.396, "step": 440 }, { "epoch": 0.4584199584199584, "grad_norm": 20.219745635986328, "learning_rate": 9.147609147609149e-06, "loss": 0.7873, "step": 441 }, { "epoch": 
0.4594594594594595, "grad_norm": 18.218852996826172, "learning_rate": 9.16839916839917e-06, "loss": 0.4743, "step": 442 }, { "epoch": 0.4604989604989605, "grad_norm": 19.086950302124023, "learning_rate": 9.189189189189191e-06, "loss": 0.3544, "step": 443 }, { "epoch": 0.46153846153846156, "grad_norm": 18.145187377929688, "learning_rate": 9.209979209979211e-06, "loss": 0.6076, "step": 444 }, { "epoch": 0.4625779625779626, "grad_norm": 7.627259731292725, "learning_rate": 9.230769230769232e-06, "loss": 0.1382, "step": 445 }, { "epoch": 0.46361746361746364, "grad_norm": 6.8412017822265625, "learning_rate": 9.251559251559252e-06, "loss": 0.1242, "step": 446 }, { "epoch": 0.46465696465696466, "grad_norm": 16.168813705444336, "learning_rate": 9.272349272349273e-06, "loss": 0.8019, "step": 447 }, { "epoch": 0.4656964656964657, "grad_norm": 13.182543754577637, "learning_rate": 9.293139293139293e-06, "loss": 0.2616, "step": 448 }, { "epoch": 0.46673596673596673, "grad_norm": 13.589136123657227, "learning_rate": 9.313929313929314e-06, "loss": 0.5943, "step": 449 }, { "epoch": 0.4677754677754678, "grad_norm": 15.259722709655762, "learning_rate": 9.334719334719336e-06, "loss": 0.2538, "step": 450 }, { "epoch": 0.4688149688149688, "grad_norm": 21.816892623901367, "learning_rate": 9.355509355509357e-06, "loss": 0.9268, "step": 451 }, { "epoch": 0.4698544698544699, "grad_norm": 13.662321090698242, "learning_rate": 9.376299376299377e-06, "loss": 0.4554, "step": 452 }, { "epoch": 0.4708939708939709, "grad_norm": 19.907556533813477, "learning_rate": 9.397089397089398e-06, "loss": 0.8886, "step": 453 }, { "epoch": 0.47193347193347196, "grad_norm": 11.883735656738281, "learning_rate": 9.417879417879418e-06, "loss": 0.1615, "step": 454 }, { "epoch": 0.47297297297297297, "grad_norm": 2.11840558052063, "learning_rate": 9.43866943866944e-06, "loss": 0.0334, "step": 455 }, { "epoch": 0.47401247401247404, "grad_norm": 23.978952407836914, "learning_rate": 9.45945945945946e-06, "loss": 0.9978, 
"step": 456 }, { "epoch": 0.47505197505197505, "grad_norm": 17.58618927001953, "learning_rate": 9.480249480249482e-06, "loss": 0.5197, "step": 457 }, { "epoch": 0.4760914760914761, "grad_norm": 7.399749279022217, "learning_rate": 9.501039501039502e-06, "loss": 0.1385, "step": 458 }, { "epoch": 0.47713097713097713, "grad_norm": 2.330472469329834, "learning_rate": 9.521829521829523e-06, "loss": 0.0326, "step": 459 }, { "epoch": 0.4781704781704782, "grad_norm": 8.512271881103516, "learning_rate": 9.542619542619543e-06, "loss": 0.2566, "step": 460 }, { "epoch": 0.4792099792099792, "grad_norm": 15.216105461120605, "learning_rate": 9.563409563409564e-06, "loss": 0.3799, "step": 461 }, { "epoch": 0.4802494802494803, "grad_norm": 27.830585479736328, "learning_rate": 9.584199584199585e-06, "loss": 0.7251, "step": 462 }, { "epoch": 0.4812889812889813, "grad_norm": 11.564122200012207, "learning_rate": 9.604989604989607e-06, "loss": 0.2673, "step": 463 }, { "epoch": 0.48232848232848236, "grad_norm": 19.60334587097168, "learning_rate": 9.625779625779626e-06, "loss": 0.5008, "step": 464 }, { "epoch": 0.48336798336798337, "grad_norm": 6.509657382965088, "learning_rate": 9.646569646569648e-06, "loss": 0.1444, "step": 465 }, { "epoch": 0.48440748440748443, "grad_norm": 11.038702011108398, "learning_rate": 9.667359667359667e-06, "loss": 0.3259, "step": 466 }, { "epoch": 0.48544698544698545, "grad_norm": 13.629114151000977, "learning_rate": 9.688149688149689e-06, "loss": 0.4144, "step": 467 }, { "epoch": 0.4864864864864865, "grad_norm": 3.2071421146392822, "learning_rate": 9.70893970893971e-06, "loss": 0.0468, "step": 468 }, { "epoch": 0.4875259875259875, "grad_norm": 14.734481811523438, "learning_rate": 9.729729729729732e-06, "loss": 0.4032, "step": 469 }, { "epoch": 0.4885654885654886, "grad_norm": 11.064592361450195, "learning_rate": 9.750519750519751e-06, "loss": 0.2775, "step": 470 }, { "epoch": 0.4896049896049896, "grad_norm": 18.813310623168945, "learning_rate": 
9.771309771309773e-06, "loss": 0.7688, "step": 471 }, { "epoch": 0.49064449064449067, "grad_norm": 13.356475830078125, "learning_rate": 9.792099792099792e-06, "loss": 0.2939, "step": 472 }, { "epoch": 0.4916839916839917, "grad_norm": 6.419248104095459, "learning_rate": 9.812889812889814e-06, "loss": 0.1099, "step": 473 }, { "epoch": 0.49272349272349275, "grad_norm": 16.916677474975586, "learning_rate": 9.833679833679835e-06, "loss": 0.5852, "step": 474 }, { "epoch": 0.49376299376299376, "grad_norm": 16.16600799560547, "learning_rate": 9.854469854469856e-06, "loss": 0.3559, "step": 475 }, { "epoch": 0.49480249480249483, "grad_norm": 18.88541030883789, "learning_rate": 9.875259875259876e-06, "loss": 0.475, "step": 476 }, { "epoch": 0.49584199584199584, "grad_norm": 7.591237545013428, "learning_rate": 9.896049896049897e-06, "loss": 0.1283, "step": 477 }, { "epoch": 0.4968814968814969, "grad_norm": 23.891620635986328, "learning_rate": 9.916839916839917e-06, "loss": 0.7828, "step": 478 }, { "epoch": 0.4979209979209979, "grad_norm": 7.804129600524902, "learning_rate": 9.937629937629938e-06, "loss": 0.0996, "step": 479 }, { "epoch": 0.498960498960499, "grad_norm": 15.474300384521484, "learning_rate": 9.95841995841996e-06, "loss": 0.4721, "step": 480 }, { "epoch": 0.5, "grad_norm": 8.639959335327148, "learning_rate": 9.979209979209981e-06, "loss": 0.1883, "step": 481 }, { "epoch": 0.501039501039501, "grad_norm": 19.550806045532227, "learning_rate": 1e-05, "loss": 0.6499, "step": 482 }, { "epoch": 0.502079002079002, "grad_norm": 17.1366024017334, "learning_rate": 9.997689997689999e-06, "loss": 0.6938, "step": 483 }, { "epoch": 0.5031185031185031, "grad_norm": 10.473664283752441, "learning_rate": 9.995379995379996e-06, "loss": 0.3482, "step": 484 }, { "epoch": 0.5041580041580042, "grad_norm": 35.813411712646484, "learning_rate": 9.993069993069994e-06, "loss": 1.2533, "step": 485 }, { "epoch": 0.5051975051975052, "grad_norm": 10.875179290771484, "learning_rate": 
9.990759990759992e-06, "loss": 0.4058, "step": 486 }, { "epoch": 0.5062370062370062, "grad_norm": 7.9114580154418945, "learning_rate": 9.98844998844999e-06, "loss": 0.1588, "step": 487 }, { "epoch": 0.5072765072765073, "grad_norm": 21.617277145385742, "learning_rate": 9.986139986139987e-06, "loss": 0.5731, "step": 488 }, { "epoch": 0.5083160083160083, "grad_norm": 11.489331245422363, "learning_rate": 9.983829983829985e-06, "loss": 0.1674, "step": 489 }, { "epoch": 0.5093555093555093, "grad_norm": 23.25141716003418, "learning_rate": 9.981519981519983e-06, "loss": 0.4789, "step": 490 }, { "epoch": 0.5103950103950103, "grad_norm": 14.97988224029541, "learning_rate": 9.979209979209981e-06, "loss": 0.2813, "step": 491 }, { "epoch": 0.5114345114345115, "grad_norm": 21.924203872680664, "learning_rate": 9.976899976899978e-06, "loss": 0.7728, "step": 492 }, { "epoch": 0.5124740124740125, "grad_norm": 21.031352996826172, "learning_rate": 9.974589974589976e-06, "loss": 0.6349, "step": 493 }, { "epoch": 0.5135135135135135, "grad_norm": 6.328756332397461, "learning_rate": 9.972279972279974e-06, "loss": 0.0946, "step": 494 }, { "epoch": 0.5145530145530145, "grad_norm": 14.493115425109863, "learning_rate": 9.96996996996997e-06, "loss": 0.2316, "step": 495 }, { "epoch": 0.5155925155925156, "grad_norm": 19.657384872436523, "learning_rate": 9.967659967659969e-06, "loss": 0.5465, "step": 496 }, { "epoch": 0.5166320166320166, "grad_norm": 9.537667274475098, "learning_rate": 9.965349965349967e-06, "loss": 0.1388, "step": 497 }, { "epoch": 0.5176715176715176, "grad_norm": 4.219407081604004, "learning_rate": 9.963039963039965e-06, "loss": 0.1253, "step": 498 }, { "epoch": 0.5187110187110187, "grad_norm": 9.209030151367188, "learning_rate": 9.960729960729962e-06, "loss": 0.1181, "step": 499 }, { "epoch": 0.5197505197505198, "grad_norm": 4.694278717041016, "learning_rate": 9.95841995841996e-06, "loss": 0.1103, "step": 500 }, { "epoch": 0.5207900207900208, "grad_norm": 16.673931121826172, 
"learning_rate": 9.956109956109958e-06, "loss": 0.5307, "step": 501 }, { "epoch": 0.5218295218295218, "grad_norm": 0.8699054718017578, "learning_rate": 9.953799953799954e-06, "loss": 0.0086, "step": 502 }, { "epoch": 0.5228690228690228, "grad_norm": 7.444122791290283, "learning_rate": 9.951489951489953e-06, "loss": 0.2038, "step": 503 }, { "epoch": 0.5239085239085239, "grad_norm": 20.159868240356445, "learning_rate": 9.949179949179951e-06, "loss": 0.4744, "step": 504 }, { "epoch": 0.524948024948025, "grad_norm": 21.489547729492188, "learning_rate": 9.946869946869947e-06, "loss": 0.688, "step": 505 }, { "epoch": 0.525987525987526, "grad_norm": 31.612947463989258, "learning_rate": 9.944559944559946e-06, "loss": 1.2621, "step": 506 }, { "epoch": 0.527027027027027, "grad_norm": 14.285310745239258, "learning_rate": 9.942249942249944e-06, "loss": 0.2198, "step": 507 }, { "epoch": 0.5280665280665281, "grad_norm": 18.622798919677734, "learning_rate": 9.93993993993994e-06, "loss": 0.6707, "step": 508 }, { "epoch": 0.5291060291060291, "grad_norm": 8.829988479614258, "learning_rate": 9.937629937629938e-06, "loss": 0.2278, "step": 509 }, { "epoch": 0.5301455301455301, "grad_norm": 17.411701202392578, "learning_rate": 9.935319935319937e-06, "loss": 0.3917, "step": 510 }, { "epoch": 0.5311850311850311, "grad_norm": 10.52638053894043, "learning_rate": 9.933009933009933e-06, "loss": 0.317, "step": 511 }, { "epoch": 0.5322245322245323, "grad_norm": 9.510506629943848, "learning_rate": 9.930699930699931e-06, "loss": 0.1242, "step": 512 }, { "epoch": 0.5332640332640333, "grad_norm": 9.235204696655273, "learning_rate": 9.92838992838993e-06, "loss": 0.1194, "step": 513 }, { "epoch": 0.5343035343035343, "grad_norm": 2.6380372047424316, "learning_rate": 9.926079926079926e-06, "loss": 0.0322, "step": 514 }, { "epoch": 0.5353430353430353, "grad_norm": 8.815104484558105, "learning_rate": 9.923769923769924e-06, "loss": 0.2188, "step": 515 }, { "epoch": 0.5363825363825364, "grad_norm": 
13.807114601135254, "learning_rate": 9.921459921459922e-06, "loss": 0.3807, "step": 516 }, { "epoch": 0.5374220374220374, "grad_norm": 2.663893222808838, "learning_rate": 9.919149919149919e-06, "loss": 0.0458, "step": 517 }, { "epoch": 0.5384615384615384, "grad_norm": 1.582839846611023, "learning_rate": 9.916839916839917e-06, "loss": 0.0466, "step": 518 }, { "epoch": 0.5395010395010394, "grad_norm": 15.220515251159668, "learning_rate": 9.914529914529915e-06, "loss": 0.3472, "step": 519 }, { "epoch": 0.5405405405405406, "grad_norm": 14.371718406677246, "learning_rate": 9.912219912219912e-06, "loss": 0.2532, "step": 520 }, { "epoch": 0.5415800415800416, "grad_norm": 7.996894359588623, "learning_rate": 9.90990990990991e-06, "loss": 0.1664, "step": 521 }, { "epoch": 0.5426195426195426, "grad_norm": 20.90623664855957, "learning_rate": 9.907599907599908e-06, "loss": 0.8354, "step": 522 }, { "epoch": 0.5436590436590436, "grad_norm": 13.043829917907715, "learning_rate": 9.905289905289906e-06, "loss": 0.1403, "step": 523 }, { "epoch": 0.5446985446985447, "grad_norm": 6.000689506530762, "learning_rate": 9.902979902979903e-06, "loss": 0.0659, "step": 524 }, { "epoch": 0.5457380457380457, "grad_norm": 19.005416870117188, "learning_rate": 9.900669900669901e-06, "loss": 0.783, "step": 525 }, { "epoch": 0.5467775467775468, "grad_norm": 9.024373054504395, "learning_rate": 9.8983598983599e-06, "loss": 0.1836, "step": 526 }, { "epoch": 0.5478170478170478, "grad_norm": 18.351057052612305, "learning_rate": 9.896049896049897e-06, "loss": 0.277, "step": 527 }, { "epoch": 0.5488565488565489, "grad_norm": 14.205464363098145, "learning_rate": 9.893739893739894e-06, "loss": 0.3695, "step": 528 }, { "epoch": 0.5498960498960499, "grad_norm": 15.938841819763184, "learning_rate": 9.891429891429892e-06, "loss": 0.5653, "step": 529 }, { "epoch": 0.5509355509355509, "grad_norm": 11.979267120361328, "learning_rate": 9.88911988911989e-06, "loss": 0.2696, "step": 530 }, { "epoch": 0.5519750519750519, 
"grad_norm": 22.4202880859375, "learning_rate": 9.886809886809887e-06, "loss": 0.899, "step": 531 }, { "epoch": 0.553014553014553, "grad_norm": 11.355657577514648, "learning_rate": 9.884499884499885e-06, "loss": 0.2037, "step": 532 }, { "epoch": 0.5540540540540541, "grad_norm": 3.479240655899048, "learning_rate": 9.882189882189883e-06, "loss": 0.0641, "step": 533 }, { "epoch": 0.5550935550935551, "grad_norm": 4.595727920532227, "learning_rate": 9.879879879879881e-06, "loss": 0.0648, "step": 534 }, { "epoch": 0.5561330561330561, "grad_norm": 10.914381980895996, "learning_rate": 9.877569877569878e-06, "loss": 0.2337, "step": 535 }, { "epoch": 0.5571725571725572, "grad_norm": 16.96885108947754, "learning_rate": 9.875259875259876e-06, "loss": 0.5575, "step": 536 }, { "epoch": 0.5582120582120582, "grad_norm": 11.56965160369873, "learning_rate": 9.872949872949874e-06, "loss": 0.2081, "step": 537 }, { "epoch": 0.5592515592515592, "grad_norm": 5.558108806610107, "learning_rate": 9.87063987063987e-06, "loss": 0.0941, "step": 538 }, { "epoch": 0.5602910602910602, "grad_norm": 27.092920303344727, "learning_rate": 9.868329868329869e-06, "loss": 1.0123, "step": 539 }, { "epoch": 0.5613305613305614, "grad_norm": 13.499934196472168, "learning_rate": 9.866019866019867e-06, "loss": 0.2519, "step": 540 }, { "epoch": 0.5623700623700624, "grad_norm": 1.136060118675232, "learning_rate": 9.863709863709865e-06, "loss": 0.0161, "step": 541 }, { "epoch": 0.5634095634095634, "grad_norm": 13.566397666931152, "learning_rate": 9.861399861399862e-06, "loss": 0.3632, "step": 542 }, { "epoch": 0.5644490644490644, "grad_norm": 8.34858512878418, "learning_rate": 9.85908985908986e-06, "loss": 0.1934, "step": 543 }, { "epoch": 0.5654885654885655, "grad_norm": 12.781171798706055, "learning_rate": 9.856779856779858e-06, "loss": 0.185, "step": 544 }, { "epoch": 0.5665280665280665, "grad_norm": 15.231748580932617, "learning_rate": 9.854469854469856e-06, "loss": 0.4749, "step": 545 }, { "epoch": 
0.5675675675675675, "grad_norm": 9.513189315795898, "learning_rate": 9.852159852159853e-06, "loss": 0.2114, "step": 546 }, { "epoch": 0.5686070686070686, "grad_norm": 3.4976556301116943, "learning_rate": 9.849849849849851e-06, "loss": 0.0485, "step": 547 }, { "epoch": 0.5696465696465697, "grad_norm": 16.309349060058594, "learning_rate": 9.84753984753985e-06, "loss": 0.2677, "step": 548 }, { "epoch": 0.5706860706860707, "grad_norm": 14.880147933959961, "learning_rate": 9.845229845229846e-06, "loss": 0.4456, "step": 549 }, { "epoch": 0.5717255717255717, "grad_norm": 16.281221389770508, "learning_rate": 9.842919842919844e-06, "loss": 0.4056, "step": 550 }, { "epoch": 0.5727650727650727, "grad_norm": 20.214174270629883, "learning_rate": 9.840609840609842e-06, "loss": 0.4839, "step": 551 }, { "epoch": 0.5738045738045738, "grad_norm": 22.193727493286133, "learning_rate": 9.83829983829984e-06, "loss": 1.3147, "step": 552 }, { "epoch": 0.5748440748440748, "grad_norm": 18.85350227355957, "learning_rate": 9.835989835989837e-06, "loss": 0.3605, "step": 553 }, { "epoch": 0.5758835758835759, "grad_norm": 10.782029151916504, "learning_rate": 9.833679833679835e-06, "loss": 0.202, "step": 554 }, { "epoch": 0.5769230769230769, "grad_norm": 12.252769470214844, "learning_rate": 9.831369831369833e-06, "loss": 0.2616, "step": 555 }, { "epoch": 0.577962577962578, "grad_norm": 15.310741424560547, "learning_rate": 9.82905982905983e-06, "loss": 0.3184, "step": 556 }, { "epoch": 0.579002079002079, "grad_norm": 1.3405869007110596, "learning_rate": 9.826749826749828e-06, "loss": 0.0113, "step": 557 }, { "epoch": 0.58004158004158, "grad_norm": 5.590155124664307, "learning_rate": 9.824439824439826e-06, "loss": 0.1065, "step": 558 }, { "epoch": 0.581081081081081, "grad_norm": 16.842506408691406, "learning_rate": 9.822129822129823e-06, "loss": 0.3817, "step": 559 }, { "epoch": 0.5821205821205822, "grad_norm": 10.099876403808594, "learning_rate": 9.81981981981982e-06, "loss": 0.2883, "step": 560 
}, { "epoch": 0.5831600831600832, "grad_norm": 21.479961395263672, "learning_rate": 9.817509817509819e-06, "loss": 1.5162, "step": 561 }, { "epoch": 0.5841995841995842, "grad_norm": 7.661498546600342, "learning_rate": 9.815199815199815e-06, "loss": 0.1604, "step": 562 }, { "epoch": 0.5852390852390852, "grad_norm": 11.383598327636719, "learning_rate": 9.812889812889814e-06, "loss": 0.2646, "step": 563 }, { "epoch": 0.5862785862785863, "grad_norm": 20.18816375732422, "learning_rate": 9.810579810579812e-06, "loss": 0.1702, "step": 564 }, { "epoch": 0.5873180873180873, "grad_norm": 16.200801849365234, "learning_rate": 9.808269808269808e-06, "loss": 1.7284, "step": 565 }, { "epoch": 0.5883575883575883, "grad_norm": 3.830944776535034, "learning_rate": 9.805959805959806e-06, "loss": 0.0424, "step": 566 }, { "epoch": 0.5893970893970893, "grad_norm": 13.652180671691895, "learning_rate": 9.803649803649805e-06, "loss": 0.3261, "step": 567 }, { "epoch": 0.5904365904365905, "grad_norm": 1.004449486732483, "learning_rate": 9.801339801339801e-06, "loss": 0.0188, "step": 568 }, { "epoch": 0.5914760914760915, "grad_norm": 1.3249083757400513, "learning_rate": 9.7990297990298e-06, "loss": 0.0224, "step": 569 }, { "epoch": 0.5925155925155925, "grad_norm": 1.8351414203643799, "learning_rate": 9.796719796719798e-06, "loss": 0.0229, "step": 570 }, { "epoch": 0.5935550935550935, "grad_norm": 3.9487857818603516, "learning_rate": 9.794409794409794e-06, "loss": 0.2799, "step": 571 }, { "epoch": 0.5945945945945946, "grad_norm": 3.0859389305114746, "learning_rate": 9.792099792099792e-06, "loss": 0.0573, "step": 572 }, { "epoch": 0.5956340956340956, "grad_norm": 17.17660903930664, "learning_rate": 9.78978978978979e-06, "loss": 0.6055, "step": 573 }, { "epoch": 0.5966735966735967, "grad_norm": 0.18140429258346558, "learning_rate": 9.787479787479787e-06, "loss": 0.0015, "step": 574 }, { "epoch": 0.5977130977130977, "grad_norm": 29.232139587402344, "learning_rate": 9.785169785169785e-06, "loss": 
1.2985, "step": 575 }, { "epoch": 0.5987525987525988, "grad_norm": 19.850553512573242, "learning_rate": 9.782859782859783e-06, "loss": 0.5502, "step": 576 }, { "epoch": 0.5997920997920998, "grad_norm": 11.339086532592773, "learning_rate": 9.780549780549782e-06, "loss": 0.2392, "step": 577 }, { "epoch": 0.6008316008316008, "grad_norm": 27.784828186035156, "learning_rate": 9.778239778239778e-06, "loss": 1.231, "step": 578 }, { "epoch": 0.6018711018711018, "grad_norm": 8.059019088745117, "learning_rate": 9.775929775929776e-06, "loss": 0.1051, "step": 579 }, { "epoch": 0.6029106029106029, "grad_norm": 22.404436111450195, "learning_rate": 9.773619773619774e-06, "loss": 0.8404, "step": 580 }, { "epoch": 0.603950103950104, "grad_norm": 19.912567138671875, "learning_rate": 9.771309771309773e-06, "loss": 1.4489, "step": 581 }, { "epoch": 0.604989604989605, "grad_norm": 15.487265586853027, "learning_rate": 9.768999768999769e-06, "loss": 0.63, "step": 582 }, { "epoch": 0.606029106029106, "grad_norm": 21.618928909301758, "learning_rate": 9.766689766689767e-06, "loss": 1.554, "step": 583 }, { "epoch": 0.6070686070686071, "grad_norm": 13.416857719421387, "learning_rate": 9.764379764379765e-06, "loss": 0.1677, "step": 584 }, { "epoch": 0.6081081081081081, "grad_norm": 2.621568441390991, "learning_rate": 9.762069762069762e-06, "loss": 0.0388, "step": 585 }, { "epoch": 0.6091476091476091, "grad_norm": 9.277111053466797, "learning_rate": 9.75975975975976e-06, "loss": 0.2088, "step": 586 }, { "epoch": 0.6101871101871101, "grad_norm": 4.179928779602051, "learning_rate": 9.757449757449758e-06, "loss": 0.0586, "step": 587 }, { "epoch": 0.6112266112266113, "grad_norm": 6.063162326812744, "learning_rate": 9.755139755139757e-06, "loss": 0.1165, "step": 588 }, { "epoch": 0.6122661122661123, "grad_norm": 10.404489517211914, "learning_rate": 9.752829752829753e-06, "loss": 0.2205, "step": 589 }, { "epoch": 0.6133056133056133, "grad_norm": 9.35464859008789, "learning_rate": 
9.750519750519751e-06, "loss": 0.1712, "step": 590 }, { "epoch": 0.6143451143451143, "grad_norm": 11.338526725769043, "learning_rate": 9.74820974820975e-06, "loss": 0.3342, "step": 591 }, { "epoch": 0.6153846153846154, "grad_norm": 10.987810134887695, "learning_rate": 9.745899745899746e-06, "loss": 0.1355, "step": 592 }, { "epoch": 0.6164241164241164, "grad_norm": 18.870922088623047, "learning_rate": 9.743589743589744e-06, "loss": 0.8299, "step": 593 }, { "epoch": 0.6174636174636174, "grad_norm": 18.805932998657227, "learning_rate": 9.741279741279742e-06, "loss": 0.6295, "step": 594 }, { "epoch": 0.6185031185031185, "grad_norm": 32.01840591430664, "learning_rate": 9.73896973896974e-06, "loss": 2.1948, "step": 595 }, { "epoch": 0.6195426195426196, "grad_norm": 16.707277297973633, "learning_rate": 9.736659736659737e-06, "loss": 0.3787, "step": 596 }, { "epoch": 0.6205821205821206, "grad_norm": 7.447889804840088, "learning_rate": 9.734349734349735e-06, "loss": 0.1948, "step": 597 }, { "epoch": 0.6216216216216216, "grad_norm": 18.314401626586914, "learning_rate": 9.732039732039733e-06, "loss": 1.1807, "step": 598 }, { "epoch": 0.6226611226611226, "grad_norm": 2.9496583938598633, "learning_rate": 9.729729729729732e-06, "loss": 0.0373, "step": 599 }, { "epoch": 0.6237006237006237, "grad_norm": 7.875022888183594, "learning_rate": 9.727419727419728e-06, "loss": 0.1404, "step": 600 }, { "epoch": 0.6247401247401247, "grad_norm": 22.426799774169922, "learning_rate": 9.725109725109726e-06, "loss": 0.5067, "step": 601 }, { "epoch": 0.6257796257796258, "grad_norm": 2.4129889011383057, "learning_rate": 9.722799722799724e-06, "loss": 0.0515, "step": 602 }, { "epoch": 0.6268191268191268, "grad_norm": 0.15341880917549133, "learning_rate": 9.720489720489721e-06, "loss": 0.0028, "step": 603 }, { "epoch": 0.6278586278586279, "grad_norm": 3.9162347316741943, "learning_rate": 9.718179718179719e-06, "loss": 0.0555, "step": 604 }, { "epoch": 0.6288981288981289, "grad_norm": 
7.17239236831665, "learning_rate": 9.715869715869717e-06, "loss": 0.1233, "step": 605 }, { "epoch": 0.6299376299376299, "grad_norm": 2.595475673675537, "learning_rate": 9.713559713559715e-06, "loss": 0.0454, "step": 606 }, { "epoch": 0.6309771309771309, "grad_norm": 19.02918243408203, "learning_rate": 9.711249711249712e-06, "loss": 0.4381, "step": 607 }, { "epoch": 0.632016632016632, "grad_norm": 12.635756492614746, "learning_rate": 9.70893970893971e-06, "loss": 0.221, "step": 608 }, { "epoch": 0.6330561330561331, "grad_norm": 19.13248634338379, "learning_rate": 9.706629706629708e-06, "loss": 0.5459, "step": 609 }, { "epoch": 0.6340956340956341, "grad_norm": 19.797473907470703, "learning_rate": 9.704319704319705e-06, "loss": 1.0616, "step": 610 }, { "epoch": 0.6351351351351351, "grad_norm": 8.363163948059082, "learning_rate": 9.702009702009703e-06, "loss": 0.3124, "step": 611 }, { "epoch": 0.6361746361746362, "grad_norm": 14.620701789855957, "learning_rate": 9.699699699699701e-06, "loss": 0.9523, "step": 612 }, { "epoch": 0.6372141372141372, "grad_norm": 0.11327558010816574, "learning_rate": 9.6973896973897e-06, "loss": 0.0025, "step": 613 }, { "epoch": 0.6382536382536382, "grad_norm": 7.28572416305542, "learning_rate": 9.695079695079696e-06, "loss": 0.1897, "step": 614 }, { "epoch": 0.6392931392931392, "grad_norm": 14.20917797088623, "learning_rate": 9.692769692769694e-06, "loss": 0.7104, "step": 615 }, { "epoch": 0.6403326403326404, "grad_norm": 3.038438320159912, "learning_rate": 9.690459690459692e-06, "loss": 0.0491, "step": 616 }, { "epoch": 0.6413721413721414, "grad_norm": 2.8274102210998535, "learning_rate": 9.688149688149689e-06, "loss": 0.0379, "step": 617 }, { "epoch": 0.6424116424116424, "grad_norm": 8.627979278564453, "learning_rate": 9.685839685839687e-06, "loss": 0.153, "step": 618 }, { "epoch": 0.6434511434511434, "grad_norm": 11.217168807983398, "learning_rate": 9.683529683529685e-06, "loss": 0.2238, "step": 619 }, { "epoch": 0.6444906444906445, 
"grad_norm": 8.23984432220459, "learning_rate": 9.681219681219682e-06, "loss": 0.1156, "step": 620 }, { "epoch": 0.6455301455301455, "grad_norm": 1.6440142393112183, "learning_rate": 9.67890967890968e-06, "loss": 0.021, "step": 621 }, { "epoch": 0.6465696465696466, "grad_norm": 1.043070673942566, "learning_rate": 9.676599676599678e-06, "loss": 0.0115, "step": 622 }, { "epoch": 0.6476091476091476, "grad_norm": 10.603458404541016, "learning_rate": 9.674289674289675e-06, "loss": 0.2493, "step": 623 }, { "epoch": 0.6486486486486487, "grad_norm": 11.63265609741211, "learning_rate": 9.671979671979673e-06, "loss": 0.3605, "step": 624 }, { "epoch": 0.6496881496881497, "grad_norm": 21.488149642944336, "learning_rate": 9.669669669669671e-06, "loss": 1.1281, "step": 625 }, { "epoch": 0.6507276507276507, "grad_norm": 5.428545951843262, "learning_rate": 9.667359667359667e-06, "loss": 0.0537, "step": 626 }, { "epoch": 0.6517671517671517, "grad_norm": 13.269102096557617, "learning_rate": 9.665049665049666e-06, "loss": 0.336, "step": 627 }, { "epoch": 0.6528066528066528, "grad_norm": 6.222421646118164, "learning_rate": 9.662739662739664e-06, "loss": 0.0742, "step": 628 }, { "epoch": 0.6538461538461539, "grad_norm": 2.2896013259887695, "learning_rate": 9.66042966042966e-06, "loss": 0.0411, "step": 629 }, { "epoch": 0.6548856548856549, "grad_norm": 14.19056224822998, "learning_rate": 9.658119658119659e-06, "loss": 0.4373, "step": 630 }, { "epoch": 0.6559251559251559, "grad_norm": 14.150191307067871, "learning_rate": 9.655809655809657e-06, "loss": 0.5048, "step": 631 }, { "epoch": 0.656964656964657, "grad_norm": 8.00671672821045, "learning_rate": 9.653499653499653e-06, "loss": 0.1181, "step": 632 }, { "epoch": 0.658004158004158, "grad_norm": 14.54723072052002, "learning_rate": 9.651189651189651e-06, "loss": 0.9277, "step": 633 }, { "epoch": 0.659043659043659, "grad_norm": 0.5207761526107788, "learning_rate": 9.64887964887965e-06, "loss": 0.0086, "step": 634 }, { "epoch": 
0.66008316008316, "grad_norm": 15.758940696716309, "learning_rate": 9.646569646569648e-06, "loss": 0.226, "step": 635 }, { "epoch": 0.6611226611226612, "grad_norm": 17.640363693237305, "learning_rate": 9.644259644259644e-06, "loss": 0.4552, "step": 636 }, { "epoch": 0.6621621621621622, "grad_norm": 17.001461029052734, "learning_rate": 9.641949641949642e-06, "loss": 0.5642, "step": 637 }, { "epoch": 0.6632016632016632, "grad_norm": 14.583857536315918, "learning_rate": 9.63963963963964e-06, "loss": 0.2794, "step": 638 }, { "epoch": 0.6642411642411642, "grad_norm": 16.399160385131836, "learning_rate": 9.637329637329637e-06, "loss": 0.5423, "step": 639 }, { "epoch": 0.6652806652806653, "grad_norm": 65.82854461669922, "learning_rate": 9.635019635019635e-06, "loss": 0.6206, "step": 640 }, { "epoch": 0.6663201663201663, "grad_norm": 0.8936071991920471, "learning_rate": 9.632709632709634e-06, "loss": 0.0147, "step": 641 }, { "epoch": 0.6673596673596673, "grad_norm": 8.22929859161377, "learning_rate": 9.630399630399632e-06, "loss": 0.1999, "step": 642 }, { "epoch": 0.6683991683991684, "grad_norm": 4.461599349975586, "learning_rate": 9.628089628089628e-06, "loss": 0.0999, "step": 643 }, { "epoch": 0.6694386694386695, "grad_norm": 6.589395046234131, "learning_rate": 9.625779625779626e-06, "loss": 0.1157, "step": 644 }, { "epoch": 0.6704781704781705, "grad_norm": 10.626900672912598, "learning_rate": 9.623469623469625e-06, "loss": 0.3657, "step": 645 }, { "epoch": 0.6715176715176715, "grad_norm": 21.5227108001709, "learning_rate": 9.621159621159621e-06, "loss": 0.9061, "step": 646 }, { "epoch": 0.6725571725571725, "grad_norm": 30.95146942138672, "learning_rate": 9.61884961884962e-06, "loss": 1.335, "step": 647 }, { "epoch": 0.6735966735966736, "grad_norm": 9.175050735473633, "learning_rate": 9.616539616539617e-06, "loss": 0.1422, "step": 648 }, { "epoch": 0.6746361746361746, "grad_norm": 14.263113975524902, "learning_rate": 9.614229614229616e-06, "loss": 1.1993, "step": 649 }, 
{ "epoch": 0.6756756756756757, "grad_norm": 5.766702175140381, "learning_rate": 9.611919611919612e-06, "loss": 0.1147, "step": 650 }, { "epoch": 0.6767151767151767, "grad_norm": 10.668768882751465, "learning_rate": 9.60960960960961e-06, "loss": 0.4003, "step": 651 }, { "epoch": 0.6777546777546778, "grad_norm": 10.931303024291992, "learning_rate": 9.607299607299609e-06, "loss": 0.1814, "step": 652 }, { "epoch": 0.6787941787941788, "grad_norm": 4.092346668243408, "learning_rate": 9.604989604989607e-06, "loss": 0.1016, "step": 653 }, { "epoch": 0.6798336798336798, "grad_norm": 5.187844753265381, "learning_rate": 9.602679602679603e-06, "loss": 0.1261, "step": 654 }, { "epoch": 0.6808731808731808, "grad_norm": 21.333641052246094, "learning_rate": 9.600369600369601e-06, "loss": 0.782, "step": 655 }, { "epoch": 0.681912681912682, "grad_norm": 19.176406860351562, "learning_rate": 9.5980595980596e-06, "loss": 0.9153, "step": 656 }, { "epoch": 0.682952182952183, "grad_norm": 8.19583797454834, "learning_rate": 9.595749595749596e-06, "loss": 0.1717, "step": 657 }, { "epoch": 0.683991683991684, "grad_norm": 6.486777305603027, "learning_rate": 9.593439593439594e-06, "loss": 0.1231, "step": 658 }, { "epoch": 0.685031185031185, "grad_norm": 0.35394906997680664, "learning_rate": 9.591129591129592e-06, "loss": 0.0059, "step": 659 }, { "epoch": 0.6860706860706861, "grad_norm": 8.42319393157959, "learning_rate": 9.58881958881959e-06, "loss": 0.2822, "step": 660 }, { "epoch": 0.6871101871101871, "grad_norm": 19.60753059387207, "learning_rate": 9.586509586509587e-06, "loss": 0.6453, "step": 661 }, { "epoch": 0.6881496881496881, "grad_norm": 5.556179523468018, "learning_rate": 9.584199584199585e-06, "loss": 0.0812, "step": 662 }, { "epoch": 0.6891891891891891, "grad_norm": 11.209933280944824, "learning_rate": 9.581889581889584e-06, "loss": 0.3448, "step": 663 }, { "epoch": 0.6902286902286903, "grad_norm": 19.909692764282227, "learning_rate": 9.57957957957958e-06, "loss": 1.5944, "step": 
664 }, { "epoch": 0.6912681912681913, "grad_norm": 11.300826072692871, "learning_rate": 9.577269577269578e-06, "loss": 0.4473, "step": 665 }, { "epoch": 0.6923076923076923, "grad_norm": 12.868157386779785, "learning_rate": 9.574959574959576e-06, "loss": 0.3579, "step": 666 }, { "epoch": 0.6933471933471933, "grad_norm": 13.064730644226074, "learning_rate": 9.572649572649575e-06, "loss": 0.5545, "step": 667 }, { "epoch": 0.6943866943866944, "grad_norm": 12.726256370544434, "learning_rate": 9.570339570339571e-06, "loss": 0.2416, "step": 668 }, { "epoch": 0.6954261954261954, "grad_norm": 5.459465980529785, "learning_rate": 9.56802956802957e-06, "loss": 0.0982, "step": 669 }, { "epoch": 0.6964656964656964, "grad_norm": 19.966798782348633, "learning_rate": 9.565719565719567e-06, "loss": 0.8197, "step": 670 }, { "epoch": 0.6975051975051975, "grad_norm": 12.257415771484375, "learning_rate": 9.563409563409564e-06, "loss": 0.6301, "step": 671 }, { "epoch": 0.6985446985446986, "grad_norm": 10.835104942321777, "learning_rate": 9.561099561099562e-06, "loss": 0.5715, "step": 672 }, { "epoch": 0.6995841995841996, "grad_norm": 14.587878227233887, "learning_rate": 9.55878955878956e-06, "loss": 0.2015, "step": 673 }, { "epoch": 0.7006237006237006, "grad_norm": 3.3721587657928467, "learning_rate": 9.556479556479557e-06, "loss": 0.0399, "step": 674 }, { "epoch": 0.7016632016632016, "grad_norm": 6.011256217956543, "learning_rate": 9.554169554169555e-06, "loss": 0.0979, "step": 675 }, { "epoch": 0.7027027027027027, "grad_norm": 28.068397521972656, "learning_rate": 9.551859551859553e-06, "loss": 0.4636, "step": 676 }, { "epoch": 0.7037422037422038, "grad_norm": 9.368738174438477, "learning_rate": 9.54954954954955e-06, "loss": 0.2166, "step": 677 }, { "epoch": 0.7047817047817048, "grad_norm": 13.36036205291748, "learning_rate": 9.547239547239548e-06, "loss": 0.3903, "step": 678 }, { "epoch": 0.7058212058212058, "grad_norm": 3.302922248840332, "learning_rate": 9.544929544929546e-06, 
"loss": 0.0396, "step": 679 }, { "epoch": 0.7068607068607069, "grad_norm": 8.110228538513184, "learning_rate": 9.542619542619543e-06, "loss": 0.1293, "step": 680 }, { "epoch": 0.7079002079002079, "grad_norm": 8.792784690856934, "learning_rate": 9.54030954030954e-06, "loss": 0.3419, "step": 681 }, { "epoch": 0.7089397089397089, "grad_norm": 8.643064498901367, "learning_rate": 9.537999537999539e-06, "loss": 0.2469, "step": 682 }, { "epoch": 0.7099792099792099, "grad_norm": 10.04169750213623, "learning_rate": 9.535689535689536e-06, "loss": 0.1695, "step": 683 }, { "epoch": 0.7110187110187111, "grad_norm": 6.619424343109131, "learning_rate": 9.533379533379534e-06, "loss": 0.2722, "step": 684 }, { "epoch": 0.7120582120582121, "grad_norm": 9.977747917175293, "learning_rate": 9.531069531069532e-06, "loss": 0.1084, "step": 685 }, { "epoch": 0.7130977130977131, "grad_norm": 7.454771518707275, "learning_rate": 9.528759528759528e-06, "loss": 0.1774, "step": 686 }, { "epoch": 0.7141372141372141, "grad_norm": 18.019685745239258, "learning_rate": 9.526449526449527e-06, "loss": 0.2778, "step": 687 }, { "epoch": 0.7151767151767152, "grad_norm": 15.221109390258789, "learning_rate": 9.524139524139525e-06, "loss": 0.6243, "step": 688 }, { "epoch": 0.7162162162162162, "grad_norm": 19.62790870666504, "learning_rate": 9.521829521829523e-06, "loss": 0.9515, "step": 689 }, { "epoch": 0.7172557172557172, "grad_norm": 13.727465629577637, "learning_rate": 9.51951951951952e-06, "loss": 0.2306, "step": 690 }, { "epoch": 0.7182952182952183, "grad_norm": 25.633102416992188, "learning_rate": 9.517209517209518e-06, "loss": 2.2608, "step": 691 }, { "epoch": 0.7193347193347194, "grad_norm": 7.654299736022949, "learning_rate": 9.514899514899516e-06, "loss": 0.0771, "step": 692 }, { "epoch": 0.7203742203742204, "grad_norm": 10.022356986999512, "learning_rate": 9.512589512589512e-06, "loss": 0.4251, "step": 693 }, { "epoch": 0.7214137214137214, "grad_norm": 7.501932621002197, "learning_rate": 
9.51027951027951e-06, "loss": 0.2412, "step": 694 }, { "epoch": 0.7224532224532224, "grad_norm": 9.365520477294922, "learning_rate": 9.507969507969509e-06, "loss": 0.2679, "step": 695 }, { "epoch": 0.7234927234927235, "grad_norm": 1.676113247871399, "learning_rate": 9.505659505659507e-06, "loss": 0.0282, "step": 696 }, { "epoch": 0.7245322245322245, "grad_norm": 0.15936121344566345, "learning_rate": 9.503349503349503e-06, "loss": 0.0032, "step": 697 }, { "epoch": 0.7255717255717256, "grad_norm": 6.274758338928223, "learning_rate": 9.501039501039502e-06, "loss": 0.1157, "step": 698 }, { "epoch": 0.7266112266112266, "grad_norm": 16.684329986572266, "learning_rate": 9.4987294987295e-06, "loss": 1.1177, "step": 699 }, { "epoch": 0.7276507276507277, "grad_norm": 16.026220321655273, "learning_rate": 9.496419496419496e-06, "loss": 0.7498, "step": 700 }, { "epoch": 0.7286902286902287, "grad_norm": 9.84882640838623, "learning_rate": 9.494109494109494e-06, "loss": 0.2845, "step": 701 }, { "epoch": 0.7297297297297297, "grad_norm": 18.592073440551758, "learning_rate": 9.491799491799493e-06, "loss": 1.047, "step": 702 }, { "epoch": 0.7307692307692307, "grad_norm": 1.2101701498031616, "learning_rate": 9.489489489489491e-06, "loss": 0.0184, "step": 703 }, { "epoch": 0.7318087318087318, "grad_norm": 1.962371826171875, "learning_rate": 9.487179487179487e-06, "loss": 0.0255, "step": 704 }, { "epoch": 0.7328482328482329, "grad_norm": 8.123842239379883, "learning_rate": 9.484869484869486e-06, "loss": 0.1329, "step": 705 }, { "epoch": 0.7338877338877339, "grad_norm": 0.8697485327720642, "learning_rate": 9.482559482559484e-06, "loss": 0.008, "step": 706 }, { "epoch": 0.7349272349272349, "grad_norm": 12.17270278930664, "learning_rate": 9.480249480249482e-06, "loss": 0.3108, "step": 707 }, { "epoch": 0.735966735966736, "grad_norm": 10.54183578491211, "learning_rate": 9.477939477939478e-06, "loss": 0.4662, "step": 708 }, { "epoch": 0.737006237006237, "grad_norm": 28.778478622436523, 
"learning_rate": 9.475629475629477e-06, "loss": 0.3674, "step": 709 }, { "epoch": 0.738045738045738, "grad_norm": 7.773111820220947, "learning_rate": 9.473319473319475e-06, "loss": 0.2856, "step": 710 }, { "epoch": 0.739085239085239, "grad_norm": 4.889204502105713, "learning_rate": 9.471009471009471e-06, "loss": 0.0689, "step": 711 }, { "epoch": 0.7401247401247402, "grad_norm": 11.898750305175781, "learning_rate": 9.46869946869947e-06, "loss": 0.505, "step": 712 }, { "epoch": 0.7411642411642412, "grad_norm": 1.3363757133483887, "learning_rate": 9.466389466389468e-06, "loss": 0.021, "step": 713 }, { "epoch": 0.7422037422037422, "grad_norm": 9.92277717590332, "learning_rate": 9.464079464079466e-06, "loss": 0.2794, "step": 714 }, { "epoch": 0.7432432432432432, "grad_norm": 16.263818740844727, "learning_rate": 9.461769461769462e-06, "loss": 1.2749, "step": 715 }, { "epoch": 0.7442827442827443, "grad_norm": 4.112827777862549, "learning_rate": 9.45945945945946e-06, "loss": 0.0872, "step": 716 }, { "epoch": 0.7453222453222453, "grad_norm": 12.491456031799316, "learning_rate": 9.457149457149459e-06, "loss": 0.7282, "step": 717 }, { "epoch": 0.7463617463617463, "grad_norm": 8.743971824645996, "learning_rate": 9.454839454839455e-06, "loss": 0.1595, "step": 718 }, { "epoch": 0.7474012474012474, "grad_norm": 3.7295806407928467, "learning_rate": 9.452529452529453e-06, "loss": 0.0636, "step": 719 }, { "epoch": 0.7484407484407485, "grad_norm": 10.543453216552734, "learning_rate": 9.450219450219452e-06, "loss": 0.1802, "step": 720 }, { "epoch": 0.7494802494802495, "grad_norm": 3.609959840774536, "learning_rate": 9.44790944790945e-06, "loss": 0.1445, "step": 721 }, { "epoch": 0.7505197505197505, "grad_norm": 1.9326978921890259, "learning_rate": 9.445599445599446e-06, "loss": 0.0255, "step": 722 }, { "epoch": 0.7515592515592515, "grad_norm": 8.692984580993652, "learning_rate": 9.443289443289445e-06, "loss": 0.5295, "step": 723 }, { "epoch": 0.7525987525987526, "grad_norm": 
13.966374397277832, "learning_rate": 9.440979440979443e-06, "loss": 0.5148, "step": 724 }, { "epoch": 0.7536382536382537, "grad_norm": 5.916383743286133, "learning_rate": 9.43866943866944e-06, "loss": 0.1337, "step": 725 }, { "epoch": 0.7546777546777547, "grad_norm": 0.8488734364509583, "learning_rate": 9.436359436359437e-06, "loss": 0.0174, "step": 726 }, { "epoch": 0.7557172557172557, "grad_norm": 0.8473299145698547, "learning_rate": 9.434049434049436e-06, "loss": 0.016, "step": 727 }, { "epoch": 0.7567567567567568, "grad_norm": 0.2719501554965973, "learning_rate": 9.431739431739432e-06, "loss": 0.0036, "step": 728 }, { "epoch": 0.7577962577962578, "grad_norm": 12.92343807220459, "learning_rate": 9.42942942942943e-06, "loss": 0.5638, "step": 729 }, { "epoch": 0.7588357588357588, "grad_norm": 5.92317533493042, "learning_rate": 9.427119427119428e-06, "loss": 0.1339, "step": 730 }, { "epoch": 0.7598752598752598, "grad_norm": 14.713382720947266, "learning_rate": 9.424809424809425e-06, "loss": 0.3023, "step": 731 }, { "epoch": 0.760914760914761, "grad_norm": 8.398733139038086, "learning_rate": 9.422499422499423e-06, "loss": 0.3296, "step": 732 }, { "epoch": 0.761954261954262, "grad_norm": 7.095760822296143, "learning_rate": 9.420189420189421e-06, "loss": 0.1501, "step": 733 }, { "epoch": 0.762993762993763, "grad_norm": 9.981751441955566, "learning_rate": 9.417879417879418e-06, "loss": 0.5741, "step": 734 }, { "epoch": 0.764033264033264, "grad_norm": 0.5671398043632507, "learning_rate": 9.415569415569416e-06, "loss": 0.0087, "step": 735 }, { "epoch": 0.7650727650727651, "grad_norm": 17.42526626586914, "learning_rate": 9.413259413259414e-06, "loss": 0.6538, "step": 736 }, { "epoch": 0.7661122661122661, "grad_norm": 8.388923645019531, "learning_rate": 9.41094941094941e-06, "loss": 0.2159, "step": 737 }, { "epoch": 0.7671517671517671, "grad_norm": 14.789589881896973, "learning_rate": 9.408639408639409e-06, "loss": 0.2748, "step": 738 }, { "epoch": 0.7681912681912682, 
"grad_norm": 12.18989086151123, "learning_rate": 9.406329406329407e-06, "loss": 0.6697, "step": 739 }, { "epoch": 0.7692307692307693, "grad_norm": 21.005599975585938, "learning_rate": 9.404019404019404e-06, "loss": 1.4857, "step": 740 }, { "epoch": 0.7702702702702703, "grad_norm": 4.1699748039245605, "learning_rate": 9.401709401709402e-06, "loss": 0.0761, "step": 741 }, { "epoch": 0.7713097713097713, "grad_norm": 16.020122528076172, "learning_rate": 9.3993993993994e-06, "loss": 0.5824, "step": 742 }, { "epoch": 0.7723492723492723, "grad_norm": 14.067607879638672, "learning_rate": 9.397089397089398e-06, "loss": 0.4511, "step": 743 }, { "epoch": 0.7733887733887734, "grad_norm": 8.97590160369873, "learning_rate": 9.394779394779395e-06, "loss": 0.2776, "step": 744 }, { "epoch": 0.7744282744282744, "grad_norm": 5.633431911468506, "learning_rate": 9.392469392469393e-06, "loss": 0.1346, "step": 745 }, { "epoch": 0.7754677754677755, "grad_norm": 12.008380889892578, "learning_rate": 9.390159390159391e-06, "loss": 0.3179, "step": 746 }, { "epoch": 0.7765072765072765, "grad_norm": 2.2428348064422607, "learning_rate": 9.387849387849388e-06, "loss": 0.0619, "step": 747 }, { "epoch": 0.7775467775467776, "grad_norm": 17.27653694152832, "learning_rate": 9.385539385539386e-06, "loss": 0.814, "step": 748 }, { "epoch": 0.7785862785862786, "grad_norm": 4.97644567489624, "learning_rate": 9.383229383229384e-06, "loss": 0.0552, "step": 749 }, { "epoch": 0.7796257796257796, "grad_norm": 3.5207571983337402, "learning_rate": 9.380919380919382e-06, "loss": 0.038, "step": 750 }, { "epoch": 0.7806652806652806, "grad_norm": 4.021271228790283, "learning_rate": 9.378609378609379e-06, "loss": 0.0547, "step": 751 }, { "epoch": 0.7817047817047817, "grad_norm": 11.576969146728516, "learning_rate": 9.376299376299377e-06, "loss": 0.948, "step": 752 }, { "epoch": 0.7827442827442828, "grad_norm": 0.44165757298469543, "learning_rate": 9.373989373989375e-06, "loss": 0.0109, "step": 753 }, { "epoch": 
0.7837837837837838, "grad_norm": 7.726773262023926, "learning_rate": 9.371679371679371e-06, "loss": 0.2671, "step": 754 }, { "epoch": 0.7848232848232848, "grad_norm": 8.891253471374512, "learning_rate": 9.36936936936937e-06, "loss": 0.2311, "step": 755 }, { "epoch": 0.7858627858627859, "grad_norm": 11.3536958694458, "learning_rate": 9.367059367059368e-06, "loss": 0.6166, "step": 756 }, { "epoch": 0.7869022869022869, "grad_norm": 11.843234062194824, "learning_rate": 9.364749364749366e-06, "loss": 0.7811, "step": 757 }, { "epoch": 0.7879417879417879, "grad_norm": 14.331742286682129, "learning_rate": 9.362439362439363e-06, "loss": 0.5065, "step": 758 }, { "epoch": 0.7889812889812889, "grad_norm": 23.064044952392578, "learning_rate": 9.36012936012936e-06, "loss": 0.9797, "step": 759 }, { "epoch": 0.7900207900207901, "grad_norm": 3.3058671951293945, "learning_rate": 9.357819357819359e-06, "loss": 0.0457, "step": 760 }, { "epoch": 0.7910602910602911, "grad_norm": 8.004167556762695, "learning_rate": 9.355509355509357e-06, "loss": 0.1693, "step": 761 }, { "epoch": 0.7920997920997921, "grad_norm": 10.097188949584961, "learning_rate": 9.353199353199354e-06, "loss": 0.247, "step": 762 }, { "epoch": 0.7931392931392931, "grad_norm": 5.271298408508301, "learning_rate": 9.350889350889352e-06, "loss": 0.0974, "step": 763 }, { "epoch": 0.7941787941787942, "grad_norm": 0.3096296787261963, "learning_rate": 9.34857934857935e-06, "loss": 0.0059, "step": 764 }, { "epoch": 0.7952182952182952, "grad_norm": 11.955662727355957, "learning_rate": 9.346269346269346e-06, "loss": 0.6279, "step": 765 }, { "epoch": 0.7962577962577962, "grad_norm": 4.01197624206543, "learning_rate": 9.343959343959345e-06, "loss": 0.0092, "step": 766 }, { "epoch": 0.7972972972972973, "grad_norm": 1.3834837675094604, "learning_rate": 9.341649341649343e-06, "loss": 0.0126, "step": 767 }, { "epoch": 0.7983367983367984, "grad_norm": 10.295829772949219, "learning_rate": 9.339339339339341e-06, "loss": 0.2918, "step": 768 
}, { "epoch": 0.7993762993762994, "grad_norm": 5.462759494781494, "learning_rate": 9.337029337029338e-06, "loss": 0.1415, "step": 769 }, { "epoch": 0.8004158004158004, "grad_norm": 10.10068416595459, "learning_rate": 9.334719334719336e-06, "loss": 0.2449, "step": 770 }, { "epoch": 0.8014553014553014, "grad_norm": 13.373099327087402, "learning_rate": 9.332409332409334e-06, "loss": 0.2247, "step": 771 }, { "epoch": 0.8024948024948025, "grad_norm": 9.904016494750977, "learning_rate": 9.33009933009933e-06, "loss": 0.4769, "step": 772 }, { "epoch": 0.8035343035343036, "grad_norm": 5.0328898429870605, "learning_rate": 9.327789327789329e-06, "loss": 0.0871, "step": 773 }, { "epoch": 0.8045738045738046, "grad_norm": 2.7860841751098633, "learning_rate": 9.325479325479327e-06, "loss": 0.0474, "step": 774 }, { "epoch": 0.8056133056133056, "grad_norm": 1.0327810049057007, "learning_rate": 9.323169323169325e-06, "loss": 0.0124, "step": 775 }, { "epoch": 0.8066528066528067, "grad_norm": 20.533740997314453, "learning_rate": 9.320859320859322e-06, "loss": 0.5364, "step": 776 }, { "epoch": 0.8076923076923077, "grad_norm": 1.5147625207901, "learning_rate": 9.31854931854932e-06, "loss": 0.0465, "step": 777 }, { "epoch": 0.8087318087318087, "grad_norm": 2.101473331451416, "learning_rate": 9.316239316239318e-06, "loss": 0.042, "step": 778 }, { "epoch": 0.8097713097713097, "grad_norm": 13.39181137084961, "learning_rate": 9.313929313929314e-06, "loss": 0.3735, "step": 779 }, { "epoch": 0.8108108108108109, "grad_norm": 18.286285400390625, "learning_rate": 9.311619311619313e-06, "loss": 0.6235, "step": 780 }, { "epoch": 0.8118503118503119, "grad_norm": 19.759904861450195, "learning_rate": 9.30930930930931e-06, "loss": 0.722, "step": 781 }, { "epoch": 0.8128898128898129, "grad_norm": 12.798063278198242, "learning_rate": 9.306999306999307e-06, "loss": 0.3945, "step": 782 }, { "epoch": 0.8139293139293139, "grad_norm": 6.071016788482666, "learning_rate": 9.304689304689305e-06, "loss": 0.2655, 
"step": 783 }, { "epoch": 0.814968814968815, "grad_norm": 2.006742477416992, "learning_rate": 9.302379302379304e-06, "loss": 0.0316, "step": 784 }, { "epoch": 0.816008316008316, "grad_norm": 19.665082931518555, "learning_rate": 9.3000693000693e-06, "loss": 1.3005, "step": 785 }, { "epoch": 0.817047817047817, "grad_norm": 19.220882415771484, "learning_rate": 9.297759297759298e-06, "loss": 0.6756, "step": 786 }, { "epoch": 0.818087318087318, "grad_norm": 7.785876274108887, "learning_rate": 9.295449295449297e-06, "loss": 0.2178, "step": 787 }, { "epoch": 0.8191268191268192, "grad_norm": 11.736283302307129, "learning_rate": 9.293139293139293e-06, "loss": 0.1697, "step": 788 }, { "epoch": 0.8201663201663202, "grad_norm": 12.8834228515625, "learning_rate": 9.290829290829291e-06, "loss": 0.8898, "step": 789 }, { "epoch": 0.8212058212058212, "grad_norm": 7.315210819244385, "learning_rate": 9.28851928851929e-06, "loss": 0.1394, "step": 790 }, { "epoch": 0.8222453222453222, "grad_norm": 13.195050239562988, "learning_rate": 9.286209286209288e-06, "loss": 0.5235, "step": 791 }, { "epoch": 0.8232848232848233, "grad_norm": 4.913758754730225, "learning_rate": 9.283899283899284e-06, "loss": 0.1314, "step": 792 }, { "epoch": 0.8243243243243243, "grad_norm": 7.278382301330566, "learning_rate": 9.281589281589282e-06, "loss": 0.2362, "step": 793 }, { "epoch": 0.8253638253638254, "grad_norm": 8.85083293914795, "learning_rate": 9.27927927927928e-06, "loss": 0.236, "step": 794 }, { "epoch": 0.8264033264033264, "grad_norm": 11.661330223083496, "learning_rate": 9.276969276969277e-06, "loss": 0.4306, "step": 795 }, { "epoch": 0.8274428274428275, "grad_norm": 7.37078332901001, "learning_rate": 9.274659274659275e-06, "loss": 0.1825, "step": 796 }, { "epoch": 0.8284823284823285, "grad_norm": 12.864847183227539, "learning_rate": 9.272349272349273e-06, "loss": 0.6644, "step": 797 }, { "epoch": 0.8295218295218295, "grad_norm": 16.365703582763672, "learning_rate": 9.27003927003927e-06, "loss": 
0.9907, "step": 798 }, { "epoch": 0.8305613305613305, "grad_norm": 14.491663932800293, "learning_rate": 9.267729267729268e-06, "loss": 0.574, "step": 799 }, { "epoch": 0.8316008316008316, "grad_norm": 6.973012447357178, "learning_rate": 9.265419265419266e-06, "loss": 0.2341, "step": 800 }, { "epoch": 0.8326403326403327, "grad_norm": 0.363965779542923, "learning_rate": 9.263109263109263e-06, "loss": 0.0068, "step": 801 }, { "epoch": 0.8336798336798337, "grad_norm": 0.5364904999732971, "learning_rate": 9.260799260799261e-06, "loss": 0.0132, "step": 802 }, { "epoch": 0.8347193347193347, "grad_norm": 2.5990946292877197, "learning_rate": 9.258489258489259e-06, "loss": 0.0541, "step": 803 }, { "epoch": 0.8357588357588358, "grad_norm": 4.446463108062744, "learning_rate": 9.256179256179257e-06, "loss": 0.093, "step": 804 }, { "epoch": 0.8367983367983368, "grad_norm": 11.481074333190918, "learning_rate": 9.253869253869254e-06, "loss": 0.8781, "step": 805 }, { "epoch": 0.8378378378378378, "grad_norm": 5.719520092010498, "learning_rate": 9.251559251559252e-06, "loss": 0.1585, "step": 806 }, { "epoch": 0.8388773388773388, "grad_norm": 0.9901307821273804, "learning_rate": 9.24924924924925e-06, "loss": 0.0226, "step": 807 }, { "epoch": 0.83991683991684, "grad_norm": 6.784252166748047, "learning_rate": 9.246939246939247e-06, "loss": 0.1927, "step": 808 }, { "epoch": 0.840956340956341, "grad_norm": 0.0940762609243393, "learning_rate": 9.244629244629245e-06, "loss": 0.0017, "step": 809 }, { "epoch": 0.841995841995842, "grad_norm": 14.0625581741333, "learning_rate": 9.242319242319243e-06, "loss": 0.6236, "step": 810 }, { "epoch": 0.843035343035343, "grad_norm": 3.779634714126587, "learning_rate": 9.240009240009241e-06, "loss": 0.1269, "step": 811 }, { "epoch": 0.8440748440748441, "grad_norm": 8.7300443649292, "learning_rate": 9.237699237699238e-06, "loss": 0.1096, "step": 812 }, { "epoch": 0.8451143451143451, "grad_norm": 4.038877010345459, "learning_rate": 9.235389235389236e-06, 
"loss": 0.0703, "step": 813 }, { "epoch": 0.8461538461538461, "grad_norm": 22.447866439819336, "learning_rate": 9.233079233079234e-06, "loss": 1.1194, "step": 814 }, { "epoch": 0.8471933471933472, "grad_norm": 5.937042713165283, "learning_rate": 9.230769230769232e-06, "loss": 0.2543, "step": 815 }, { "epoch": 0.8482328482328483, "grad_norm": 0.08702496439218521, "learning_rate": 9.228459228459229e-06, "loss": 0.0019, "step": 816 }, { "epoch": 0.8492723492723493, "grad_norm": 9.366803169250488, "learning_rate": 9.226149226149227e-06, "loss": 0.3223, "step": 817 }, { "epoch": 0.8503118503118503, "grad_norm": 6.609611988067627, "learning_rate": 9.223839223839225e-06, "loss": 0.1015, "step": 818 }, { "epoch": 0.8513513513513513, "grad_norm": 5.29298210144043, "learning_rate": 9.221529221529222e-06, "loss": 0.3793, "step": 819 }, { "epoch": 0.8523908523908524, "grad_norm": 3.444406509399414, "learning_rate": 9.21921921921922e-06, "loss": 0.1107, "step": 820 }, { "epoch": 0.8534303534303534, "grad_norm": 1.0146065950393677, "learning_rate": 9.216909216909218e-06, "loss": 0.0188, "step": 821 }, { "epoch": 0.8544698544698545, "grad_norm": 0.4745628833770752, "learning_rate": 9.214599214599216e-06, "loss": 0.0088, "step": 822 }, { "epoch": 0.8555093555093555, "grad_norm": 16.4841365814209, "learning_rate": 9.212289212289213e-06, "loss": 0.7025, "step": 823 }, { "epoch": 0.8565488565488566, "grad_norm": 5.551940441131592, "learning_rate": 9.209979209979211e-06, "loss": 0.0577, "step": 824 }, { "epoch": 0.8575883575883576, "grad_norm": 5.3550262451171875, "learning_rate": 9.207669207669209e-06, "loss": 0.0957, "step": 825 }, { "epoch": 0.8586278586278586, "grad_norm": 18.20002555847168, "learning_rate": 9.205359205359206e-06, "loss": 1.2307, "step": 826 }, { "epoch": 0.8596673596673596, "grad_norm": 3.5077757835388184, "learning_rate": 9.203049203049204e-06, "loss": 0.0976, "step": 827 }, { "epoch": 0.8607068607068608, "grad_norm": 0.0986674576997757, "learning_rate": 
9.200739200739202e-06, "loss": 0.003, "step": 828 }, { "epoch": 0.8617463617463618, "grad_norm": 14.541893005371094, "learning_rate": 9.1984291984292e-06, "loss": 0.5262, "step": 829 }, { "epoch": 0.8627858627858628, "grad_norm": 10.445574760437012, "learning_rate": 9.196119196119197e-06, "loss": 0.3528, "step": 830 }, { "epoch": 0.8638253638253638, "grad_norm": 9.452934265136719, "learning_rate": 9.193809193809195e-06, "loss": 0.2406, "step": 831 }, { "epoch": 0.8648648648648649, "grad_norm": 10.794607162475586, "learning_rate": 9.191499191499193e-06, "loss": 0.2028, "step": 832 }, { "epoch": 0.8659043659043659, "grad_norm": 11.823083877563477, "learning_rate": 9.189189189189191e-06, "loss": 0.5005, "step": 833 }, { "epoch": 0.8669438669438669, "grad_norm": 12.307615280151367, "learning_rate": 9.186879186879188e-06, "loss": 0.2977, "step": 834 }, { "epoch": 0.867983367983368, "grad_norm": 12.70445728302002, "learning_rate": 9.184569184569186e-06, "loss": 0.5725, "step": 835 }, { "epoch": 0.8690228690228691, "grad_norm": 2.4809796810150146, "learning_rate": 9.182259182259184e-06, "loss": 0.0758, "step": 836 }, { "epoch": 0.8700623700623701, "grad_norm": 1.8290390968322754, "learning_rate": 9.17994917994918e-06, "loss": 0.0356, "step": 837 }, { "epoch": 0.8711018711018711, "grad_norm": 16.160789489746094, "learning_rate": 9.177639177639179e-06, "loss": 1.0367, "step": 838 }, { "epoch": 0.8721413721413721, "grad_norm": 14.823076248168945, "learning_rate": 9.175329175329177e-06, "loss": 0.603, "step": 839 }, { "epoch": 0.8731808731808732, "grad_norm": 16.44916534423828, "learning_rate": 9.173019173019174e-06, "loss": 0.2872, "step": 840 }, { "epoch": 0.8742203742203742, "grad_norm": 9.719271659851074, "learning_rate": 9.170709170709172e-06, "loss": 0.3966, "step": 841 }, { "epoch": 0.8752598752598753, "grad_norm": 0.07257251441478729, "learning_rate": 9.16839916839917e-06, "loss": 0.0012, "step": 842 }, { "epoch": 0.8762993762993763, "grad_norm": 7.706364631652832, 
"learning_rate": 9.166089166089166e-06, "loss": 0.2788, "step": 843 }, { "epoch": 0.8773388773388774, "grad_norm": 7.656033992767334, "learning_rate": 9.163779163779165e-06, "loss": 0.1487, "step": 844 }, { "epoch": 0.8783783783783784, "grad_norm": 2.814105749130249, "learning_rate": 9.161469161469163e-06, "loss": 0.031, "step": 845 }, { "epoch": 0.8794178794178794, "grad_norm": 7.749004364013672, "learning_rate": 9.15915915915916e-06, "loss": 0.0998, "step": 846 }, { "epoch": 0.8804573804573804, "grad_norm": 1.6006381511688232, "learning_rate": 9.156849156849157e-06, "loss": 0.0176, "step": 847 }, { "epoch": 0.8814968814968815, "grad_norm": 4.9961419105529785, "learning_rate": 9.154539154539156e-06, "loss": 0.0861, "step": 848 }, { "epoch": 0.8825363825363826, "grad_norm": 12.169288635253906, "learning_rate": 9.152229152229152e-06, "loss": 0.2653, "step": 849 }, { "epoch": 0.8835758835758836, "grad_norm": 14.644551277160645, "learning_rate": 9.14991914991915e-06, "loss": 0.5879, "step": 850 }, { "epoch": 0.8846153846153846, "grad_norm": 13.133461952209473, "learning_rate": 9.147609147609149e-06, "loss": 0.3398, "step": 851 }, { "epoch": 0.8856548856548857, "grad_norm": 8.79302978515625, "learning_rate": 9.145299145299145e-06, "loss": 0.3166, "step": 852 }, { "epoch": 0.8866943866943867, "grad_norm": 0.8859695792198181, "learning_rate": 9.142989142989143e-06, "loss": 0.0172, "step": 853 }, { "epoch": 0.8877338877338877, "grad_norm": 11.242350578308105, "learning_rate": 9.140679140679141e-06, "loss": 0.4063, "step": 854 }, { "epoch": 0.8887733887733887, "grad_norm": 9.41787052154541, "learning_rate": 9.138369138369138e-06, "loss": 0.7275, "step": 855 }, { "epoch": 0.8898128898128899, "grad_norm": 6.311247825622559, "learning_rate": 9.136059136059136e-06, "loss": 0.2133, "step": 856 }, { "epoch": 0.8908523908523909, "grad_norm": 9.024970054626465, "learning_rate": 9.133749133749134e-06, "loss": 0.2356, "step": 857 }, { "epoch": 0.8918918918918919, "grad_norm": 
12.511112213134766, "learning_rate": 9.131439131439132e-06, "loss": 0.6547, "step": 858 }, { "epoch": 0.8929313929313929, "grad_norm": 19.817020416259766, "learning_rate": 9.129129129129129e-06, "loss": 0.9193, "step": 859 }, { "epoch": 0.893970893970894, "grad_norm": 8.149579048156738, "learning_rate": 9.126819126819127e-06, "loss": 0.0618, "step": 860 }, { "epoch": 0.895010395010395, "grad_norm": 17.640111923217773, "learning_rate": 9.124509124509125e-06, "loss": 1.6197, "step": 861 }, { "epoch": 0.896049896049896, "grad_norm": 13.725643157958984, "learning_rate": 9.122199122199122e-06, "loss": 0.2125, "step": 862 }, { "epoch": 0.8970893970893971, "grad_norm": 13.83113956451416, "learning_rate": 9.11988911988912e-06, "loss": 0.3896, "step": 863 }, { "epoch": 0.8981288981288982, "grad_norm": 9.916857719421387, "learning_rate": 9.117579117579118e-06, "loss": 0.3564, "step": 864 }, { "epoch": 0.8991683991683992, "grad_norm": 20.230899810791016, "learning_rate": 9.115269115269116e-06, "loss": 0.4534, "step": 865 }, { "epoch": 0.9002079002079002, "grad_norm": 8.180204391479492, "learning_rate": 9.112959112959113e-06, "loss": 0.2119, "step": 866 }, { "epoch": 0.9012474012474012, "grad_norm": 14.04716968536377, "learning_rate": 9.110649110649111e-06, "loss": 0.936, "step": 867 }, { "epoch": 0.9022869022869023, "grad_norm": null, "learning_rate": 9.10833910833911e-06, "loss": 0.0104, "step": 868 }, { "epoch": 0.9033264033264033, "grad_norm": 3.089855909347534, "learning_rate": 9.106029106029107e-06, "loss": 0.0438, "step": 869 }, { "epoch": 0.9043659043659044, "grad_norm": 10.145646095275879, "learning_rate": 9.103719103719104e-06, "loss": 0.3031, "step": 870 }, { "epoch": 0.9054054054054054, "grad_norm": 7.578200817108154, "learning_rate": 9.101409101409102e-06, "loss": 0.2943, "step": 871 }, { "epoch": 0.9064449064449065, "grad_norm": 3.3656668663024902, "learning_rate": 9.0990990990991e-06, "loss": 0.0814, "step": 872 }, { "epoch": 0.9074844074844075, "grad_norm": 
4.186402320861816, "learning_rate": 9.096789096789097e-06, "loss": 0.0662, "step": 873 }, { "epoch": 0.9085239085239085, "grad_norm": 13.625324249267578, "learning_rate": 9.094479094479095e-06, "loss": 2.2787, "step": 874 }, { "epoch": 0.9095634095634095, "grad_norm": 12.998013496398926, "learning_rate": 9.092169092169093e-06, "loss": 0.5581, "step": 875 }, { "epoch": 0.9106029106029107, "grad_norm": 17.50330352783203, "learning_rate": 9.089859089859091e-06, "loss": 0.9294, "step": 876 }, { "epoch": 0.9116424116424117, "grad_norm": 7.164412975311279, "learning_rate": 9.087549087549088e-06, "loss": 0.1759, "step": 877 }, { "epoch": 0.9126819126819127, "grad_norm": 5.057459354400635, "learning_rate": 9.085239085239086e-06, "loss": 0.0994, "step": 878 }, { "epoch": 0.9137214137214137, "grad_norm": 12.146804809570312, "learning_rate": 9.082929082929084e-06, "loss": 0.5214, "step": 879 }, { "epoch": 0.9147609147609148, "grad_norm": 1.3866074085235596, "learning_rate": 9.08061908061908e-06, "loss": 0.0279, "step": 880 }, { "epoch": 0.9158004158004158, "grad_norm": 6.697347164154053, "learning_rate": 9.078309078309079e-06, "loss": 0.142, "step": 881 }, { "epoch": 0.9168399168399168, "grad_norm": 10.428801536560059, "learning_rate": 9.075999075999077e-06, "loss": 0.4985, "step": 882 }, { "epoch": 0.9178794178794178, "grad_norm": 10.876057624816895, "learning_rate": 9.073689073689075e-06, "loss": 0.4017, "step": 883 }, { "epoch": 0.918918918918919, "grad_norm": 7.627023696899414, "learning_rate": 9.071379071379072e-06, "loss": 0.3287, "step": 884 }, { "epoch": 0.91995841995842, "grad_norm": 12.741010665893555, "learning_rate": 9.06906906906907e-06, "loss": 0.4065, "step": 885 }, { "epoch": 0.920997920997921, "grad_norm": 12.223453521728516, "learning_rate": 9.066759066759068e-06, "loss": 0.5238, "step": 886 }, { "epoch": 0.922037422037422, "grad_norm": 11.803918838500977, "learning_rate": 9.064449064449066e-06, "loss": 0.5711, "step": 887 }, { "epoch": 0.9230769230769231, 
"grad_norm": 5.423978805541992, "learning_rate": 9.062139062139063e-06, "loss": 0.1786, "step": 888 }, { "epoch": 0.9241164241164241, "grad_norm": 12.600936889648438, "learning_rate": 9.059829059829061e-06, "loss": 0.4155, "step": 889 }, { "epoch": 0.9251559251559252, "grad_norm": 5.410848140716553, "learning_rate": 9.05751905751906e-06, "loss": 0.121, "step": 890 }, { "epoch": 0.9261954261954262, "grad_norm": 1.709598422050476, "learning_rate": 9.055209055209056e-06, "loss": 0.0347, "step": 891 }, { "epoch": 0.9272349272349273, "grad_norm": 10.025089263916016, "learning_rate": 9.052899052899054e-06, "loss": 0.5089, "step": 892 }, { "epoch": 0.9282744282744283, "grad_norm": 0.5808941721916199, "learning_rate": 9.050589050589052e-06, "loss": 0.0147, "step": 893 }, { "epoch": 0.9293139293139293, "grad_norm": 11.988170623779297, "learning_rate": 9.048279048279049e-06, "loss": 0.9453, "step": 894 }, { "epoch": 0.9303534303534303, "grad_norm": 13.248468399047852, "learning_rate": 9.045969045969047e-06, "loss": 0.906, "step": 895 }, { "epoch": 0.9313929313929314, "grad_norm": 9.587570190429688, "learning_rate": 9.043659043659045e-06, "loss": 0.2799, "step": 896 }, { "epoch": 0.9324324324324325, "grad_norm": 4.256852626800537, "learning_rate": 9.041349041349042e-06, "loss": 0.0803, "step": 897 }, { "epoch": 0.9334719334719335, "grad_norm": 1.7316964864730835, "learning_rate": 9.03903903903904e-06, "loss": 0.0563, "step": 898 }, { "epoch": 0.9345114345114345, "grad_norm": 0.29858797788619995, "learning_rate": 9.036729036729038e-06, "loss": 0.0072, "step": 899 }, { "epoch": 0.9355509355509356, "grad_norm": 33.127559661865234, "learning_rate": 9.034419034419034e-06, "loss": 0.3419, "step": 900 }, { "epoch": 0.9365904365904366, "grad_norm": 3.9960570335388184, "learning_rate": 9.032109032109033e-06, "loss": 0.0265, "step": 901 }, { "epoch": 0.9376299376299376, "grad_norm": 5.864764213562012, "learning_rate": 9.029799029799031e-06, "loss": 0.1599, "step": 902 }, { "epoch": 
0.9386694386694386, "grad_norm": 4.86962890625, "learning_rate": 9.027489027489027e-06, "loss": 0.2822, "step": 903 }, { "epoch": 0.9397089397089398, "grad_norm": 0.07891742140054703, "learning_rate": 9.025179025179026e-06, "loss": 0.002, "step": 904 }, { "epoch": 0.9407484407484408, "grad_norm": 1.544185996055603, "learning_rate": 9.022869022869024e-06, "loss": 0.025, "step": 905 }, { "epoch": 0.9417879417879418, "grad_norm": 18.113237380981445, "learning_rate": 9.02055902055902e-06, "loss": 1.8649, "step": 906 }, { "epoch": 0.9428274428274428, "grad_norm": 5.277674198150635, "learning_rate": 9.018249018249018e-06, "loss": 0.1455, "step": 907 }, { "epoch": 0.9438669438669439, "grad_norm": 3.8666481971740723, "learning_rate": 9.015939015939017e-06, "loss": 0.0412, "step": 908 }, { "epoch": 0.9449064449064449, "grad_norm": 5.831012725830078, "learning_rate": 9.013629013629013e-06, "loss": 0.5328, "step": 909 }, { "epoch": 0.9459459459459459, "grad_norm": 10.758270263671875, "learning_rate": 9.011319011319011e-06, "loss": 0.655, "step": 910 }, { "epoch": 0.946985446985447, "grad_norm": 6.609604835510254, "learning_rate": 9.00900900900901e-06, "loss": 0.2318, "step": 911 }, { "epoch": 0.9480249480249481, "grad_norm": 1.2616462707519531, "learning_rate": 9.006699006699008e-06, "loss": 0.0162, "step": 912 }, { "epoch": 0.9490644490644491, "grad_norm": 8.381549835205078, "learning_rate": 9.004389004389004e-06, "loss": 0.4058, "step": 913 }, { "epoch": 0.9501039501039501, "grad_norm": 13.460283279418945, "learning_rate": 9.002079002079002e-06, "loss": 0.6386, "step": 914 }, { "epoch": 0.9511434511434511, "grad_norm": 16.521923065185547, "learning_rate": 8.999768999769e-06, "loss": 0.9705, "step": 915 }, { "epoch": 0.9521829521829522, "grad_norm": 3.118180751800537, "learning_rate": 8.997458997458997e-06, "loss": 0.0535, "step": 916 }, { "epoch": 0.9532224532224532, "grad_norm": 3.885518789291382, "learning_rate": 8.995148995148995e-06, "loss": 0.0602, "step": 917 }, { 
"epoch": 0.9542619542619543, "grad_norm": 15.75216007232666, "learning_rate": 8.992838992838993e-06, "loss": 0.9788, "step": 918 }, { "epoch": 0.9553014553014553, "grad_norm": 15.924515724182129, "learning_rate": 8.990528990528992e-06, "loss": 0.7702, "step": 919 }, { "epoch": 0.9563409563409564, "grad_norm": 8.416720390319824, "learning_rate": 8.988218988218988e-06, "loss": 0.4784, "step": 920 }, { "epoch": 0.9573804573804574, "grad_norm": 9.229931831359863, "learning_rate": 8.985908985908986e-06, "loss": 1.115, "step": 921 }, { "epoch": 0.9584199584199584, "grad_norm": 4.722958564758301, "learning_rate": 8.983598983598985e-06, "loss": 0.1893, "step": 922 }, { "epoch": 0.9594594594594594, "grad_norm": 6.5108137130737305, "learning_rate": 8.981288981288983e-06, "loss": 0.1334, "step": 923 }, { "epoch": 0.9604989604989606, "grad_norm": 16.13971519470215, "learning_rate": 8.97897897897898e-06, "loss": 1.6763, "step": 924 }, { "epoch": 0.9615384615384616, "grad_norm": 6.114139556884766, "learning_rate": 8.976668976668977e-06, "loss": 0.1802, "step": 925 }, { "epoch": 0.9625779625779626, "grad_norm": 17.669862747192383, "learning_rate": 8.974358974358976e-06, "loss": 1.1319, "step": 926 }, { "epoch": 0.9636174636174636, "grad_norm": 2.548318386077881, "learning_rate": 8.972048972048972e-06, "loss": 0.1018, "step": 927 }, { "epoch": 0.9646569646569647, "grad_norm": 7.308948040008545, "learning_rate": 8.96973896973897e-06, "loss": 0.1264, "step": 928 }, { "epoch": 0.9656964656964657, "grad_norm": 3.920917272567749, "learning_rate": 8.967428967428968e-06, "loss": 0.1023, "step": 929 }, { "epoch": 0.9667359667359667, "grad_norm": 10.714740753173828, "learning_rate": 8.965118965118967e-06, "loss": 0.7717, "step": 930 }, { "epoch": 0.9677754677754677, "grad_norm": 7.647885322570801, "learning_rate": 8.962808962808963e-06, "loss": 0.2193, "step": 931 }, { "epoch": 0.9688149688149689, "grad_norm": 9.926279067993164, "learning_rate": 8.960498960498961e-06, "loss": 0.4121, 
"step": 932 }, { "epoch": 0.9698544698544699, "grad_norm": 1.8114266395568848, "learning_rate": 8.95818895818896e-06, "loss": 0.0401, "step": 933 }, { "epoch": 0.9708939708939709, "grad_norm": 13.62614917755127, "learning_rate": 8.955878955878956e-06, "loss": 0.8964, "step": 934 }, { "epoch": 0.9719334719334719, "grad_norm": 1.0622432231903076, "learning_rate": 8.953568953568954e-06, "loss": 0.0155, "step": 935 }, { "epoch": 0.972972972972973, "grad_norm": 6.726819038391113, "learning_rate": 8.951258951258952e-06, "loss": 0.1767, "step": 936 }, { "epoch": 0.974012474012474, "grad_norm": 8.454751014709473, "learning_rate": 8.94894894894895e-06, "loss": 0.5787, "step": 937 }, { "epoch": 0.975051975051975, "grad_norm": 1.420302152633667, "learning_rate": 8.946638946638947e-06, "loss": 0.0451, "step": 938 }, { "epoch": 0.9760914760914761, "grad_norm": 3.865201234817505, "learning_rate": 8.944328944328945e-06, "loss": 0.0869, "step": 939 }, { "epoch": 0.9771309771309772, "grad_norm": 4.933730125427246, "learning_rate": 8.942018942018943e-06, "loss": 0.0811, "step": 940 }, { "epoch": 0.9781704781704782, "grad_norm": 0.5709061622619629, "learning_rate": 8.939708939708942e-06, "loss": 0.0124, "step": 941 }, { "epoch": 0.9792099792099792, "grad_norm": 5.7213053703308105, "learning_rate": 8.937398937398938e-06, "loss": 0.0675, "step": 942 }, { "epoch": 0.9802494802494802, "grad_norm": 18.375486373901367, "learning_rate": 8.935088935088936e-06, "loss": 0.4712, "step": 943 }, { "epoch": 0.9812889812889813, "grad_norm": 12.987750053405762, "learning_rate": 8.932778932778935e-06, "loss": 0.2181, "step": 944 }, { "epoch": 0.9823284823284824, "grad_norm": 2.9199960231781006, "learning_rate": 8.930468930468931e-06, "loss": 0.0576, "step": 945 }, { "epoch": 0.9833679833679834, "grad_norm": 8.773855209350586, "learning_rate": 8.92815892815893e-06, "loss": 0.1334, "step": 946 }, { "epoch": 0.9844074844074844, "grad_norm": 3.0844333171844482, "learning_rate": 8.925848925848927e-06, 
"loss": 0.1197, "step": 947 }, { "epoch": 0.9854469854469855, "grad_norm": 7.7949066162109375, "learning_rate": 8.923538923538924e-06, "loss": 0.4242, "step": 948 }, { "epoch": 0.9864864864864865, "grad_norm": 3.214975595474243, "learning_rate": 8.921228921228922e-06, "loss": 0.0894, "step": 949 }, { "epoch": 0.9875259875259875, "grad_norm": 3.6018924713134766, "learning_rate": 8.91891891891892e-06, "loss": 0.2834, "step": 950 }, { "epoch": 0.9885654885654885, "grad_norm": 7.012938499450684, "learning_rate": 8.916608916608917e-06, "loss": 0.1426, "step": 951 }, { "epoch": 0.9896049896049897, "grad_norm": 23.49381446838379, "learning_rate": 8.914298914298915e-06, "loss": 1.7739, "step": 952 }, { "epoch": 0.9906444906444907, "grad_norm": 13.419476509094238, "learning_rate": 8.911988911988913e-06, "loss": 0.683, "step": 953 }, { "epoch": 0.9916839916839917, "grad_norm": 10.06177043914795, "learning_rate": 8.90967890967891e-06, "loss": 0.541, "step": 954 }, { "epoch": 0.9927234927234927, "grad_norm": 11.302132606506348, "learning_rate": 8.907368907368908e-06, "loss": 0.6544, "step": 955 }, { "epoch": 0.9937629937629938, "grad_norm": 2.3033945560455322, "learning_rate": 8.905058905058906e-06, "loss": 0.0506, "step": 956 }, { "epoch": 0.9948024948024948, "grad_norm": 8.643536567687988, "learning_rate": 8.902748902748903e-06, "loss": 0.4236, "step": 957 }, { "epoch": 0.9958419958419958, "grad_norm": 10.508223533630371, "learning_rate": 8.9004389004389e-06, "loss": 0.8041, "step": 958 }, { "epoch": 0.9968814968814969, "grad_norm": 16.5745792388916, "learning_rate": 8.898128898128899e-06, "loss": 0.7238, "step": 959 }, { "epoch": 0.997920997920998, "grad_norm": 6.697941303253174, "learning_rate": 8.895818895818895e-06, "loss": 0.1965, "step": 960 }, { "epoch": 0.998960498960499, "grad_norm": 1.8422704935073853, "learning_rate": 8.893508893508894e-06, "loss": 0.0437, "step": 961 }, { "epoch": 1.0, "grad_norm": 2.4980247020721436, "learning_rate": 8.891198891198892e-06, 
"loss": 0.0669, "step": 962 }, { "epoch": 1.001039501039501, "grad_norm": 5.414978504180908, "learning_rate": 8.888888888888888e-06, "loss": 0.2941, "step": 963 }, { "epoch": 1.002079002079002, "grad_norm": 10.896177291870117, "learning_rate": 8.886578886578886e-06, "loss": 0.7354, "step": 964 }, { "epoch": 1.003118503118503, "grad_norm": 11.286123275756836, "learning_rate": 8.884268884268885e-06, "loss": 0.4795, "step": 965 }, { "epoch": 1.004158004158004, "grad_norm": 16.44017219543457, "learning_rate": 8.881958881958883e-06, "loss": 0.683, "step": 966 }, { "epoch": 1.0051975051975053, "grad_norm": 6.277804374694824, "learning_rate": 8.87964887964888e-06, "loss": 0.2146, "step": 967 }, { "epoch": 1.0062370062370063, "grad_norm": 14.160048484802246, "learning_rate": 8.877338877338878e-06, "loss": 0.9587, "step": 968 }, { "epoch": 1.0072765072765073, "grad_norm": 1.9493069648742676, "learning_rate": 8.875028875028876e-06, "loss": 0.0522, "step": 969 }, { "epoch": 1.0083160083160083, "grad_norm": 1.8083431720733643, "learning_rate": 8.872718872718874e-06, "loss": 0.0541, "step": 970 }, { "epoch": 1.0093555093555093, "grad_norm": 2.2266340255737305, "learning_rate": 8.87040887040887e-06, "loss": 0.033, "step": 971 }, { "epoch": 1.0103950103950103, "grad_norm": 0.941646933555603, "learning_rate": 8.868098868098869e-06, "loss": 0.0124, "step": 972 }, { "epoch": 1.0114345114345114, "grad_norm": 7.98224401473999, "learning_rate": 8.865788865788867e-06, "loss": 0.3622, "step": 973 }, { "epoch": 1.0124740124740126, "grad_norm": 5.987460613250732, "learning_rate": 8.863478863478863e-06, "loss": 0.2252, "step": 974 }, { "epoch": 1.0135135135135136, "grad_norm": 11.708882331848145, "learning_rate": 8.861168861168862e-06, "loss": 0.411, "step": 975 }, { "epoch": 1.0145530145530146, "grad_norm": 8.168597221374512, "learning_rate": 8.85885885885886e-06, "loss": 0.3952, "step": 976 }, { "epoch": 1.0155925155925156, "grad_norm": 0.640728235244751, "learning_rate": 
8.856548856548858e-06, "loss": 0.015, "step": 977 }, { "epoch": 1.0166320166320166, "grad_norm": 10.76993465423584, "learning_rate": 8.854238854238854e-06, "loss": 0.4832, "step": 978 }, { "epoch": 1.0176715176715176, "grad_norm": 12.423575401306152, "learning_rate": 8.851928851928853e-06, "loss": 0.3292, "step": 979 }, { "epoch": 1.0187110187110187, "grad_norm": 13.478914260864258, "learning_rate": 8.84961884961885e-06, "loss": 0.8858, "step": 980 }, { "epoch": 1.0197505197505197, "grad_norm": 9.08768367767334, "learning_rate": 8.847308847308847e-06, "loss": 0.4073, "step": 981 }, { "epoch": 1.0207900207900207, "grad_norm": 0.12124160677194595, "learning_rate": 8.844998844998845e-06, "loss": 0.0027, "step": 982 }, { "epoch": 1.021829521829522, "grad_norm": 15.824604988098145, "learning_rate": 8.842688842688844e-06, "loss": 1.2646, "step": 983 }, { "epoch": 1.022869022869023, "grad_norm": 1.92180597782135, "learning_rate": 8.840378840378842e-06, "loss": 0.0305, "step": 984 }, { "epoch": 1.023908523908524, "grad_norm": 3.483933210372925, "learning_rate": 8.838068838068838e-06, "loss": 0.0649, "step": 985 }, { "epoch": 1.024948024948025, "grad_norm": 7.1540045738220215, "learning_rate": 8.835758835758837e-06, "loss": 0.3779, "step": 986 }, { "epoch": 1.025987525987526, "grad_norm": 0.35930129885673523, "learning_rate": 8.833448833448835e-06, "loss": 0.0067, "step": 987 }, { "epoch": 1.027027027027027, "grad_norm": 5.606091022491455, "learning_rate": 8.831138831138833e-06, "loss": 0.1168, "step": 988 }, { "epoch": 1.028066528066528, "grad_norm": 13.382498741149902, "learning_rate": 8.82882882882883e-06, "loss": 1.0233, "step": 989 }, { "epoch": 1.0291060291060292, "grad_norm": 8.95033073425293, "learning_rate": 8.826518826518828e-06, "loss": 1.1854, "step": 990 }, { "epoch": 1.0301455301455302, "grad_norm": 10.991436004638672, "learning_rate": 8.824208824208826e-06, "loss": 0.8696, "step": 991 }, { "epoch": 1.0311850311850312, "grad_norm": 17.413393020629883, 
"learning_rate": 8.821898821898822e-06, "loss": 0.3927, "step": 992 }, { "epoch": 1.0322245322245323, "grad_norm": 0.3354548513889313, "learning_rate": 8.81958881958882e-06, "loss": 0.0095, "step": 993 }, { "epoch": 1.0332640332640333, "grad_norm": 3.1128478050231934, "learning_rate": 8.817278817278819e-06, "loss": 0.1201, "step": 994 }, { "epoch": 1.0343035343035343, "grad_norm": 1.8844947814941406, "learning_rate": 8.814968814968817e-06, "loss": 0.0735, "step": 995 }, { "epoch": 1.0353430353430353, "grad_norm": 3.381255626678467, "learning_rate": 8.812658812658813e-06, "loss": 0.0986, "step": 996 }, { "epoch": 1.0363825363825363, "grad_norm": 11.728659629821777, "learning_rate": 8.810348810348812e-06, "loss": 0.3749, "step": 997 }, { "epoch": 1.0374220374220373, "grad_norm": 9.866026878356934, "learning_rate": 8.80803880803881e-06, "loss": 0.4012, "step": 998 }, { "epoch": 1.0384615384615385, "grad_norm": 7.215566635131836, "learning_rate": 8.805728805728806e-06, "loss": 0.2596, "step": 999 }, { "epoch": 1.0395010395010396, "grad_norm": 0.8678320646286011, "learning_rate": 8.803418803418804e-06, "loss": 0.0259, "step": 1000 }, { "epoch": 1.0405405405405406, "grad_norm": 2.84258770942688, "learning_rate": 8.801108801108803e-06, "loss": 0.0602, "step": 1001 }, { "epoch": 1.0415800415800416, "grad_norm": 0.3957040309906006, "learning_rate": 8.798798798798799e-06, "loss": 0.006, "step": 1002 }, { "epoch": 1.0426195426195426, "grad_norm": 10.925928115844727, "learning_rate": 8.796488796488797e-06, "loss": 0.7463, "step": 1003 }, { "epoch": 1.0436590436590436, "grad_norm": 9.018205642700195, "learning_rate": 8.794178794178795e-06, "loss": 0.826, "step": 1004 }, { "epoch": 1.0446985446985446, "grad_norm": 1.216611623764038, "learning_rate": 8.791868791868794e-06, "loss": 0.0239, "step": 1005 }, { "epoch": 1.0457380457380459, "grad_norm": 0.12714122235774994, "learning_rate": 8.78955878955879e-06, "loss": 0.0023, "step": 1006 }, { "epoch": 1.0467775467775469, 
"grad_norm": 1.0121233463287354, "learning_rate": 8.787248787248788e-06, "loss": 0.0178, "step": 1007 }, { "epoch": 1.0478170478170479, "grad_norm": 5.475372314453125, "learning_rate": 8.784938784938787e-06, "loss": 0.0919, "step": 1008 }, { "epoch": 1.0488565488565489, "grad_norm": 2.838118553161621, "learning_rate": 8.782628782628783e-06, "loss": 0.0921, "step": 1009 }, { "epoch": 1.04989604989605, "grad_norm": 9.721406936645508, "learning_rate": 8.780318780318781e-06, "loss": 0.3828, "step": 1010 }, { "epoch": 1.050935550935551, "grad_norm": 9.811200141906738, "learning_rate": 8.77800877800878e-06, "loss": 0.5693, "step": 1011 }, { "epoch": 1.051975051975052, "grad_norm": 1.1226965188980103, "learning_rate": 8.775698775698776e-06, "loss": 0.0177, "step": 1012 }, { "epoch": 1.053014553014553, "grad_norm": 2.9267220497131348, "learning_rate": 8.773388773388774e-06, "loss": 0.0449, "step": 1013 }, { "epoch": 1.054054054054054, "grad_norm": 9.672924041748047, "learning_rate": 8.771078771078772e-06, "loss": 0.2402, "step": 1014 }, { "epoch": 1.0550935550935552, "grad_norm": 11.029509544372559, "learning_rate": 8.768768768768769e-06, "loss": 0.5824, "step": 1015 }, { "epoch": 1.0561330561330562, "grad_norm": 16.500804901123047, "learning_rate": 8.766458766458767e-06, "loss": 1.3339, "step": 1016 }, { "epoch": 1.0571725571725572, "grad_norm": 3.943711757659912, "learning_rate": 8.764148764148765e-06, "loss": 0.1545, "step": 1017 }, { "epoch": 1.0582120582120582, "grad_norm": 8.231452941894531, "learning_rate": 8.761838761838762e-06, "loss": 0.3989, "step": 1018 }, { "epoch": 1.0592515592515592, "grad_norm": 7.401911735534668, "learning_rate": 8.75952875952876e-06, "loss": 0.1997, "step": 1019 }, { "epoch": 1.0602910602910602, "grad_norm": 12.006552696228027, "learning_rate": 8.757218757218758e-06, "loss": 0.6828, "step": 1020 }, { "epoch": 1.0613305613305613, "grad_norm": 24.606016159057617, "learning_rate": 8.754908754908755e-06, "loss": 1.5875, "step": 1021 }, { 
"epoch": 1.0623700623700625, "grad_norm": 18.473796844482422, "learning_rate": 8.752598752598753e-06, "loss": 0.8307, "step": 1022 }, { "epoch": 1.0634095634095635, "grad_norm": 21.40069580078125, "learning_rate": 8.750288750288751e-06, "loss": 0.5301, "step": 1023 }, { "epoch": 1.0644490644490645, "grad_norm": 8.655167579650879, "learning_rate": 8.747978747978749e-06, "loss": 0.3022, "step": 1024 }, { "epoch": 1.0654885654885655, "grad_norm": 13.602567672729492, "learning_rate": 8.745668745668746e-06, "loss": 0.373, "step": 1025 }, { "epoch": 1.0665280665280665, "grad_norm": 2.638014793395996, "learning_rate": 8.743358743358744e-06, "loss": 0.0676, "step": 1026 }, { "epoch": 1.0675675675675675, "grad_norm": 7.661709308624268, "learning_rate": 8.741048741048742e-06, "loss": 0.2338, "step": 1027 }, { "epoch": 1.0686070686070686, "grad_norm": 7.508708477020264, "learning_rate": 8.738738738738739e-06, "loss": 0.4082, "step": 1028 }, { "epoch": 1.0696465696465696, "grad_norm": 3.627363443374634, "learning_rate": 8.736428736428737e-06, "loss": 0.0684, "step": 1029 }, { "epoch": 1.0706860706860706, "grad_norm": 5.621256351470947, "learning_rate": 8.734118734118735e-06, "loss": 0.1047, "step": 1030 }, { "epoch": 1.0717255717255718, "grad_norm": 0.027766048908233643, "learning_rate": 8.731808731808733e-06, "loss": 0.0008, "step": 1031 }, { "epoch": 1.0727650727650728, "grad_norm": 0.6192247867584229, "learning_rate": 8.72949872949873e-06, "loss": 0.0086, "step": 1032 }, { "epoch": 1.0738045738045738, "grad_norm": 5.98634672164917, "learning_rate": 8.727188727188728e-06, "loss": 0.1553, "step": 1033 }, { "epoch": 1.0748440748440748, "grad_norm": 7.63029146194458, "learning_rate": 8.724878724878726e-06, "loss": 0.4071, "step": 1034 }, { "epoch": 1.0758835758835759, "grad_norm": 17.183271408081055, "learning_rate": 8.722568722568722e-06, "loss": 0.724, "step": 1035 }, { "epoch": 1.0769230769230769, "grad_norm": 8.9673490524292, "learning_rate": 8.72025872025872e-06, "loss": 
0.0969, "step": 1036 }, { "epoch": 1.0779625779625779, "grad_norm": 9.389533996582031, "learning_rate": 8.717948717948719e-06, "loss": 0.7216, "step": 1037 }, { "epoch": 1.0790020790020791, "grad_norm": 7.645777702331543, "learning_rate": 8.715638715638717e-06, "loss": 0.3217, "step": 1038 }, { "epoch": 1.0800415800415801, "grad_norm": 1.5751149654388428, "learning_rate": 8.713328713328714e-06, "loss": 0.031, "step": 1039 }, { "epoch": 1.0810810810810811, "grad_norm": 2.261518955230713, "learning_rate": 8.711018711018712e-06, "loss": 0.0895, "step": 1040 }, { "epoch": 1.0821205821205822, "grad_norm": 0.32005324959754944, "learning_rate": 8.70870870870871e-06, "loss": 0.0096, "step": 1041 }, { "epoch": 1.0831600831600832, "grad_norm": 0.7632772922515869, "learning_rate": 8.706398706398708e-06, "loss": 0.0126, "step": 1042 }, { "epoch": 1.0841995841995842, "grad_norm": 3.8408350944519043, "learning_rate": 8.704088704088705e-06, "loss": 0.0494, "step": 1043 }, { "epoch": 1.0852390852390852, "grad_norm": 6.506851673126221, "learning_rate": 8.701778701778703e-06, "loss": 0.1149, "step": 1044 }, { "epoch": 1.0862785862785862, "grad_norm": 5.510050296783447, "learning_rate": 8.699468699468701e-06, "loss": 0.1562, "step": 1045 }, { "epoch": 1.0873180873180872, "grad_norm": 0.16971538960933685, "learning_rate": 8.697158697158697e-06, "loss": 0.003, "step": 1046 }, { "epoch": 1.0883575883575884, "grad_norm": 9.97668170928955, "learning_rate": 8.694848694848696e-06, "loss": 0.6033, "step": 1047 }, { "epoch": 1.0893970893970895, "grad_norm": 0.341886967420578, "learning_rate": 8.692538692538694e-06, "loss": 0.0061, "step": 1048 }, { "epoch": 1.0904365904365905, "grad_norm": 10.055686950683594, "learning_rate": 8.690228690228692e-06, "loss": 0.2323, "step": 1049 }, { "epoch": 1.0914760914760915, "grad_norm": 6.149728775024414, "learning_rate": 8.687918687918689e-06, "loss": 0.2168, "step": 1050 }, { "epoch": 1.0925155925155925, "grad_norm": 0.03411586210131645, "learning_rate": 
8.685608685608687e-06, "loss": 0.0009, "step": 1051 }, { "epoch": 1.0935550935550935, "grad_norm": 1.4763023853302002, "learning_rate": 8.683298683298685e-06, "loss": 0.0403, "step": 1052 }, { "epoch": 1.0945945945945945, "grad_norm": 0.5744214653968811, "learning_rate": 8.680988680988681e-06, "loss": 0.0182, "step": 1053 }, { "epoch": 1.0956340956340958, "grad_norm": 8.413043975830078, "learning_rate": 8.67867867867868e-06, "loss": 0.4508, "step": 1054 }, { "epoch": 1.0966735966735968, "grad_norm": 3.0033912658691406, "learning_rate": 8.676368676368678e-06, "loss": 0.0878, "step": 1055 }, { "epoch": 1.0977130977130978, "grad_norm": 2.9098126888275146, "learning_rate": 8.674058674058676e-06, "loss": 0.123, "step": 1056 }, { "epoch": 1.0987525987525988, "grad_norm": 10.180469512939453, "learning_rate": 8.671748671748672e-06, "loss": 0.3434, "step": 1057 }, { "epoch": 1.0997920997920998, "grad_norm": 0.4921611249446869, "learning_rate": 8.66943866943867e-06, "loss": 0.0136, "step": 1058 }, { "epoch": 1.1008316008316008, "grad_norm": 13.588961601257324, "learning_rate": 8.667128667128669e-06, "loss": 1.4453, "step": 1059 }, { "epoch": 1.1018711018711018, "grad_norm": 14.660667419433594, "learning_rate": 8.664818664818665e-06, "loss": 0.5864, "step": 1060 }, { "epoch": 1.1029106029106028, "grad_norm": 5.017448902130127, "learning_rate": 8.662508662508664e-06, "loss": 0.1624, "step": 1061 }, { "epoch": 1.1039501039501038, "grad_norm": 17.048917770385742, "learning_rate": 8.660198660198662e-06, "loss": 1.5054, "step": 1062 }, { "epoch": 1.104989604989605, "grad_norm": 8.416142463684082, "learning_rate": 8.657888657888658e-06, "loss": 0.4395, "step": 1063 }, { "epoch": 1.106029106029106, "grad_norm": 10.862117767333984, "learning_rate": 8.655578655578656e-06, "loss": 0.9302, "step": 1064 }, { "epoch": 1.107068607068607, "grad_norm": 10.158036231994629, "learning_rate": 8.653268653268655e-06, "loss": 0.5861, "step": 1065 }, { "epoch": 1.1081081081081081, "grad_norm": 
6.491422653198242, "learning_rate": 8.650958650958651e-06, "loss": 0.1169, "step": 1066 }, { "epoch": 1.1091476091476091, "grad_norm": 6.694852352142334, "learning_rate": 8.64864864864865e-06, "loss": 0.1717, "step": 1067 }, { "epoch": 1.1101871101871101, "grad_norm": 5.018531799316406, "learning_rate": 8.646338646338648e-06, "loss": 0.1341, "step": 1068 }, { "epoch": 1.1112266112266111, "grad_norm": 1.5225193500518799, "learning_rate": 8.644028644028644e-06, "loss": 0.0356, "step": 1069 }, { "epoch": 1.1122661122661124, "grad_norm": 2.1500790119171143, "learning_rate": 8.641718641718642e-06, "loss": 0.0403, "step": 1070 }, { "epoch": 1.1133056133056134, "grad_norm": 33.20718002319336, "learning_rate": 8.63940863940864e-06, "loss": 0.1609, "step": 1071 }, { "epoch": 1.1143451143451144, "grad_norm": 16.251794815063477, "learning_rate": 8.637098637098637e-06, "loss": 0.5655, "step": 1072 }, { "epoch": 1.1153846153846154, "grad_norm": 3.3469290733337402, "learning_rate": 8.634788634788635e-06, "loss": 0.0595, "step": 1073 }, { "epoch": 1.1164241164241164, "grad_norm": 0.12442325055599213, "learning_rate": 8.632478632478633e-06, "loss": 0.002, "step": 1074 }, { "epoch": 1.1174636174636174, "grad_norm": 0.4608583450317383, "learning_rate": 8.63016863016863e-06, "loss": 0.0039, "step": 1075 }, { "epoch": 1.1185031185031185, "grad_norm": 6.6315717697143555, "learning_rate": 8.627858627858628e-06, "loss": 0.1658, "step": 1076 }, { "epoch": 1.1195426195426195, "grad_norm": 14.120782852172852, "learning_rate": 8.625548625548626e-06, "loss": 1.1364, "step": 1077 }, { "epoch": 1.1205821205821205, "grad_norm": 12.094304084777832, "learning_rate": 8.623238623238624e-06, "loss": 1.4489, "step": 1078 }, { "epoch": 1.1216216216216217, "grad_norm": 14.251092910766602, "learning_rate": 8.62092862092862e-06, "loss": 1.493, "step": 1079 }, { "epoch": 1.1226611226611227, "grad_norm": 2.4907310009002686, "learning_rate": 8.618618618618619e-06, "loss": 0.0375, "step": 1080 }, { "epoch": 
1.1237006237006237, "grad_norm": 10.427132606506348, "learning_rate": 8.616308616308617e-06, "loss": 0.2393, "step": 1081 }, { "epoch": 1.1247401247401247, "grad_norm": 0.9016990661621094, "learning_rate": 8.613998613998614e-06, "loss": 0.024, "step": 1082 }, { "epoch": 1.1257796257796258, "grad_norm": 13.01040267944336, "learning_rate": 8.611688611688612e-06, "loss": 0.3459, "step": 1083 }, { "epoch": 1.1268191268191268, "grad_norm": 0.26096540689468384, "learning_rate": 8.60937860937861e-06, "loss": 0.0075, "step": 1084 }, { "epoch": 1.1278586278586278, "grad_norm": 5.673876762390137, "learning_rate": 8.607068607068608e-06, "loss": 0.2176, "step": 1085 }, { "epoch": 1.128898128898129, "grad_norm": 1.2679526805877686, "learning_rate": 8.604758604758605e-06, "loss": 0.0507, "step": 1086 }, { "epoch": 1.12993762993763, "grad_norm": 10.18384838104248, "learning_rate": 8.602448602448603e-06, "loss": 0.9842, "step": 1087 }, { "epoch": 1.130977130977131, "grad_norm": 1.5802457332611084, "learning_rate": 8.600138600138601e-06, "loss": 0.0257, "step": 1088 }, { "epoch": 1.132016632016632, "grad_norm": 11.541592597961426, "learning_rate": 8.597828597828598e-06, "loss": 1.3012, "step": 1089 }, { "epoch": 1.133056133056133, "grad_norm": 9.967923164367676, "learning_rate": 8.595518595518596e-06, "loss": 0.8518, "step": 1090 }, { "epoch": 1.134095634095634, "grad_norm": 1.6320937871932983, "learning_rate": 8.593208593208594e-06, "loss": 0.0294, "step": 1091 }, { "epoch": 1.135135135135135, "grad_norm": 18.003585815429688, "learning_rate": 8.590898590898592e-06, "loss": 1.2795, "step": 1092 }, { "epoch": 1.136174636174636, "grad_norm": 9.14699935913086, "learning_rate": 8.588588588588589e-06, "loss": 0.2811, "step": 1093 }, { "epoch": 1.137214137214137, "grad_norm": 8.81598949432373, "learning_rate": 8.586278586278587e-06, "loss": 0.4506, "step": 1094 }, { "epoch": 1.1382536382536383, "grad_norm": 10.513948440551758, "learning_rate": 8.583968583968585e-06, "loss": 0.2852, 
"step": 1095 }, { "epoch": 1.1392931392931394, "grad_norm": 10.706802368164062, "learning_rate": 8.581658581658583e-06, "loss": 0.3514, "step": 1096 }, { "epoch": 1.1403326403326404, "grad_norm": 9.5643310546875, "learning_rate": 8.57934857934858e-06, "loss": 1.0022, "step": 1097 }, { "epoch": 1.1413721413721414, "grad_norm": 3.270961284637451, "learning_rate": 8.577038577038578e-06, "loss": 0.0555, "step": 1098 }, { "epoch": 1.1424116424116424, "grad_norm": 11.908968925476074, "learning_rate": 8.574728574728576e-06, "loss": 0.2819, "step": 1099 }, { "epoch": 1.1434511434511434, "grad_norm": 9.292601585388184, "learning_rate": 8.572418572418573e-06, "loss": 0.3581, "step": 1100 }, { "epoch": 1.1444906444906444, "grad_norm": 0.7729955911636353, "learning_rate": 8.570108570108571e-06, "loss": 0.0193, "step": 1101 }, { "epoch": 1.1455301455301456, "grad_norm": 1.3490409851074219, "learning_rate": 8.567798567798569e-06, "loss": 0.0295, "step": 1102 }, { "epoch": 1.1465696465696467, "grad_norm": 9.878353118896484, "learning_rate": 8.565488565488567e-06, "loss": 0.4995, "step": 1103 }, { "epoch": 1.1476091476091477, "grad_norm": 8.845831871032715, "learning_rate": 8.563178563178564e-06, "loss": 0.6224, "step": 1104 }, { "epoch": 1.1486486486486487, "grad_norm": 0.5114465951919556, "learning_rate": 8.560868560868562e-06, "loss": 0.0118, "step": 1105 }, { "epoch": 1.1496881496881497, "grad_norm": 0.6183711290359497, "learning_rate": 8.55855855855856e-06, "loss": 0.0184, "step": 1106 }, { "epoch": 1.1507276507276507, "grad_norm": 0.25186434388160706, "learning_rate": 8.556248556248557e-06, "loss": 0.0053, "step": 1107 }, { "epoch": 1.1517671517671517, "grad_norm": 2.6005747318267822, "learning_rate": 8.553938553938555e-06, "loss": 0.0756, "step": 1108 }, { "epoch": 1.1528066528066527, "grad_norm": 6.125280380249023, "learning_rate": 8.551628551628553e-06, "loss": 0.3256, "step": 1109 }, { "epoch": 1.1538461538461537, "grad_norm": 6.985195636749268, "learning_rate": 
8.549318549318551e-06, "loss": 0.3697, "step": 1110 }, { "epoch": 1.154885654885655, "grad_norm": 7.720252513885498, "learning_rate": 8.547008547008548e-06, "loss": 0.3452, "step": 1111 }, { "epoch": 1.155925155925156, "grad_norm": 0.41022229194641113, "learning_rate": 8.544698544698546e-06, "loss": 0.0086, "step": 1112 }, { "epoch": 1.156964656964657, "grad_norm": 4.509541988372803, "learning_rate": 8.542388542388544e-06, "loss": 0.1699, "step": 1113 }, { "epoch": 1.158004158004158, "grad_norm": 0.026520010083913803, "learning_rate": 8.54007854007854e-06, "loss": 0.0009, "step": 1114 }, { "epoch": 1.159043659043659, "grad_norm": 1.7383307218551636, "learning_rate": 8.537768537768539e-06, "loss": 0.0466, "step": 1115 }, { "epoch": 1.16008316008316, "grad_norm": 5.903339385986328, "learning_rate": 8.535458535458537e-06, "loss": 0.2287, "step": 1116 }, { "epoch": 1.161122661122661, "grad_norm": 2.3729774951934814, "learning_rate": 8.533148533148533e-06, "loss": 0.0557, "step": 1117 }, { "epoch": 1.1621621621621623, "grad_norm": 7.9382548332214355, "learning_rate": 8.530838530838532e-06, "loss": 0.3049, "step": 1118 }, { "epoch": 1.1632016632016633, "grad_norm": 19.727888107299805, "learning_rate": 8.52852852852853e-06, "loss": 1.9244, "step": 1119 }, { "epoch": 1.1642411642411643, "grad_norm": 8.529170989990234, "learning_rate": 8.526218526218526e-06, "loss": 0.4675, "step": 1120 }, { "epoch": 1.1652806652806653, "grad_norm": 3.230149507522583, "learning_rate": 8.523908523908525e-06, "loss": 0.0742, "step": 1121 }, { "epoch": 1.1663201663201663, "grad_norm": 2.254007577896118, "learning_rate": 8.521598521598523e-06, "loss": 0.0609, "step": 1122 }, { "epoch": 1.1673596673596673, "grad_norm": 7.165493011474609, "learning_rate": 8.51928851928852e-06, "loss": 0.4766, "step": 1123 }, { "epoch": 1.1683991683991684, "grad_norm": 6.5497145652771, "learning_rate": 8.516978516978517e-06, "loss": 0.1697, "step": 1124 }, { "epoch": 1.1694386694386694, "grad_norm": 
7.064365863800049, "learning_rate": 8.514668514668516e-06, "loss": 0.1954, "step": 1125 }, { "epoch": 1.1704781704781704, "grad_norm": 1.0393562316894531, "learning_rate": 8.512358512358512e-06, "loss": 0.029, "step": 1126 }, { "epoch": 1.1715176715176716, "grad_norm": 14.624014854431152, "learning_rate": 8.51004851004851e-06, "loss": 0.3145, "step": 1127 }, { "epoch": 1.1725571725571726, "grad_norm": 11.193547248840332, "learning_rate": 8.507738507738508e-06, "loss": 0.785, "step": 1128 }, { "epoch": 1.1735966735966736, "grad_norm": 9.468733787536621, "learning_rate": 8.505428505428505e-06, "loss": 0.5649, "step": 1129 }, { "epoch": 1.1746361746361746, "grad_norm": 14.636096954345703, "learning_rate": 8.503118503118503e-06, "loss": 1.0006, "step": 1130 }, { "epoch": 1.1756756756756757, "grad_norm": 0.3806226849555969, "learning_rate": 8.500808500808501e-06, "loss": 0.0093, "step": 1131 }, { "epoch": 1.1767151767151767, "grad_norm": 0.13574692606925964, "learning_rate": 8.4984984984985e-06, "loss": 0.0033, "step": 1132 }, { "epoch": 1.1777546777546777, "grad_norm": 8.421379089355469, "learning_rate": 8.496188496188496e-06, "loss": 0.2561, "step": 1133 }, { "epoch": 1.178794178794179, "grad_norm": 4.2164201736450195, "learning_rate": 8.493878493878494e-06, "loss": 0.2455, "step": 1134 }, { "epoch": 1.17983367983368, "grad_norm": 6.926000595092773, "learning_rate": 8.491568491568492e-06, "loss": 0.3277, "step": 1135 }, { "epoch": 1.180873180873181, "grad_norm": 3.1254656314849854, "learning_rate": 8.489258489258489e-06, "loss": 0.0762, "step": 1136 }, { "epoch": 1.181912681912682, "grad_norm": 0.08411677926778793, "learning_rate": 8.486948486948487e-06, "loss": 0.0014, "step": 1137 }, { "epoch": 1.182952182952183, "grad_norm": 1.5572375059127808, "learning_rate": 8.484638484638485e-06, "loss": 0.0215, "step": 1138 }, { "epoch": 1.183991683991684, "grad_norm": 1.5848573446273804, "learning_rate": 8.482328482328483e-06, "loss": 0.0225, "step": 1139 }, { "epoch": 
1.185031185031185, "grad_norm": 1.0582151412963867, "learning_rate": 8.48001848001848e-06, "loss": 0.2594, "step": 1140 }, { "epoch": 1.186070686070686, "grad_norm": 9.030559539794922, "learning_rate": 8.477708477708478e-06, "loss": 0.2851, "step": 1141 }, { "epoch": 1.187110187110187, "grad_norm": 15.570144653320312, "learning_rate": 8.475398475398476e-06, "loss": 0.3463, "step": 1142 }, { "epoch": 1.1881496881496882, "grad_norm": 10.316506385803223, "learning_rate": 8.473088473088473e-06, "loss": 0.5758, "step": 1143 }, { "epoch": 1.1891891891891893, "grad_norm": 5.263421058654785, "learning_rate": 8.470778470778471e-06, "loss": 0.1077, "step": 1144 }, { "epoch": 1.1902286902286903, "grad_norm": 4.201646327972412, "learning_rate": 8.46846846846847e-06, "loss": 0.1171, "step": 1145 }, { "epoch": 1.1912681912681913, "grad_norm": 0.2046760469675064, "learning_rate": 8.466158466158467e-06, "loss": 0.0044, "step": 1146 }, { "epoch": 1.1923076923076923, "grad_norm": 2.979363203048706, "learning_rate": 8.463848463848464e-06, "loss": 0.0425, "step": 1147 }, { "epoch": 1.1933471933471933, "grad_norm": 18.337308883666992, "learning_rate": 8.461538461538462e-06, "loss": 1.5175, "step": 1148 }, { "epoch": 1.1943866943866943, "grad_norm": 3.2387545108795166, "learning_rate": 8.45922845922846e-06, "loss": 0.0629, "step": 1149 }, { "epoch": 1.1954261954261955, "grad_norm": 10.86263656616211, "learning_rate": 8.456918456918458e-06, "loss": 0.3675, "step": 1150 }, { "epoch": 1.1964656964656966, "grad_norm": null, "learning_rate": 8.454608454608455e-06, "loss": 0.0022, "step": 1151 }, { "epoch": 1.1975051975051976, "grad_norm": 1.9744138717651367, "learning_rate": 8.452298452298453e-06, "loss": 0.0545, "step": 1152 }, { "epoch": 1.1985446985446986, "grad_norm": 9.182241439819336, "learning_rate": 8.449988449988451e-06, "loss": 0.1764, "step": 1153 }, { "epoch": 1.1995841995841996, "grad_norm": 14.103419303894043, "learning_rate": 8.447678447678448e-06, "loss": 1.7113, "step": 1154 
}, { "epoch": 1.2006237006237006, "grad_norm": 5.250944137573242, "learning_rate": 8.445368445368446e-06, "loss": 0.2917, "step": 1155 }, { "epoch": 1.2016632016632016, "grad_norm": 8.00632095336914, "learning_rate": 8.443058443058444e-06, "loss": 0.3142, "step": 1156 }, { "epoch": 1.2027027027027026, "grad_norm": 9.843782424926758, "learning_rate": 8.440748440748442e-06, "loss": 0.524, "step": 1157 }, { "epoch": 1.2037422037422036, "grad_norm": 0.603416919708252, "learning_rate": 8.438438438438439e-06, "loss": 0.0099, "step": 1158 }, { "epoch": 1.2047817047817049, "grad_norm": 8.074755668640137, "learning_rate": 8.436128436128437e-06, "loss": 0.2532, "step": 1159 }, { "epoch": 1.2058212058212059, "grad_norm": 8.356449127197266, "learning_rate": 8.433818433818435e-06, "loss": 1.1221, "step": 1160 }, { "epoch": 1.206860706860707, "grad_norm": 12.065170288085938, "learning_rate": 8.431508431508432e-06, "loss": 0.6152, "step": 1161 }, { "epoch": 1.207900207900208, "grad_norm": 1.928618311882019, "learning_rate": 8.42919842919843e-06, "loss": 0.039, "step": 1162 }, { "epoch": 1.208939708939709, "grad_norm": 3.5592713356018066, "learning_rate": 8.426888426888428e-06, "loss": 0.1305, "step": 1163 }, { "epoch": 1.20997920997921, "grad_norm": 6.7794671058654785, "learning_rate": 8.424578424578426e-06, "loss": 0.5323, "step": 1164 }, { "epoch": 1.211018711018711, "grad_norm": 10.264362335205078, "learning_rate": 8.422268422268423e-06, "loss": 0.3864, "step": 1165 }, { "epoch": 1.2120582120582122, "grad_norm": 11.034218788146973, "learning_rate": 8.419958419958421e-06, "loss": 0.3514, "step": 1166 }, { "epoch": 1.2130977130977132, "grad_norm": 5.860688209533691, "learning_rate": 8.41764841764842e-06, "loss": 0.2283, "step": 1167 }, { "epoch": 1.2141372141372142, "grad_norm": 5.28604793548584, "learning_rate": 8.415338415338416e-06, "loss": 0.0811, "step": 1168 }, { "epoch": 1.2151767151767152, "grad_norm": 3.432724952697754, "learning_rate": 8.413028413028414e-06, "loss": 
0.1946, "step": 1169 }, { "epoch": 1.2162162162162162, "grad_norm": 11.879193305969238, "learning_rate": 8.410718410718412e-06, "loss": 1.0293, "step": 1170 }, { "epoch": 1.2172557172557172, "grad_norm": 4.63136625289917, "learning_rate": 8.408408408408409e-06, "loss": 0.1425, "step": 1171 }, { "epoch": 1.2182952182952183, "grad_norm": 3.035247325897217, "learning_rate": 8.406098406098407e-06, "loss": 0.0758, "step": 1172 }, { "epoch": 1.2193347193347193, "grad_norm": 0.7272508144378662, "learning_rate": 8.403788403788405e-06, "loss": 0.0207, "step": 1173 }, { "epoch": 1.2203742203742203, "grad_norm": 7.765983581542969, "learning_rate": 8.401478401478402e-06, "loss": 0.4684, "step": 1174 }, { "epoch": 1.2214137214137215, "grad_norm": 1.4539945125579834, "learning_rate": 8.3991683991684e-06, "loss": 0.0477, "step": 1175 }, { "epoch": 1.2224532224532225, "grad_norm": 7.17827033996582, "learning_rate": 8.396858396858398e-06, "loss": 0.5285, "step": 1176 }, { "epoch": 1.2234927234927235, "grad_norm": 2.0810439586639404, "learning_rate": 8.394548394548394e-06, "loss": 0.1235, "step": 1177 }, { "epoch": 1.2245322245322245, "grad_norm": 1.930201530456543, "learning_rate": 8.392238392238393e-06, "loss": 0.0675, "step": 1178 }, { "epoch": 1.2255717255717256, "grad_norm": 14.370950698852539, "learning_rate": 8.38992838992839e-06, "loss": 1.0809, "step": 1179 }, { "epoch": 1.2266112266112266, "grad_norm": 7.8660736083984375, "learning_rate": 8.387618387618387e-06, "loss": 0.5185, "step": 1180 }, { "epoch": 1.2276507276507276, "grad_norm": 1.655981183052063, "learning_rate": 8.385308385308385e-06, "loss": 0.0581, "step": 1181 }, { "epoch": 1.2286902286902288, "grad_norm": 3.708282947540283, "learning_rate": 8.382998382998384e-06, "loss": 0.1657, "step": 1182 }, { "epoch": 1.2297297297297298, "grad_norm": 2.6310458183288574, "learning_rate": 8.38068838068838e-06, "loss": 0.0677, "step": 1183 }, { "epoch": 1.2307692307692308, "grad_norm": 2.877119779586792, "learning_rate": 
8.378378378378378e-06, "loss": 0.0854, "step": 1184 }, { "epoch": 1.2318087318087318, "grad_norm": 3.463524341583252, "learning_rate": 8.376068376068377e-06, "loss": 0.1511, "step": 1185 }, { "epoch": 1.2328482328482329, "grad_norm": 8.751903533935547, "learning_rate": 8.373758373758375e-06, "loss": 0.4159, "step": 1186 }, { "epoch": 1.2338877338877339, "grad_norm": 0.5461509823799133, "learning_rate": 8.371448371448371e-06, "loss": 0.0178, "step": 1187 }, { "epoch": 1.2349272349272349, "grad_norm": 0.9689798355102539, "learning_rate": 8.36913836913837e-06, "loss": 0.0244, "step": 1188 }, { "epoch": 1.235966735966736, "grad_norm": 8.795180320739746, "learning_rate": 8.366828366828368e-06, "loss": 0.5102, "step": 1189 }, { "epoch": 1.237006237006237, "grad_norm": 1.2706940174102783, "learning_rate": 8.364518364518364e-06, "loss": 0.0361, "step": 1190 }, { "epoch": 1.2380457380457381, "grad_norm": 0.7795925140380859, "learning_rate": 8.362208362208362e-06, "loss": 0.0183, "step": 1191 }, { "epoch": 1.2390852390852392, "grad_norm": 9.270479202270508, "learning_rate": 8.35989835989836e-06, "loss": 0.8109, "step": 1192 }, { "epoch": 1.2401247401247402, "grad_norm": 6.105793476104736, "learning_rate": 8.357588357588359e-06, "loss": 0.243, "step": 1193 }, { "epoch": 1.2411642411642412, "grad_norm": 0.34443458914756775, "learning_rate": 8.355278355278355e-06, "loss": 0.0044, "step": 1194 }, { "epoch": 1.2422037422037422, "grad_norm": 1.8256933689117432, "learning_rate": 8.352968352968353e-06, "loss": 0.0363, "step": 1195 }, { "epoch": 1.2432432432432432, "grad_norm": 8.279906272888184, "learning_rate": 8.350658350658352e-06, "loss": 0.5154, "step": 1196 }, { "epoch": 1.2442827442827442, "grad_norm": 1.0331989526748657, "learning_rate": 8.348348348348348e-06, "loss": 0.028, "step": 1197 }, { "epoch": 1.2453222453222454, "grad_norm": 4.658940315246582, "learning_rate": 8.346038346038346e-06, "loss": 0.2403, "step": 1198 }, { "epoch": 1.2463617463617465, "grad_norm": 
5.019296169281006, "learning_rate": 8.343728343728344e-06, "loss": 0.0645, "step": 1199 }, { "epoch": 1.2474012474012475, "grad_norm": 0.07969323545694351, "learning_rate": 8.341418341418343e-06, "loss": 0.0015, "step": 1200 }, { "epoch": 1.2484407484407485, "grad_norm": 0.04548674076795578, "learning_rate": 8.339108339108339e-06, "loss": 0.0005, "step": 1201 }, { "epoch": 1.2494802494802495, "grad_norm": 10.047810554504395, "learning_rate": 8.336798336798337e-06, "loss": 0.6915, "step": 1202 }, { "epoch": 1.2505197505197505, "grad_norm": 0.18876366317272186, "learning_rate": 8.334488334488335e-06, "loss": 0.005, "step": 1203 }, { "epoch": 1.2515592515592515, "grad_norm": 13.021455764770508, "learning_rate": 8.332178332178334e-06, "loss": 0.9729, "step": 1204 }, { "epoch": 1.2525987525987525, "grad_norm": 1.3052574396133423, "learning_rate": 8.32986832986833e-06, "loss": 0.0346, "step": 1205 }, { "epoch": 1.2536382536382535, "grad_norm": 9.151220321655273, "learning_rate": 8.327558327558328e-06, "loss": 0.5962, "step": 1206 }, { "epoch": 1.2546777546777546, "grad_norm": 7.05905294418335, "learning_rate": 8.325248325248327e-06, "loss": 0.2319, "step": 1207 }, { "epoch": 1.2557172557172558, "grad_norm": 6.653399467468262, "learning_rate": 8.322938322938323e-06, "loss": 0.141, "step": 1208 }, { "epoch": 1.2567567567567568, "grad_norm": 0.6559470891952515, "learning_rate": 8.320628320628321e-06, "loss": 0.0125, "step": 1209 }, { "epoch": 1.2577962577962578, "grad_norm": 10.03073501586914, "learning_rate": 8.31831831831832e-06, "loss": 0.6753, "step": 1210 }, { "epoch": 1.2588357588357588, "grad_norm": 1.6613415479660034, "learning_rate": 8.316008316008318e-06, "loss": 0.0338, "step": 1211 }, { "epoch": 1.2598752598752598, "grad_norm": 6.826779842376709, "learning_rate": 8.313698313698314e-06, "loss": 0.3042, "step": 1212 }, { "epoch": 1.2609147609147608, "grad_norm": 2.6513710021972656, "learning_rate": 8.311388311388312e-06, "loss": 0.1294, "step": 1213 }, { "epoch": 
1.261954261954262, "grad_norm": 5.151392936706543, "learning_rate": 8.30907830907831e-06, "loss": 0.0831, "step": 1214 }, { "epoch": 1.262993762993763, "grad_norm": 6.245535373687744, "learning_rate": 8.306768306768307e-06, "loss": 0.1705, "step": 1215 }, { "epoch": 1.264033264033264, "grad_norm": 0.5823240876197815, "learning_rate": 8.304458304458305e-06, "loss": 0.0125, "step": 1216 }, { "epoch": 1.2650727650727651, "grad_norm": 9.183516502380371, "learning_rate": 8.302148302148303e-06, "loss": 0.4556, "step": 1217 }, { "epoch": 1.2661122661122661, "grad_norm": 6.539437770843506, "learning_rate": 8.299838299838302e-06, "loss": 0.2557, "step": 1218 }, { "epoch": 1.2671517671517671, "grad_norm": 8.801095962524414, "learning_rate": 8.297528297528298e-06, "loss": 0.604, "step": 1219 }, { "epoch": 1.2681912681912682, "grad_norm": 0.5784547328948975, "learning_rate": 8.295218295218296e-06, "loss": 0.0208, "step": 1220 }, { "epoch": 1.2692307692307692, "grad_norm": 7.860816955566406, "learning_rate": 8.292908292908294e-06, "loss": 0.329, "step": 1221 }, { "epoch": 1.2702702702702702, "grad_norm": 0.34168723225593567, "learning_rate": 8.290598290598293e-06, "loss": 0.008, "step": 1222 }, { "epoch": 1.2713097713097712, "grad_norm": 3.0554420948028564, "learning_rate": 8.288288288288289e-06, "loss": 0.0614, "step": 1223 }, { "epoch": 1.2723492723492724, "grad_norm": 13.036967277526855, "learning_rate": 8.285978285978287e-06, "loss": 0.7422, "step": 1224 }, { "epoch": 1.2733887733887734, "grad_norm": 5.609252452850342, "learning_rate": 8.283668283668286e-06, "loss": 0.2516, "step": 1225 }, { "epoch": 1.2744282744282744, "grad_norm": 10.398143768310547, "learning_rate": 8.281358281358282e-06, "loss": 0.7848, "step": 1226 }, { "epoch": 1.2754677754677755, "grad_norm": 9.209569931030273, "learning_rate": 8.27904827904828e-06, "loss": 0.1943, "step": 1227 }, { "epoch": 1.2765072765072765, "grad_norm": 7.4310197830200195, "learning_rate": 8.276738276738278e-06, "loss": 0.2437, 
"step": 1228 }, { "epoch": 1.2775467775467775, "grad_norm": 11.599946022033691, "learning_rate": 8.274428274428275e-06, "loss": 0.7414, "step": 1229 }, { "epoch": 1.2785862785862787, "grad_norm": 13.055266380310059, "learning_rate": 8.272118272118273e-06, "loss": 0.6268, "step": 1230 }, { "epoch": 1.2796257796257797, "grad_norm": 10.34594440460205, "learning_rate": 8.269808269808271e-06, "loss": 0.3585, "step": 1231 }, { "epoch": 1.2806652806652807, "grad_norm": 4.395898818969727, "learning_rate": 8.267498267498268e-06, "loss": 0.0964, "step": 1232 }, { "epoch": 1.2817047817047817, "grad_norm": 3.7805607318878174, "learning_rate": 8.265188265188266e-06, "loss": 0.1501, "step": 1233 }, { "epoch": 1.2827442827442828, "grad_norm": 8.583539009094238, "learning_rate": 8.262878262878264e-06, "loss": 0.6436, "step": 1234 }, { "epoch": 1.2837837837837838, "grad_norm": 2.4144604206085205, "learning_rate": 8.26056826056826e-06, "loss": 0.0688, "step": 1235 }, { "epoch": 1.2848232848232848, "grad_norm": 4.065729141235352, "learning_rate": 8.258258258258259e-06, "loss": 0.073, "step": 1236 }, { "epoch": 1.2858627858627858, "grad_norm": 4.701965808868408, "learning_rate": 8.255948255948257e-06, "loss": 0.1044, "step": 1237 }, { "epoch": 1.2869022869022868, "grad_norm": 3.801809787750244, "learning_rate": 8.253638253638254e-06, "loss": 0.1211, "step": 1238 }, { "epoch": 1.2879417879417878, "grad_norm": 9.999377250671387, "learning_rate": 8.251328251328252e-06, "loss": 0.5039, "step": 1239 }, { "epoch": 1.288981288981289, "grad_norm": 10.668828964233398, "learning_rate": 8.24901824901825e-06, "loss": 0.5704, "step": 1240 }, { "epoch": 1.29002079002079, "grad_norm": 7.582046985626221, "learning_rate": 8.246708246708246e-06, "loss": 0.3647, "step": 1241 }, { "epoch": 1.291060291060291, "grad_norm": 6.662896633148193, "learning_rate": 8.244398244398245e-06, "loss": 0.1706, "step": 1242 }, { "epoch": 1.292099792099792, "grad_norm": 14.700654029846191, "learning_rate": 
8.242088242088243e-06, "loss": 0.9503, "step": 1243 }, { "epoch": 1.293139293139293, "grad_norm": 0.9491019248962402, "learning_rate": 8.23977823977824e-06, "loss": 0.0185, "step": 1244 }, { "epoch": 1.2941787941787941, "grad_norm": 7.621535778045654, "learning_rate": 8.237468237468237e-06, "loss": 0.2511, "step": 1245 }, { "epoch": 1.2952182952182953, "grad_norm": 4.274621486663818, "learning_rate": 8.235158235158236e-06, "loss": 0.061, "step": 1246 }, { "epoch": 1.2962577962577964, "grad_norm": 13.053854942321777, "learning_rate": 8.232848232848234e-06, "loss": 0.5283, "step": 1247 }, { "epoch": 1.2972972972972974, "grad_norm": 13.249424934387207, "learning_rate": 8.23053823053823e-06, "loss": 0.5875, "step": 1248 }, { "epoch": 1.2983367983367984, "grad_norm": 9.532174110412598, "learning_rate": 8.228228228228229e-06, "loss": 0.231, "step": 1249 }, { "epoch": 1.2993762993762994, "grad_norm": 3.8693883419036865, "learning_rate": 8.225918225918227e-06, "loss": 0.1521, "step": 1250 }, { "epoch": 1.3004158004158004, "grad_norm": 9.420363426208496, "learning_rate": 8.223608223608223e-06, "loss": 0.405, "step": 1251 }, { "epoch": 1.3014553014553014, "grad_norm": 6.109569072723389, "learning_rate": 8.221298221298221e-06, "loss": 0.2135, "step": 1252 }, { "epoch": 1.3024948024948024, "grad_norm": 13.657364845275879, "learning_rate": 8.21898821898822e-06, "loss": 0.9535, "step": 1253 }, { "epoch": 1.3035343035343034, "grad_norm": 9.316267013549805, "learning_rate": 8.216678216678218e-06, "loss": 0.368, "step": 1254 }, { "epoch": 1.3045738045738045, "grad_norm": 5.461922645568848, "learning_rate": 8.214368214368214e-06, "loss": 0.1893, "step": 1255 }, { "epoch": 1.3056133056133057, "grad_norm": 0.023014483973383904, "learning_rate": 8.212058212058212e-06, "loss": 0.0005, "step": 1256 }, { "epoch": 1.3066528066528067, "grad_norm": 0.6040786504745483, "learning_rate": 8.20974820974821e-06, "loss": 0.0146, "step": 1257 }, { "epoch": 1.3076923076923077, "grad_norm": 
2.6426706314086914, "learning_rate": 8.207438207438209e-06, "loss": 0.0522, "step": 1258 }, { "epoch": 1.3087318087318087, "grad_norm": 0.6564608812332153, "learning_rate": 8.205128205128205e-06, "loss": 0.0132, "step": 1259 }, { "epoch": 1.3097713097713097, "grad_norm": 9.825348854064941, "learning_rate": 8.202818202818204e-06, "loss": 0.3831, "step": 1260 }, { "epoch": 1.3108108108108107, "grad_norm": 9.027199745178223, "learning_rate": 8.200508200508202e-06, "loss": 0.1866, "step": 1261 }, { "epoch": 1.311850311850312, "grad_norm": 12.532243728637695, "learning_rate": 8.198198198198198e-06, "loss": 1.3439, "step": 1262 }, { "epoch": 1.312889812889813, "grad_norm": 7.945847511291504, "learning_rate": 8.195888195888196e-06, "loss": 0.0877, "step": 1263 }, { "epoch": 1.313929313929314, "grad_norm": 9.476608276367188, "learning_rate": 8.193578193578195e-06, "loss": 0.4055, "step": 1264 }, { "epoch": 1.314968814968815, "grad_norm": 8.13533878326416, "learning_rate": 8.191268191268193e-06, "loss": 0.1765, "step": 1265 }, { "epoch": 1.316008316008316, "grad_norm": 1.300158977508545, "learning_rate": 8.18895818895819e-06, "loss": 0.0216, "step": 1266 }, { "epoch": 1.317047817047817, "grad_norm": 2.6788394451141357, "learning_rate": 8.186648186648188e-06, "loss": 0.09, "step": 1267 }, { "epoch": 1.318087318087318, "grad_norm": 4.26913595199585, "learning_rate": 8.184338184338186e-06, "loss": 0.2048, "step": 1268 }, { "epoch": 1.319126819126819, "grad_norm": 15.426655769348145, "learning_rate": 8.182028182028182e-06, "loss": 1.023, "step": 1269 }, { "epoch": 1.32016632016632, "grad_norm": 8.184285163879395, "learning_rate": 8.17971817971818e-06, "loss": 0.2651, "step": 1270 }, { "epoch": 1.321205821205821, "grad_norm": 0.1392313688993454, "learning_rate": 8.177408177408179e-06, "loss": 0.0016, "step": 1271 }, { "epoch": 1.3222453222453223, "grad_norm": 0.1190238893032074, "learning_rate": 8.175098175098177e-06, "loss": 0.0021, "step": 1272 }, { "epoch": 
1.3232848232848233, "grad_norm": 17.05913734436035, "learning_rate": 8.172788172788173e-06, "loss": 1.2166, "step": 1273 }, { "epoch": 1.3243243243243243, "grad_norm": 0.30950024724006653, "learning_rate": 8.170478170478171e-06, "loss": 0.0065, "step": 1274 }, { "epoch": 1.3253638253638254, "grad_norm": 1.1400848627090454, "learning_rate": 8.16816816816817e-06, "loss": 0.0196, "step": 1275 }, { "epoch": 1.3264033264033264, "grad_norm": 8.545634269714355, "learning_rate": 8.165858165858168e-06, "loss": 0.3026, "step": 1276 }, { "epoch": 1.3274428274428274, "grad_norm": 0.4845014810562134, "learning_rate": 8.163548163548164e-06, "loss": 0.006, "step": 1277 }, { "epoch": 1.3284823284823286, "grad_norm": 16.63973045349121, "learning_rate": 8.161238161238163e-06, "loss": 1.0234, "step": 1278 }, { "epoch": 1.3295218295218296, "grad_norm": 15.797550201416016, "learning_rate": 8.15892815892816e-06, "loss": 1.2203, "step": 1279 }, { "epoch": 1.3305613305613306, "grad_norm": 1.719501256942749, "learning_rate": 8.156618156618157e-06, "loss": 0.0417, "step": 1280 }, { "epoch": 1.3316008316008316, "grad_norm": 16.285903930664062, "learning_rate": 8.154308154308155e-06, "loss": 0.2835, "step": 1281 }, { "epoch": 1.3326403326403327, "grad_norm": 0.5948153138160706, "learning_rate": 8.151998151998154e-06, "loss": 0.0117, "step": 1282 }, { "epoch": 1.3336798336798337, "grad_norm": 13.316926956176758, "learning_rate": 8.14968814968815e-06, "loss": 0.8549, "step": 1283 }, { "epoch": 1.3347193347193347, "grad_norm": 6.364133834838867, "learning_rate": 8.147378147378148e-06, "loss": 0.3454, "step": 1284 }, { "epoch": 1.3357588357588357, "grad_norm": 8.6270170211792, "learning_rate": 8.145068145068146e-06, "loss": 0.4939, "step": 1285 }, { "epoch": 1.3367983367983367, "grad_norm": 0.4271339774131775, "learning_rate": 8.142758142758143e-06, "loss": 0.0111, "step": 1286 }, { "epoch": 1.3378378378378377, "grad_norm": 10.21527099609375, "learning_rate": 8.140448140448141e-06, "loss": 
0.5921, "step": 1287 }, { "epoch": 1.338877338877339, "grad_norm": 10.404336929321289, "learning_rate": 8.13813813813814e-06, "loss": 0.6905, "step": 1288 }, { "epoch": 1.33991683991684, "grad_norm": 1.3672646284103394, "learning_rate": 8.135828135828136e-06, "loss": 0.0269, "step": 1289 }, { "epoch": 1.340956340956341, "grad_norm": 1.9783116579055786, "learning_rate": 8.133518133518134e-06, "loss": 0.0359, "step": 1290 }, { "epoch": 1.341995841995842, "grad_norm": 4.706028938293457, "learning_rate": 8.131208131208132e-06, "loss": 0.1376, "step": 1291 }, { "epoch": 1.343035343035343, "grad_norm": 6.213561058044434, "learning_rate": 8.128898128898129e-06, "loss": 0.1551, "step": 1292 }, { "epoch": 1.344074844074844, "grad_norm": 17.572166442871094, "learning_rate": 8.126588126588127e-06, "loss": 0.1122, "step": 1293 }, { "epoch": 1.3451143451143452, "grad_norm": 9.871464729309082, "learning_rate": 8.124278124278125e-06, "loss": 0.4517, "step": 1294 }, { "epoch": 1.3461538461538463, "grad_norm": 3.3636231422424316, "learning_rate": 8.121968121968122e-06, "loss": 0.1099, "step": 1295 }, { "epoch": 1.3471933471933473, "grad_norm": 6.92306661605835, "learning_rate": 8.11965811965812e-06, "loss": 0.2374, "step": 1296 }, { "epoch": 1.3482328482328483, "grad_norm": 12.991156578063965, "learning_rate": 8.117348117348118e-06, "loss": 1.7004, "step": 1297 }, { "epoch": 1.3492723492723493, "grad_norm": 2.913350820541382, "learning_rate": 8.115038115038114e-06, "loss": 0.0435, "step": 1298 }, { "epoch": 1.3503118503118503, "grad_norm": 0.5244566798210144, "learning_rate": 8.112728112728113e-06, "loss": 0.0153, "step": 1299 }, { "epoch": 1.3513513513513513, "grad_norm": 11.311691284179688, "learning_rate": 8.110418110418111e-06, "loss": 0.1636, "step": 1300 }, { "epoch": 1.3523908523908523, "grad_norm": 11.263629913330078, "learning_rate": 8.108108108108109e-06, "loss": 0.4313, "step": 1301 }, { "epoch": 1.3534303534303533, "grad_norm": 2.7492759227752686, "learning_rate": 
8.105798105798106e-06, "loss": 0.0683, "step": 1302 }, { "epoch": 1.3544698544698544, "grad_norm": 2.5676145553588867, "learning_rate": 8.103488103488104e-06, "loss": 0.097, "step": 1303 }, { "epoch": 1.3555093555093556, "grad_norm": 3.0315756797790527, "learning_rate": 8.101178101178102e-06, "loss": 0.2299, "step": 1304 }, { "epoch": 1.3565488565488566, "grad_norm": 10.284161567687988, "learning_rate": 8.098868098868098e-06, "loss": 1.137, "step": 1305 }, { "epoch": 1.3575883575883576, "grad_norm": 0.3827531039714813, "learning_rate": 8.096558096558097e-06, "loss": 0.0108, "step": 1306 }, { "epoch": 1.3586278586278586, "grad_norm": 4.843155384063721, "learning_rate": 8.094248094248095e-06, "loss": 0.06, "step": 1307 }, { "epoch": 1.3596673596673596, "grad_norm": 13.512953758239746, "learning_rate": 8.091938091938093e-06, "loss": 1.04, "step": 1308 }, { "epoch": 1.3607068607068606, "grad_norm": 5.828279972076416, "learning_rate": 8.08962808962809e-06, "loss": 0.089, "step": 1309 }, { "epoch": 1.3617463617463619, "grad_norm": 2.431767702102661, "learning_rate": 8.087318087318088e-06, "loss": 0.0354, "step": 1310 }, { "epoch": 1.362785862785863, "grad_norm": 1.5293114185333252, "learning_rate": 8.085008085008086e-06, "loss": 0.0218, "step": 1311 }, { "epoch": 1.363825363825364, "grad_norm": 11.862726211547852, "learning_rate": 8.082698082698084e-06, "loss": 0.4311, "step": 1312 }, { "epoch": 1.364864864864865, "grad_norm": 14.451634407043457, "learning_rate": 8.08038808038808e-06, "loss": 2.1098, "step": 1313 }, { "epoch": 1.365904365904366, "grad_norm": 11.710643768310547, "learning_rate": 8.078078078078079e-06, "loss": 0.2682, "step": 1314 }, { "epoch": 1.366943866943867, "grad_norm": 1.511605143547058, "learning_rate": 8.075768075768077e-06, "loss": 0.0272, "step": 1315 }, { "epoch": 1.367983367983368, "grad_norm": 11.279335021972656, "learning_rate": 8.073458073458073e-06, "loss": 0.7856, "step": 1316 }, { "epoch": 1.369022869022869, "grad_norm": 
2.178616762161255, "learning_rate": 8.071148071148072e-06, "loss": 0.0562, "step": 1317 }, { "epoch": 1.37006237006237, "grad_norm": 1.0928305387496948, "learning_rate": 8.06883806883807e-06, "loss": 0.0292, "step": 1318 }, { "epoch": 1.371101871101871, "grad_norm": 0.34847453236579895, "learning_rate": 8.066528066528068e-06, "loss": 0.0067, "step": 1319 }, { "epoch": 1.3721413721413722, "grad_norm": 0.4622173011302948, "learning_rate": 8.064218064218065e-06, "loss": 0.0092, "step": 1320 }, { "epoch": 1.3731808731808732, "grad_norm": 0.11356835067272186, "learning_rate": 8.061908061908063e-06, "loss": 0.0021, "step": 1321 }, { "epoch": 1.3742203742203742, "grad_norm": 0.6464567184448242, "learning_rate": 8.059598059598061e-06, "loss": 0.0167, "step": 1322 }, { "epoch": 1.3752598752598753, "grad_norm": 9.651885032653809, "learning_rate": 8.057288057288057e-06, "loss": 0.2491, "step": 1323 }, { "epoch": 1.3762993762993763, "grad_norm": 3.052480936050415, "learning_rate": 8.054978054978056e-06, "loss": 0.1249, "step": 1324 }, { "epoch": 1.3773388773388773, "grad_norm": 2.6563804149627686, "learning_rate": 8.052668052668054e-06, "loss": 0.0743, "step": 1325 }, { "epoch": 1.3783783783783785, "grad_norm": 2.6560709476470947, "learning_rate": 8.050358050358052e-06, "loss": 0.0542, "step": 1326 }, { "epoch": 1.3794178794178795, "grad_norm": 7.124080657958984, "learning_rate": 8.048048048048048e-06, "loss": 0.2983, "step": 1327 }, { "epoch": 1.3804573804573805, "grad_norm": 5.484076499938965, "learning_rate": 8.045738045738047e-06, "loss": 0.0964, "step": 1328 }, { "epoch": 1.3814968814968815, "grad_norm": 4.23430061340332, "learning_rate": 8.043428043428045e-06, "loss": 0.045, "step": 1329 }, { "epoch": 1.3825363825363826, "grad_norm": 0.15918128192424774, "learning_rate": 8.041118041118043e-06, "loss": 0.0051, "step": 1330 }, { "epoch": 1.3835758835758836, "grad_norm": 9.001898765563965, "learning_rate": 8.03880803880804e-06, "loss": 0.3259, "step": 1331 }, { "epoch": 
1.3846153846153846, "grad_norm": 7.946660041809082, "learning_rate": 8.036498036498038e-06, "loss": 0.3321, "step": 1332 }, { "epoch": 1.3856548856548856, "grad_norm": 9.039274215698242, "learning_rate": 8.034188034188036e-06, "loss": 0.1995, "step": 1333 }, { "epoch": 1.3866943866943866, "grad_norm": 9.994538307189941, "learning_rate": 8.031878031878032e-06, "loss": 0.7766, "step": 1334 }, { "epoch": 1.3877338877338876, "grad_norm": 0.012579978443682194, "learning_rate": 8.02956802956803e-06, "loss": 0.0001, "step": 1335 }, { "epoch": 1.3887733887733889, "grad_norm": 10.88592529296875, "learning_rate": 8.027258027258029e-06, "loss": 1.4343, "step": 1336 }, { "epoch": 1.3898128898128899, "grad_norm": 2.3097336292266846, "learning_rate": 8.024948024948025e-06, "loss": 0.0364, "step": 1337 }, { "epoch": 1.3908523908523909, "grad_norm": 7.158543586730957, "learning_rate": 8.022638022638023e-06, "loss": 0.1027, "step": 1338 }, { "epoch": 1.3918918918918919, "grad_norm": 10.87025260925293, "learning_rate": 8.020328020328022e-06, "loss": 0.3478, "step": 1339 }, { "epoch": 1.392931392931393, "grad_norm": 0.6269477605819702, "learning_rate": 8.018018018018018e-06, "loss": 0.0067, "step": 1340 }, { "epoch": 1.393970893970894, "grad_norm": 0.879704475402832, "learning_rate": 8.015708015708016e-06, "loss": 0.0216, "step": 1341 }, { "epoch": 1.3950103950103951, "grad_norm": 0.16545157134532928, "learning_rate": 8.013398013398015e-06, "loss": 0.0041, "step": 1342 }, { "epoch": 1.3960498960498962, "grad_norm": 2.778501272201538, "learning_rate": 8.011088011088011e-06, "loss": 0.0508, "step": 1343 }, { "epoch": 1.3970893970893972, "grad_norm": 3.787515878677368, "learning_rate": 8.00877800877801e-06, "loss": 0.0563, "step": 1344 }, { "epoch": 1.3981288981288982, "grad_norm": 0.3681766390800476, "learning_rate": 8.006468006468007e-06, "loss": 0.0086, "step": 1345 }, { "epoch": 1.3991683991683992, "grad_norm": 2.6923065185546875, "learning_rate": 8.004158004158004e-06, "loss": 
0.0508, "step": 1346 }, { "epoch": 1.4002079002079002, "grad_norm": 5.058040142059326, "learning_rate": 8.001848001848002e-06, "loss": 0.0971, "step": 1347 }, { "epoch": 1.4012474012474012, "grad_norm": 5.924242973327637, "learning_rate": 7.999537999538e-06, "loss": 0.4377, "step": 1348 }, { "epoch": 1.4022869022869022, "grad_norm": 0.38952162861824036, "learning_rate": 7.997227997227997e-06, "loss": 0.0083, "step": 1349 }, { "epoch": 1.4033264033264032, "grad_norm": 11.313742637634277, "learning_rate": 7.994917994917995e-06, "loss": 0.9199, "step": 1350 }, { "epoch": 1.4043659043659042, "grad_norm": 11.081106185913086, "learning_rate": 7.992607992607993e-06, "loss": 0.5341, "step": 1351 }, { "epoch": 1.4054054054054055, "grad_norm": 18.970726013183594, "learning_rate": 7.99029799029799e-06, "loss": 1.0361, "step": 1352 }, { "epoch": 1.4064449064449065, "grad_norm": 8.993247985839844, "learning_rate": 7.987987987987988e-06, "loss": 0.7547, "step": 1353 }, { "epoch": 1.4074844074844075, "grad_norm": 0.08616983145475388, "learning_rate": 7.985677985677986e-06, "loss": 0.0015, "step": 1354 }, { "epoch": 1.4085239085239085, "grad_norm": 9.323349952697754, "learning_rate": 7.983367983367984e-06, "loss": 0.2254, "step": 1355 }, { "epoch": 1.4095634095634095, "grad_norm": 1.1458944082260132, "learning_rate": 7.98105798105798e-06, "loss": 0.051, "step": 1356 }, { "epoch": 1.4106029106029105, "grad_norm": 22.63304901123047, "learning_rate": 7.978747978747979e-06, "loss": 0.6127, "step": 1357 }, { "epoch": 1.4116424116424118, "grad_norm": 2.062016725540161, "learning_rate": 7.976437976437977e-06, "loss": 0.0247, "step": 1358 }, { "epoch": 1.4126819126819128, "grad_norm": 1.6762837171554565, "learning_rate": 7.974127974127974e-06, "loss": 0.0488, "step": 1359 }, { "epoch": 1.4137214137214138, "grad_norm": 2.0908496379852295, "learning_rate": 7.971817971817972e-06, "loss": 0.0375, "step": 1360 }, { "epoch": 1.4147609147609148, "grad_norm": 1.9176737070083618, "learning_rate": 
7.96950796950797e-06, "loss": 0.0213, "step": 1361 }, { "epoch": 1.4158004158004158, "grad_norm": 8.565489768981934, "learning_rate": 7.967197967197968e-06, "loss": 0.3279, "step": 1362 }, { "epoch": 1.4168399168399168, "grad_norm": 6.474407196044922, "learning_rate": 7.964887964887965e-06, "loss": 0.1724, "step": 1363 }, { "epoch": 1.4178794178794178, "grad_norm": 10.128867149353027, "learning_rate": 7.962577962577963e-06, "loss": 0.3057, "step": 1364 }, { "epoch": 1.4189189189189189, "grad_norm": 1.244747281074524, "learning_rate": 7.960267960267961e-06, "loss": 0.0178, "step": 1365 }, { "epoch": 1.4199584199584199, "grad_norm": 10.41561508178711, "learning_rate": 7.95795795795796e-06, "loss": 0.3249, "step": 1366 }, { "epoch": 1.4209979209979209, "grad_norm": 6.584688186645508, "learning_rate": 7.955647955647956e-06, "loss": 0.1788, "step": 1367 }, { "epoch": 1.4220374220374221, "grad_norm": 0.3154916763305664, "learning_rate": 7.953337953337954e-06, "loss": 0.0071, "step": 1368 }, { "epoch": 1.4230769230769231, "grad_norm": 7.990124225616455, "learning_rate": 7.951027951027952e-06, "loss": 0.2696, "step": 1369 }, { "epoch": 1.4241164241164241, "grad_norm": 0.6271470189094543, "learning_rate": 7.948717948717949e-06, "loss": 0.0121, "step": 1370 }, { "epoch": 1.4251559251559252, "grad_norm": 11.38403606414795, "learning_rate": 7.946407946407947e-06, "loss": 0.4569, "step": 1371 }, { "epoch": 1.4261954261954262, "grad_norm": 0.29966673254966736, "learning_rate": 7.944097944097945e-06, "loss": 0.0057, "step": 1372 }, { "epoch": 1.4272349272349272, "grad_norm": 3.51631498336792, "learning_rate": 7.941787941787943e-06, "loss": 0.0646, "step": 1373 }, { "epoch": 1.4282744282744284, "grad_norm": 0.11474553495645523, "learning_rate": 7.93947793947794e-06, "loss": 0.0014, "step": 1374 }, { "epoch": 1.4293139293139294, "grad_norm": 0.058466557413339615, "learning_rate": 7.937167937167938e-06, "loss": 0.0007, "step": 1375 }, { "epoch": 1.4303534303534304, "grad_norm": 
2.909372568130493, "learning_rate": 7.934857934857936e-06, "loss": 0.0337, "step": 1376 }, { "epoch": 1.4313929313929314, "grad_norm": 2.4320931434631348, "learning_rate": 7.932547932547933e-06, "loss": 0.0398, "step": 1377 }, { "epoch": 1.4324324324324325, "grad_norm": 6.761545658111572, "learning_rate": 7.93023793023793e-06, "loss": 0.2224, "step": 1378 }, { "epoch": 1.4334719334719335, "grad_norm": 3.9421579837799072, "learning_rate": 7.927927927927929e-06, "loss": 0.0674, "step": 1379 }, { "epoch": 1.4345114345114345, "grad_norm": 3.7583696842193604, "learning_rate": 7.925617925617927e-06, "loss": 0.1516, "step": 1380 }, { "epoch": 1.4355509355509355, "grad_norm": 4.613794326782227, "learning_rate": 7.923307923307924e-06, "loss": 0.0488, "step": 1381 }, { "epoch": 1.4365904365904365, "grad_norm": 7.776555061340332, "learning_rate": 7.920997920997922e-06, "loss": 0.4145, "step": 1382 }, { "epoch": 1.4376299376299375, "grad_norm": 10.442293167114258, "learning_rate": 7.91868791868792e-06, "loss": 0.8719, "step": 1383 }, { "epoch": 1.4386694386694387, "grad_norm": 0.04214008152484894, "learning_rate": 7.916377916377918e-06, "loss": 0.0009, "step": 1384 }, { "epoch": 1.4397089397089398, "grad_norm": 0.6979100704193115, "learning_rate": 7.914067914067915e-06, "loss": 0.0195, "step": 1385 }, { "epoch": 1.4407484407484408, "grad_norm": 6.32226037979126, "learning_rate": 7.911757911757913e-06, "loss": 0.216, "step": 1386 }, { "epoch": 1.4417879417879418, "grad_norm": 5.914670944213867, "learning_rate": 7.909447909447911e-06, "loss": 0.1529, "step": 1387 }, { "epoch": 1.4428274428274428, "grad_norm": 2.56058669090271, "learning_rate": 7.907137907137908e-06, "loss": 0.0471, "step": 1388 }, { "epoch": 1.4438669438669438, "grad_norm": 12.25080680847168, "learning_rate": 7.904827904827906e-06, "loss": 0.4424, "step": 1389 }, { "epoch": 1.444906444906445, "grad_norm": 5.598354339599609, "learning_rate": 7.902517902517904e-06, "loss": 0.1845, "step": 1390 }, { "epoch": 
1.445945945945946, "grad_norm": 0.8475781083106995, "learning_rate": 7.9002079002079e-06, "loss": 0.0107, "step": 1391 }, { "epoch": 1.446985446985447, "grad_norm": 12.017598152160645, "learning_rate": 7.897897897897899e-06, "loss": 0.9019, "step": 1392 }, { "epoch": 1.448024948024948, "grad_norm": 26.395370483398438, "learning_rate": 7.895587895587897e-06, "loss": 0.6365, "step": 1393 }, { "epoch": 1.449064449064449, "grad_norm": 8.331962585449219, "learning_rate": 7.893277893277893e-06, "loss": 0.2966, "step": 1394 }, { "epoch": 1.45010395010395, "grad_norm": 0.055291905999183655, "learning_rate": 7.890967890967892e-06, "loss": 0.0011, "step": 1395 }, { "epoch": 1.4511434511434511, "grad_norm": 15.261618614196777, "learning_rate": 7.88865788865789e-06, "loss": 0.7521, "step": 1396 }, { "epoch": 1.4521829521829521, "grad_norm": 0.3626764416694641, "learning_rate": 7.886347886347886e-06, "loss": 0.0083, "step": 1397 }, { "epoch": 1.4532224532224531, "grad_norm": 8.048782348632812, "learning_rate": 7.884037884037884e-06, "loss": 0.502, "step": 1398 }, { "epoch": 1.4542619542619541, "grad_norm": 0.0779711902141571, "learning_rate": 7.881727881727883e-06, "loss": 0.002, "step": 1399 }, { "epoch": 1.4553014553014554, "grad_norm": 9.601903915405273, "learning_rate": 7.879417879417879e-06, "loss": 0.1312, "step": 1400 }, { "epoch": 1.4563409563409564, "grad_norm": 9.24765682220459, "learning_rate": 7.877107877107877e-06, "loss": 0.3113, "step": 1401 }, { "epoch": 1.4573804573804574, "grad_norm": 13.755073547363281, "learning_rate": 7.874797874797875e-06, "loss": 0.5496, "step": 1402 }, { "epoch": 1.4584199584199584, "grad_norm": 6.341691493988037, "learning_rate": 7.872487872487872e-06, "loss": 0.0982, "step": 1403 }, { "epoch": 1.4594594594594594, "grad_norm": 19.51810646057129, "learning_rate": 7.87017787017787e-06, "loss": 0.5321, "step": 1404 }, { "epoch": 1.4604989604989604, "grad_norm": 1.2743085622787476, "learning_rate": 7.867867867867868e-06, "loss": 0.0373, 
"step": 1405 }, { "epoch": 1.4615384615384617, "grad_norm": 1.669488549232483, "learning_rate": 7.865557865557867e-06, "loss": 0.0329, "step": 1406 }, { "epoch": 1.4625779625779627, "grad_norm": 11.852375030517578, "learning_rate": 7.863247863247863e-06, "loss": 0.8486, "step": 1407 }, { "epoch": 1.4636174636174637, "grad_norm": 4.6349568367004395, "learning_rate": 7.860937860937861e-06, "loss": 0.1185, "step": 1408 }, { "epoch": 1.4646569646569647, "grad_norm": 3.4211432933807373, "learning_rate": 7.85862785862786e-06, "loss": 0.1006, "step": 1409 }, { "epoch": 1.4656964656964657, "grad_norm": 3.819972038269043, "learning_rate": 7.856317856317856e-06, "loss": 0.0653, "step": 1410 }, { "epoch": 1.4667359667359667, "grad_norm": 2.380999803543091, "learning_rate": 7.854007854007854e-06, "loss": 0.0969, "step": 1411 }, { "epoch": 1.4677754677754677, "grad_norm": 20.77876853942871, "learning_rate": 7.851697851697852e-06, "loss": 3.1078, "step": 1412 }, { "epoch": 1.4688149688149688, "grad_norm": 7.184508800506592, "learning_rate": 7.849387849387849e-06, "loss": 0.1976, "step": 1413 }, { "epoch": 1.4698544698544698, "grad_norm": 9.331084251403809, "learning_rate": 7.847077847077847e-06, "loss": 0.3223, "step": 1414 }, { "epoch": 1.4708939708939708, "grad_norm": 4.02923583984375, "learning_rate": 7.844767844767845e-06, "loss": 0.1476, "step": 1415 }, { "epoch": 1.471933471933472, "grad_norm": 11.09693717956543, "learning_rate": 7.842457842457843e-06, "loss": 0.6451, "step": 1416 }, { "epoch": 1.472972972972973, "grad_norm": 0.6593981385231018, "learning_rate": 7.84014784014784e-06, "loss": 0.0162, "step": 1417 }, { "epoch": 1.474012474012474, "grad_norm": 7.206985950469971, "learning_rate": 7.837837837837838e-06, "loss": 0.1954, "step": 1418 }, { "epoch": 1.475051975051975, "grad_norm": 0.5606830716133118, "learning_rate": 7.835527835527836e-06, "loss": 0.008, "step": 1419 }, { "epoch": 1.476091476091476, "grad_norm": 17.2354679107666, "learning_rate": 
7.833217833217834e-06, "loss": 1.3854, "step": 1420 }, { "epoch": 1.477130977130977, "grad_norm": 0.23821276426315308, "learning_rate": 7.830907830907831e-06, "loss": 0.0042, "step": 1421 }, { "epoch": 1.4781704781704783, "grad_norm": 6.488010406494141, "learning_rate": 7.828597828597829e-06, "loss": 0.1319, "step": 1422 }, { "epoch": 1.4792099792099793, "grad_norm": 20.678817749023438, "learning_rate": 7.826287826287827e-06, "loss": 0.5714, "step": 1423 }, { "epoch": 1.4802494802494803, "grad_norm": 1.193542242050171, "learning_rate": 7.823977823977824e-06, "loss": 0.025, "step": 1424 }, { "epoch": 1.4812889812889813, "grad_norm": 5.369607925415039, "learning_rate": 7.821667821667822e-06, "loss": 0.1313, "step": 1425 }, { "epoch": 1.4823284823284824, "grad_norm": 0.40537557005882263, "learning_rate": 7.81935781935782e-06, "loss": 0.008, "step": 1426 }, { "epoch": 1.4833679833679834, "grad_norm": 13.514300346374512, "learning_rate": 7.817047817047818e-06, "loss": 0.5955, "step": 1427 }, { "epoch": 1.4844074844074844, "grad_norm": 4.4053544998168945, "learning_rate": 7.814737814737815e-06, "loss": 0.1025, "step": 1428 }, { "epoch": 1.4854469854469854, "grad_norm": 2.3163092136383057, "learning_rate": 7.812427812427813e-06, "loss": 0.0969, "step": 1429 }, { "epoch": 1.4864864864864864, "grad_norm": 3.30670428276062, "learning_rate": 7.810117810117811e-06, "loss": 0.0597, "step": 1430 }, { "epoch": 1.4875259875259874, "grad_norm": 13.37387466430664, "learning_rate": 7.807807807807808e-06, "loss": 1.9395, "step": 1431 }, { "epoch": 1.4885654885654886, "grad_norm": 2.1618831157684326, "learning_rate": 7.805497805497806e-06, "loss": 0.0635, "step": 1432 }, { "epoch": 1.4896049896049897, "grad_norm": 0.022710174322128296, "learning_rate": 7.803187803187804e-06, "loss": 0.0005, "step": 1433 }, { "epoch": 1.4906444906444907, "grad_norm": 0.03452178090810776, "learning_rate": 7.800877800877802e-06, "loss": 0.0007, "step": 1434 }, { "epoch": 1.4916839916839917, "grad_norm": 
6.462287425994873, "learning_rate": 7.798567798567799e-06, "loss": 0.3239, "step": 1435 }, { "epoch": 1.4927234927234927, "grad_norm": 6.634264945983887, "learning_rate": 7.796257796257797e-06, "loss": 0.1728, "step": 1436 }, { "epoch": 1.4937629937629937, "grad_norm": 9.087811470031738, "learning_rate": 7.793947793947795e-06, "loss": 0.8339, "step": 1437 }, { "epoch": 1.494802494802495, "grad_norm": 4.972245693206787, "learning_rate": 7.791637791637793e-06, "loss": 0.1508, "step": 1438 }, { "epoch": 1.495841995841996, "grad_norm": 9.671388626098633, "learning_rate": 7.78932778932779e-06, "loss": 0.167, "step": 1439 }, { "epoch": 1.496881496881497, "grad_norm": 5.96663236618042, "learning_rate": 7.787017787017788e-06, "loss": 0.2168, "step": 1440 }, { "epoch": 1.497920997920998, "grad_norm": 16.127059936523438, "learning_rate": 7.784707784707786e-06, "loss": 2.7398, "step": 1441 }, { "epoch": 1.498960498960499, "grad_norm": 0.006674590986222029, "learning_rate": 7.782397782397783e-06, "loss": 0.0001, "step": 1442 }, { "epoch": 1.5, "grad_norm": 9.377375602722168, "learning_rate": 7.780087780087781e-06, "loss": 0.6852, "step": 1443 }, { "epoch": 1.501039501039501, "grad_norm": 7.858187675476074, "learning_rate": 7.77777777777778e-06, "loss": 0.405, "step": 1444 }, { "epoch": 1.502079002079002, "grad_norm": 7.439113140106201, "learning_rate": 7.775467775467777e-06, "loss": 0.3658, "step": 1445 }, { "epoch": 1.503118503118503, "grad_norm": 6.521304130554199, "learning_rate": 7.773157773157774e-06, "loss": 0.4594, "step": 1446 }, { "epoch": 1.504158004158004, "grad_norm": 13.938838958740234, "learning_rate": 7.770847770847772e-06, "loss": 0.2794, "step": 1447 }, { "epoch": 1.505197505197505, "grad_norm": 8.016144752502441, "learning_rate": 7.76853776853777e-06, "loss": 0.306, "step": 1448 }, { "epoch": 1.506237006237006, "grad_norm": 10.953642845153809, "learning_rate": 7.766227766227767e-06, "loss": 0.3486, "step": 1449 }, { "epoch": 1.5072765072765073, "grad_norm": 
0.8176953196525574, "learning_rate": 7.763917763917765e-06, "loss": 0.0171, "step": 1450 }, { "epoch": 1.5083160083160083, "grad_norm": 9.059322357177734, "learning_rate": 7.761607761607763e-06, "loss": 0.1213, "step": 1451 }, { "epoch": 1.5093555093555093, "grad_norm": 3.3097989559173584, "learning_rate": 7.75929775929776e-06, "loss": 0.1607, "step": 1452 }, { "epoch": 1.5103950103950103, "grad_norm": 1.459271788597107, "learning_rate": 7.756987756987758e-06, "loss": 0.0208, "step": 1453 }, { "epoch": 1.5114345114345116, "grad_norm": 2.3589069843292236, "learning_rate": 7.754677754677756e-06, "loss": 0.0431, "step": 1454 }, { "epoch": 1.5124740124740126, "grad_norm": 0.13137201964855194, "learning_rate": 7.752367752367752e-06, "loss": 0.0036, "step": 1455 }, { "epoch": 1.5135135135135136, "grad_norm": 9.798415184020996, "learning_rate": 7.75005775005775e-06, "loss": 0.3902, "step": 1456 }, { "epoch": 1.5145530145530146, "grad_norm": 0.7354431748390198, "learning_rate": 7.747747747747749e-06, "loss": 0.0218, "step": 1457 }, { "epoch": 1.5155925155925156, "grad_norm": 6.521695137023926, "learning_rate": 7.745437745437745e-06, "loss": 0.2193, "step": 1458 }, { "epoch": 1.5166320166320166, "grad_norm": 0.0805174857378006, "learning_rate": 7.743127743127744e-06, "loss": 0.0023, "step": 1459 }, { "epoch": 1.5176715176715176, "grad_norm": 3.0972719192504883, "learning_rate": 7.740817740817742e-06, "loss": 0.0756, "step": 1460 }, { "epoch": 1.5187110187110187, "grad_norm": 9.875079154968262, "learning_rate": 7.738507738507738e-06, "loss": 0.8773, "step": 1461 }, { "epoch": 1.5197505197505197, "grad_norm": 3.0031330585479736, "learning_rate": 7.736197736197736e-06, "loss": 0.0603, "step": 1462 }, { "epoch": 1.5207900207900207, "grad_norm": 14.723185539245605, "learning_rate": 7.733887733887735e-06, "loss": 1.6017, "step": 1463 }, { "epoch": 1.5218295218295217, "grad_norm": 0.344759076833725, "learning_rate": 7.731577731577731e-06, "loss": 0.0105, "step": 1464 }, { "epoch": 
1.5228690228690227, "grad_norm": 5.559858322143555, "learning_rate": 7.72926772926773e-06, "loss": 0.2469, "step": 1465 }, { "epoch": 1.523908523908524, "grad_norm": 13.323915481567383, "learning_rate": 7.726957726957728e-06, "loss": 1.2224, "step": 1466 }, { "epoch": 1.524948024948025, "grad_norm": 0.22189731895923615, "learning_rate": 7.724647724647726e-06, "loss": 0.0073, "step": 1467 }, { "epoch": 1.525987525987526, "grad_norm": 0.010859851725399494, "learning_rate": 7.722337722337722e-06, "loss": 0.0003, "step": 1468 }, { "epoch": 1.527027027027027, "grad_norm": 0.7272409796714783, "learning_rate": 7.72002772002772e-06, "loss": 0.0224, "step": 1469 }, { "epoch": 1.5280665280665282, "grad_norm": 5.3687052726745605, "learning_rate": 7.717717717717719e-06, "loss": 0.4379, "step": 1470 }, { "epoch": 1.5291060291060292, "grad_norm": 12.218472480773926, "learning_rate": 7.715407715407715e-06, "loss": 1.353, "step": 1471 }, { "epoch": 1.5301455301455302, "grad_norm": 2.5254576206207275, "learning_rate": 7.713097713097713e-06, "loss": 0.0473, "step": 1472 }, { "epoch": 1.5311850311850312, "grad_norm": 1.6205060482025146, "learning_rate": 7.710787710787711e-06, "loss": 0.0293, "step": 1473 }, { "epoch": 1.5322245322245323, "grad_norm": 7.083399772644043, "learning_rate": 7.70847770847771e-06, "loss": 0.1893, "step": 1474 }, { "epoch": 1.5332640332640333, "grad_norm": 0.48890095949172974, "learning_rate": 7.706167706167706e-06, "loss": 0.0165, "step": 1475 }, { "epoch": 1.5343035343035343, "grad_norm": 13.486111640930176, "learning_rate": 7.703857703857704e-06, "loss": 0.5817, "step": 1476 }, { "epoch": 1.5353430353430353, "grad_norm": 8.082154273986816, "learning_rate": 7.701547701547703e-06, "loss": 0.1659, "step": 1477 }, { "epoch": 1.5363825363825363, "grad_norm": 3.9634504318237305, "learning_rate": 7.699237699237699e-06, "loss": 0.1039, "step": 1478 }, { "epoch": 1.5374220374220373, "grad_norm": 0.2630949318408966, "learning_rate": 7.696927696927697e-06, "loss": 
0.0071, "step": 1479 }, { "epoch": 1.5384615384615383, "grad_norm": 8.920835494995117, "learning_rate": 7.694617694617695e-06, "loss": 0.2141, "step": 1480 }, { "epoch": 1.5395010395010393, "grad_norm": 8.480643272399902, "learning_rate": 7.692307692307694e-06, "loss": 0.2279, "step": 1481 }, { "epoch": 1.5405405405405406, "grad_norm": 8.147748947143555, "learning_rate": 7.68999768999769e-06, "loss": 0.2301, "step": 1482 }, { "epoch": 1.5415800415800416, "grad_norm": 9.60760498046875, "learning_rate": 7.687687687687688e-06, "loss": 0.0828, "step": 1483 }, { "epoch": 1.5426195426195426, "grad_norm": 0.3811738193035126, "learning_rate": 7.685377685377686e-06, "loss": 0.0085, "step": 1484 }, { "epoch": 1.5436590436590436, "grad_norm": 0.14392603933811188, "learning_rate": 7.683067683067685e-06, "loss": 0.0029, "step": 1485 }, { "epoch": 1.5446985446985448, "grad_norm": 3.819782018661499, "learning_rate": 7.680757680757681e-06, "loss": 0.2339, "step": 1486 }, { "epoch": 1.5457380457380459, "grad_norm": 3.4342427253723145, "learning_rate": 7.67844767844768e-06, "loss": 0.0857, "step": 1487 }, { "epoch": 1.5467775467775469, "grad_norm": 11.615262031555176, "learning_rate": 7.676137676137678e-06, "loss": 1.1684, "step": 1488 }, { "epoch": 1.5478170478170479, "grad_norm": 2.01460599899292, "learning_rate": 7.673827673827674e-06, "loss": 0.04, "step": 1489 }, { "epoch": 1.5488565488565489, "grad_norm": 0.06286811083555222, "learning_rate": 7.671517671517672e-06, "loss": 0.0011, "step": 1490 }, { "epoch": 1.54989604989605, "grad_norm": 12.91536808013916, "learning_rate": 7.66920766920767e-06, "loss": 0.4833, "step": 1491 }, { "epoch": 1.550935550935551, "grad_norm": 0.4687879979610443, "learning_rate": 7.666897666897669e-06, "loss": 0.0058, "step": 1492 }, { "epoch": 1.551975051975052, "grad_norm": 3.517775774002075, "learning_rate": 7.664587664587665e-06, "loss": 0.1211, "step": 1493 }, { "epoch": 1.553014553014553, "grad_norm": 6.1218719482421875, "learning_rate": 
7.662277662277663e-06, "loss": 0.1487, "step": 1494 }, { "epoch": 1.554054054054054, "grad_norm": 4.684225559234619, "learning_rate": 7.659967659967661e-06, "loss": 0.1004, "step": 1495 }, { "epoch": 1.555093555093555, "grad_norm": 4.690369606018066, "learning_rate": 7.657657657657658e-06, "loss": 0.2011, "step": 1496 }, { "epoch": 1.556133056133056, "grad_norm": 7.243317127227783, "learning_rate": 7.655347655347656e-06, "loss": 0.1644, "step": 1497 }, { "epoch": 1.5571725571725572, "grad_norm": 4.194143772125244, "learning_rate": 7.653037653037654e-06, "loss": 0.1912, "step": 1498 }, { "epoch": 1.5582120582120582, "grad_norm": 3.54541277885437, "learning_rate": 7.650727650727653e-06, "loss": 0.0766, "step": 1499 }, { "epoch": 1.5592515592515592, "grad_norm": 5.809798240661621, "learning_rate": 7.648417648417649e-06, "loss": 0.1385, "step": 1500 }, { "epoch": 1.5602910602910602, "grad_norm": 0.055198103189468384, "learning_rate": 7.646107646107647e-06, "loss": 0.0016, "step": 1501 }, { "epoch": 1.5613305613305615, "grad_norm": 0.09694336354732513, "learning_rate": 7.643797643797645e-06, "loss": 0.0024, "step": 1502 }, { "epoch": 1.5623700623700625, "grad_norm": 6.44903564453125, "learning_rate": 7.641487641487642e-06, "loss": 0.3952, "step": 1503 }, { "epoch": 1.5634095634095635, "grad_norm": 0.7087557315826416, "learning_rate": 7.63917763917764e-06, "loss": 0.0143, "step": 1504 }, { "epoch": 1.5644490644490645, "grad_norm": 8.263898849487305, "learning_rate": 7.636867636867638e-06, "loss": 0.5762, "step": 1505 }, { "epoch": 1.5654885654885655, "grad_norm": 2.20043683052063, "learning_rate": 7.634557634557635e-06, "loss": 0.0495, "step": 1506 }, { "epoch": 1.5665280665280665, "grad_norm": 4.385960578918457, "learning_rate": 7.632247632247633e-06, "loss": 0.0778, "step": 1507 }, { "epoch": 1.5675675675675675, "grad_norm": 0.7370150685310364, "learning_rate": 7.629937629937631e-06, "loss": 0.0202, "step": 1508 }, { "epoch": 1.5686070686070686, "grad_norm": 
0.4597228467464447, "learning_rate": 7.6276276276276285e-06, "loss": 0.0069, "step": 1509 }, { "epoch": 1.5696465696465696, "grad_norm": 4.463411331176758, "learning_rate": 7.625317625317627e-06, "loss": 0.0576, "step": 1510 }, { "epoch": 1.5706860706860706, "grad_norm": 10.788517951965332, "learning_rate": 7.623007623007623e-06, "loss": 0.7124, "step": 1511 }, { "epoch": 1.5717255717255716, "grad_norm": 5.738900184631348, "learning_rate": 7.620697620697621e-06, "loss": 0.2167, "step": 1512 }, { "epoch": 1.5727650727650726, "grad_norm": 10.37417221069336, "learning_rate": 7.61838761838762e-06, "loss": 0.4828, "step": 1513 }, { "epoch": 1.5738045738045738, "grad_norm": 8.865348815917969, "learning_rate": 7.616077616077616e-06, "loss": 0.7257, "step": 1514 }, { "epoch": 1.5748440748440748, "grad_norm": 1.017466425895691, "learning_rate": 7.613767613767614e-06, "loss": 0.0313, "step": 1515 }, { "epoch": 1.5758835758835759, "grad_norm": 0.827038586139679, "learning_rate": 7.6114576114576125e-06, "loss": 0.028, "step": 1516 }, { "epoch": 1.5769230769230769, "grad_norm": 8.467267990112305, "learning_rate": 7.60914760914761e-06, "loss": 0.7333, "step": 1517 }, { "epoch": 1.577962577962578, "grad_norm": 4.1905646324157715, "learning_rate": 7.606837606837607e-06, "loss": 0.1714, "step": 1518 }, { "epoch": 1.5790020790020791, "grad_norm": 1.6618033647537231, "learning_rate": 7.604527604527605e-06, "loss": 0.0323, "step": 1519 }, { "epoch": 1.5800415800415801, "grad_norm": 2.4269134998321533, "learning_rate": 7.602217602217603e-06, "loss": 0.0614, "step": 1520 }, { "epoch": 1.5810810810810811, "grad_norm": 10.702552795410156, "learning_rate": 7.599907599907601e-06, "loss": 0.8274, "step": 1521 }, { "epoch": 1.5821205821205822, "grad_norm": 9.825342178344727, "learning_rate": 7.597597597597598e-06, "loss": 2.324, "step": 1522 }, { "epoch": 1.5831600831600832, "grad_norm": 2.365910291671753, "learning_rate": 7.5952875952875956e-06, "loss": 0.0784, "step": 1523 }, { "epoch": 
1.5841995841995842, "grad_norm": 0.04099468141794205, "learning_rate": 7.592977592977594e-06, "loss": 0.0007, "step": 1524 }, { "epoch": 1.5852390852390852, "grad_norm": 11.147265434265137, "learning_rate": 7.590667590667591e-06, "loss": 0.3298, "step": 1525 }, { "epoch": 1.5862785862785862, "grad_norm": 0.05397927388548851, "learning_rate": 7.5883575883575885e-06, "loss": 0.001, "step": 1526 }, { "epoch": 1.5873180873180872, "grad_norm": 2.043637990951538, "learning_rate": 7.586047586047587e-06, "loss": 0.0426, "step": 1527 }, { "epoch": 1.5883575883575882, "grad_norm": 0.0009239251958206296, "learning_rate": 7.583737583737585e-06, "loss": 0.0, "step": 1528 }, { "epoch": 1.5893970893970892, "grad_norm": 0.13833796977996826, "learning_rate": 7.581427581427581e-06, "loss": 0.0022, "step": 1529 }, { "epoch": 1.5904365904365905, "grad_norm": 9.737223625183105, "learning_rate": 7.5791175791175795e-06, "loss": 0.4196, "step": 1530 }, { "epoch": 1.5914760914760915, "grad_norm": 4.808546543121338, "learning_rate": 7.576807576807578e-06, "loss": 0.1391, "step": 1531 }, { "epoch": 1.5925155925155925, "grad_norm": 7.235491752624512, "learning_rate": 7.574497574497574e-06, "loss": 0.1397, "step": 1532 }, { "epoch": 1.5935550935550935, "grad_norm": 0.9746120572090149, "learning_rate": 7.572187572187572e-06, "loss": 0.0173, "step": 1533 }, { "epoch": 1.5945945945945947, "grad_norm": 14.863147735595703, "learning_rate": 7.569877569877571e-06, "loss": 1.9486, "step": 1534 }, { "epoch": 1.5956340956340958, "grad_norm": 11.282873153686523, "learning_rate": 7.567567567567569e-06, "loss": 0.6396, "step": 1535 }, { "epoch": 1.5966735966735968, "grad_norm": 0.25318750739097595, "learning_rate": 7.565257565257565e-06, "loss": 0.0038, "step": 1536 }, { "epoch": 1.5977130977130978, "grad_norm": 1.9883997440338135, "learning_rate": 7.5629475629475635e-06, "loss": 0.0468, "step": 1537 }, { "epoch": 1.5987525987525988, "grad_norm": 6.504814147949219, "learning_rate": 7.560637560637562e-06, 
"loss": 0.3667, "step": 1538 }, { "epoch": 1.5997920997920998, "grad_norm": 2.2447383403778076, "learning_rate": 7.55832755832756e-06, "loss": 0.0298, "step": 1539 }, { "epoch": 1.6008316008316008, "grad_norm": 0.31640592217445374, "learning_rate": 7.556017556017556e-06, "loss": 0.0111, "step": 1540 }, { "epoch": 1.6018711018711018, "grad_norm": 7.6291399002075195, "learning_rate": 7.5537075537075545e-06, "loss": 0.1263, "step": 1541 }, { "epoch": 1.6029106029106028, "grad_norm": 3.217961549758911, "learning_rate": 7.551397551397553e-06, "loss": 0.1511, "step": 1542 }, { "epoch": 1.6039501039501038, "grad_norm": 2.8815371990203857, "learning_rate": 7.549087549087549e-06, "loss": 0.0455, "step": 1543 }, { "epoch": 1.6049896049896049, "grad_norm": 1.2733142375946045, "learning_rate": 7.546777546777547e-06, "loss": 0.0268, "step": 1544 }, { "epoch": 1.6060291060291059, "grad_norm": 0.5411023497581482, "learning_rate": 7.544467544467546e-06, "loss": 0.0096, "step": 1545 }, { "epoch": 1.607068607068607, "grad_norm": 4.737339496612549, "learning_rate": 7.542157542157543e-06, "loss": 0.1374, "step": 1546 }, { "epoch": 1.6081081081081081, "grad_norm": 0.9114570021629333, "learning_rate": 7.53984753984754e-06, "loss": 0.0267, "step": 1547 }, { "epoch": 1.6091476091476091, "grad_norm": 0.3017042279243469, "learning_rate": 7.5375375375375385e-06, "loss": 0.0069, "step": 1548 }, { "epoch": 1.6101871101871101, "grad_norm": 8.151854515075684, "learning_rate": 7.535227535227536e-06, "loss": 0.2909, "step": 1549 }, { "epoch": 1.6112266112266114, "grad_norm": 0.5599004626274109, "learning_rate": 7.532917532917533e-06, "loss": 0.0167, "step": 1550 }, { "epoch": 1.6122661122661124, "grad_norm": 4.013564109802246, "learning_rate": 7.530607530607531e-06, "loss": 0.1025, "step": 1551 }, { "epoch": 1.6133056133056134, "grad_norm": 6.159677028656006, "learning_rate": 7.528297528297529e-06, "loss": 0.087, "step": 1552 }, { "epoch": 1.6143451143451144, "grad_norm": 0.3326902389526367, 
"learning_rate": 7.525987525987527e-06, "loss": 0.0077, "step": 1553 }, { "epoch": 1.6153846153846154, "grad_norm": 13.508583068847656, "learning_rate": 7.523677523677524e-06, "loss": 1.1188, "step": 1554 }, { "epoch": 1.6164241164241164, "grad_norm": 9.111117362976074, "learning_rate": 7.521367521367522e-06, "loss": 0.177, "step": 1555 }, { "epoch": 1.6174636174636174, "grad_norm": 0.7300674319267273, "learning_rate": 7.51905751905752e-06, "loss": 0.0162, "step": 1556 }, { "epoch": 1.6185031185031185, "grad_norm": 6.406063556671143, "learning_rate": 7.516747516747518e-06, "loss": 0.1966, "step": 1557 }, { "epoch": 1.6195426195426195, "grad_norm": 12.330337524414062, "learning_rate": 7.5144375144375145e-06, "loss": 2.1292, "step": 1558 }, { "epoch": 1.6205821205821205, "grad_norm": 1.7357144355773926, "learning_rate": 7.512127512127513e-06, "loss": 0.0297, "step": 1559 }, { "epoch": 1.6216216216216215, "grad_norm": 3.0343379974365234, "learning_rate": 7.509817509817511e-06, "loss": 0.0694, "step": 1560 }, { "epoch": 1.6226611226611225, "grad_norm": 0.01201736181974411, "learning_rate": 7.507507507507507e-06, "loss": 0.0003, "step": 1561 }, { "epoch": 1.6237006237006237, "grad_norm": 21.220945358276367, "learning_rate": 7.5051975051975055e-06, "loss": 2.9917, "step": 1562 }, { "epoch": 1.6247401247401247, "grad_norm": 6.297361850738525, "learning_rate": 7.502887502887504e-06, "loss": 0.1415, "step": 1563 }, { "epoch": 1.6257796257796258, "grad_norm": 8.065841674804688, "learning_rate": 7.500577500577502e-06, "loss": 0.3198, "step": 1564 }, { "epoch": 1.6268191268191268, "grad_norm": 6.228341579437256, "learning_rate": 7.498267498267498e-06, "loss": 0.3259, "step": 1565 }, { "epoch": 1.627858627858628, "grad_norm": 5.884280204772949, "learning_rate": 7.495957495957497e-06, "loss": 0.2876, "step": 1566 }, { "epoch": 1.628898128898129, "grad_norm": 13.760956764221191, "learning_rate": 7.493647493647495e-06, "loss": 0.885, "step": 1567 }, { "epoch": 1.62993762993763, 
"grad_norm": 11.207575798034668, "learning_rate": 7.491337491337491e-06, "loss": 0.8122, "step": 1568 }, { "epoch": 1.630977130977131, "grad_norm": 0.14747324585914612, "learning_rate": 7.4890274890274895e-06, "loss": 0.0024, "step": 1569 }, { "epoch": 1.632016632016632, "grad_norm": 6.728248119354248, "learning_rate": 7.486717486717488e-06, "loss": 0.2973, "step": 1570 }, { "epoch": 1.633056133056133, "grad_norm": 0.09715936332941055, "learning_rate": 7.484407484407485e-06, "loss": 0.0018, "step": 1571 }, { "epoch": 1.634095634095634, "grad_norm": 2.335740804672241, "learning_rate": 7.482097482097482e-06, "loss": 0.0711, "step": 1572 }, { "epoch": 1.635135135135135, "grad_norm": 2.9282028675079346, "learning_rate": 7.4797874797874805e-06, "loss": 0.058, "step": 1573 }, { "epoch": 1.636174636174636, "grad_norm": 2.8667209148406982, "learning_rate": 7.477477477477479e-06, "loss": 0.0595, "step": 1574 }, { "epoch": 1.637214137214137, "grad_norm": 7.022189617156982, "learning_rate": 7.475167475167476e-06, "loss": 0.3922, "step": 1575 }, { "epoch": 1.6382536382536381, "grad_norm": 8.579756736755371, "learning_rate": 7.4728574728574734e-06, "loss": 0.208, "step": 1576 }, { "epoch": 1.6392931392931391, "grad_norm": 12.531072616577148, "learning_rate": 7.470547470547472e-06, "loss": 0.844, "step": 1577 }, { "epoch": 1.6403326403326404, "grad_norm": 2.289039373397827, "learning_rate": 7.468237468237469e-06, "loss": 0.0383, "step": 1578 }, { "epoch": 1.6413721413721414, "grad_norm": 11.637489318847656, "learning_rate": 7.465927465927466e-06, "loss": 0.4017, "step": 1579 }, { "epoch": 1.6424116424116424, "grad_norm": 0.017794983461499214, "learning_rate": 7.4636174636174645e-06, "loss": 0.0005, "step": 1580 }, { "epoch": 1.6434511434511434, "grad_norm": 10.051448822021484, "learning_rate": 7.461307461307462e-06, "loss": 1.1899, "step": 1581 }, { "epoch": 1.6444906444906446, "grad_norm": 0.20261698961257935, "learning_rate": 7.45899745899746e-06, "loss": 0.0038, "step": 1582 
}, { "epoch": 1.6455301455301456, "grad_norm": 2.199592113494873, "learning_rate": 7.456687456687457e-06, "loss": 0.0379, "step": 1583 }, { "epoch": 1.6465696465696467, "grad_norm": 0.1683259755373001, "learning_rate": 7.454377454377455e-06, "loss": 0.0032, "step": 1584 }, { "epoch": 1.6476091476091477, "grad_norm": 6.000250339508057, "learning_rate": 7.452067452067453e-06, "loss": 0.2425, "step": 1585 }, { "epoch": 1.6486486486486487, "grad_norm": 0.08056347072124481, "learning_rate": 7.44975744975745e-06, "loss": 0.0025, "step": 1586 }, { "epoch": 1.6496881496881497, "grad_norm": 9.408105850219727, "learning_rate": 7.447447447447448e-06, "loss": 0.3774, "step": 1587 }, { "epoch": 1.6507276507276507, "grad_norm": 6.828577995300293, "learning_rate": 7.445137445137446e-06, "loss": 0.5208, "step": 1588 }, { "epoch": 1.6517671517671517, "grad_norm": 9.918600082397461, "learning_rate": 7.442827442827444e-06, "loss": 0.6716, "step": 1589 }, { "epoch": 1.6528066528066527, "grad_norm": 1.9643577337265015, "learning_rate": 7.4405174405174405e-06, "loss": 0.0253, "step": 1590 }, { "epoch": 1.6538461538461537, "grad_norm": 0.17032547295093536, "learning_rate": 7.438207438207439e-06, "loss": 0.0032, "step": 1591 }, { "epoch": 1.6548856548856548, "grad_norm": 9.601398468017578, "learning_rate": 7.435897435897437e-06, "loss": 0.1225, "step": 1592 }, { "epoch": 1.6559251559251558, "grad_norm": 0.04000294581055641, "learning_rate": 7.433587433587435e-06, "loss": 0.0014, "step": 1593 }, { "epoch": 1.656964656964657, "grad_norm": 1.3193373680114746, "learning_rate": 7.4312774312774315e-06, "loss": 0.0365, "step": 1594 }, { "epoch": 1.658004158004158, "grad_norm": 1.8269625902175903, "learning_rate": 7.42896742896743e-06, "loss": 0.0426, "step": 1595 }, { "epoch": 1.659043659043659, "grad_norm": 0.24532830715179443, "learning_rate": 7.426657426657428e-06, "loss": 0.0055, "step": 1596 }, { "epoch": 1.66008316008316, "grad_norm": 1.368507981300354, "learning_rate": 
7.424347424347424e-06, "loss": 0.0634, "step": 1597 }, { "epoch": 1.6611226611226613, "grad_norm": 0.16978596150875092, "learning_rate": 7.422037422037423e-06, "loss": 0.0031, "step": 1598 }, { "epoch": 1.6621621621621623, "grad_norm": 2.2428948879241943, "learning_rate": 7.419727419727421e-06, "loss": 0.0447, "step": 1599 }, { "epoch": 1.6632016632016633, "grad_norm": 1.2490960359573364, "learning_rate": 7.417417417417418e-06, "loss": 0.0241, "step": 1600 }, { "epoch": 1.6642411642411643, "grad_norm": 5.034488677978516, "learning_rate": 7.4151074151074155e-06, "loss": 0.3478, "step": 1601 }, { "epoch": 1.6652806652806653, "grad_norm": 0.5424055457115173, "learning_rate": 7.412797412797414e-06, "loss": 0.0111, "step": 1602 }, { "epoch": 1.6663201663201663, "grad_norm": 0.16386502981185913, "learning_rate": 7.410487410487411e-06, "loss": 0.003, "step": 1603 }, { "epoch": 1.6673596673596673, "grad_norm": 12.852590560913086, "learning_rate": 7.408177408177408e-06, "loss": 0.6814, "step": 1604 }, { "epoch": 1.6683991683991684, "grad_norm": 3.859780788421631, "learning_rate": 7.4058674058674066e-06, "loss": 0.0922, "step": 1605 }, { "epoch": 1.6694386694386694, "grad_norm": 3.8476450443267822, "learning_rate": 7.403557403557404e-06, "loss": 0.1143, "step": 1606 }, { "epoch": 1.6704781704781704, "grad_norm": 14.699956893920898, "learning_rate": 7.401247401247402e-06, "loss": 0.1259, "step": 1607 }, { "epoch": 1.6715176715176714, "grad_norm": 0.9650105834007263, "learning_rate": 7.3989373989373994e-06, "loss": 0.0456, "step": 1608 }, { "epoch": 1.6725571725571724, "grad_norm": 0.15530744194984436, "learning_rate": 7.396627396627397e-06, "loss": 0.0047, "step": 1609 }, { "epoch": 1.6735966735966736, "grad_norm": 4.704723358154297, "learning_rate": 7.394317394317395e-06, "loss": 0.2454, "step": 1610 }, { "epoch": 1.6746361746361746, "grad_norm": 4.737837314605713, "learning_rate": 7.392007392007393e-06, "loss": 0.1039, "step": 1611 }, { "epoch": 1.6756756756756757, 
"grad_norm": 8.195130348205566, "learning_rate": 7.38969738969739e-06, "loss": 0.419, "step": 1612 }, { "epoch": 1.6767151767151767, "grad_norm": 0.1998622566461563, "learning_rate": 7.387387387387388e-06, "loss": 0.004, "step": 1613 }, { "epoch": 1.677754677754678, "grad_norm": 2.4961326122283936, "learning_rate": 7.385077385077386e-06, "loss": 0.117, "step": 1614 }, { "epoch": 1.678794178794179, "grad_norm": 5.578335285186768, "learning_rate": 7.3827673827673825e-06, "loss": 0.1941, "step": 1615 }, { "epoch": 1.67983367983368, "grad_norm": 7.483165264129639, "learning_rate": 7.380457380457381e-06, "loss": 0.7208, "step": 1616 }, { "epoch": 1.680873180873181, "grad_norm": 2.0960891246795654, "learning_rate": 7.378147378147379e-06, "loss": 0.0293, "step": 1617 }, { "epoch": 1.681912681912682, "grad_norm": 7.603874206542969, "learning_rate": 7.375837375837377e-06, "loss": 0.2035, "step": 1618 }, { "epoch": 1.682952182952183, "grad_norm": 0.9122418165206909, "learning_rate": 7.373527373527374e-06, "loss": 0.0224, "step": 1619 }, { "epoch": 1.683991683991684, "grad_norm": 6.815097332000732, "learning_rate": 7.371217371217372e-06, "loss": 0.1871, "step": 1620 }, { "epoch": 1.685031185031185, "grad_norm": 11.70824909210205, "learning_rate": 7.36890736890737e-06, "loss": 0.7655, "step": 1621 }, { "epoch": 1.686070686070686, "grad_norm": 0.46677353978157043, "learning_rate": 7.3665973665973665e-06, "loss": 0.008, "step": 1622 }, { "epoch": 1.687110187110187, "grad_norm": 2.1971335411071777, "learning_rate": 7.364287364287365e-06, "loss": 0.0468, "step": 1623 }, { "epoch": 1.688149688149688, "grad_norm": 5.790288925170898, "learning_rate": 7.361977361977363e-06, "loss": 0.2241, "step": 1624 }, { "epoch": 1.689189189189189, "grad_norm": 4.230993747711182, "learning_rate": 7.359667359667361e-06, "loss": 0.1058, "step": 1625 }, { "epoch": 1.6902286902286903, "grad_norm": 1.344787359237671, "learning_rate": 7.3573573573573575e-06, "loss": 0.0207, "step": 1626 }, { "epoch": 
1.6912681912681913, "grad_norm": 2.8183813095092773, "learning_rate": 7.355047355047356e-06, "loss": 0.067, "step": 1627 }, { "epoch": 1.6923076923076923, "grad_norm": 6.174064636230469, "learning_rate": 7.352737352737354e-06, "loss": 0.2275, "step": 1628 }, { "epoch": 1.6933471933471933, "grad_norm": 5.060527801513672, "learning_rate": 7.350427350427351e-06, "loss": 0.2279, "step": 1629 }, { "epoch": 1.6943866943866945, "grad_norm": 5.803682327270508, "learning_rate": 7.348117348117349e-06, "loss": 0.0884, "step": 1630 }, { "epoch": 1.6954261954261955, "grad_norm": 2.137218713760376, "learning_rate": 7.345807345807347e-06, "loss": 0.1781, "step": 1631 }, { "epoch": 1.6964656964656966, "grad_norm": 9.369726181030273, "learning_rate": 7.343497343497344e-06, "loss": 0.2808, "step": 1632 }, { "epoch": 1.6975051975051976, "grad_norm": 3.2292776107788086, "learning_rate": 7.3411873411873415e-06, "loss": 0.054, "step": 1633 }, { "epoch": 1.6985446985446986, "grad_norm": 8.865103721618652, "learning_rate": 7.33887733887734e-06, "loss": 0.2376, "step": 1634 }, { "epoch": 1.6995841995841996, "grad_norm": 4.944544315338135, "learning_rate": 7.336567336567337e-06, "loss": 0.1071, "step": 1635 }, { "epoch": 1.7006237006237006, "grad_norm": 3.2673540115356445, "learning_rate": 7.334257334257335e-06, "loss": 0.0807, "step": 1636 }, { "epoch": 1.7016632016632016, "grad_norm": 8.087736129760742, "learning_rate": 7.3319473319473326e-06, "loss": 0.3968, "step": 1637 }, { "epoch": 1.7027027027027026, "grad_norm": 10.094491958618164, "learning_rate": 7.32963732963733e-06, "loss": 0.2602, "step": 1638 }, { "epoch": 1.7037422037422036, "grad_norm": 5.964760780334473, "learning_rate": 7.327327327327328e-06, "loss": 0.1667, "step": 1639 }, { "epoch": 1.7047817047817047, "grad_norm": 8.874397277832031, "learning_rate": 7.3250173250173254e-06, "loss": 0.4751, "step": 1640 }, { "epoch": 1.7058212058212057, "grad_norm": 14.104971885681152, "learning_rate": 7.322707322707323e-06, "loss": 
1.1987, "step": 1641 }, { "epoch": 1.706860706860707, "grad_norm": 14.051151275634766, "learning_rate": 7.320397320397321e-06, "loss": 1.1578, "step": 1642 }, { "epoch": 1.707900207900208, "grad_norm": 0.019640829414129257, "learning_rate": 7.318087318087319e-06, "loss": 0.0006, "step": 1643 }, { "epoch": 1.708939708939709, "grad_norm": 11.411649703979492, "learning_rate": 7.315777315777316e-06, "loss": 1.4277, "step": 1644 }, { "epoch": 1.70997920997921, "grad_norm": 0.04116674140095711, "learning_rate": 7.313467313467314e-06, "loss": 0.0009, "step": 1645 }, { "epoch": 1.7110187110187112, "grad_norm": 0.9147244095802307, "learning_rate": 7.311157311157312e-06, "loss": 0.0226, "step": 1646 }, { "epoch": 1.7120582120582122, "grad_norm": 10.076512336730957, "learning_rate": 7.30884730884731e-06, "loss": 0.7167, "step": 1647 }, { "epoch": 1.7130977130977132, "grad_norm": 7.493573188781738, "learning_rate": 7.306537306537307e-06, "loss": 0.2172, "step": 1648 }, { "epoch": 1.7141372141372142, "grad_norm": 5.900646209716797, "learning_rate": 7.304227304227305e-06, "loss": 0.0806, "step": 1649 }, { "epoch": 1.7151767151767152, "grad_norm": 3.126075029373169, "learning_rate": 7.301917301917303e-06, "loss": 0.247, "step": 1650 }, { "epoch": 1.7162162162162162, "grad_norm": 6.303007125854492, "learning_rate": 7.2996072996073e-06, "loss": 0.2534, "step": 1651 }, { "epoch": 1.7172557172557172, "grad_norm": 11.120226860046387, "learning_rate": 7.297297297297298e-06, "loss": 0.6595, "step": 1652 }, { "epoch": 1.7182952182952183, "grad_norm": 0.505618155002594, "learning_rate": 7.294987294987296e-06, "loss": 0.0075, "step": 1653 }, { "epoch": 1.7193347193347193, "grad_norm": 8.373292922973633, "learning_rate": 7.292677292677293e-06, "loss": 0.6847, "step": 1654 }, { "epoch": 1.7203742203742203, "grad_norm": 6.690762996673584, "learning_rate": 7.290367290367291e-06, "loss": 0.2232, "step": 1655 }, { "epoch": 1.7214137214137213, "grad_norm": 0.13206851482391357, "learning_rate": 
7.288057288057289e-06, "loss": 0.0024, "step": 1656 }, { "epoch": 1.7224532224532223, "grad_norm": 7.297382831573486, "learning_rate": 7.285747285747286e-06, "loss": 0.4353, "step": 1657 }, { "epoch": 1.7234927234927235, "grad_norm": 1.3188226222991943, "learning_rate": 7.2834372834372836e-06, "loss": 0.0408, "step": 1658 }, { "epoch": 1.7245322245322245, "grad_norm": 7.577664852142334, "learning_rate": 7.281127281127282e-06, "loss": 0.2984, "step": 1659 }, { "epoch": 1.7255717255717256, "grad_norm": 1.5589632987976074, "learning_rate": 7.278817278817279e-06, "loss": 0.0465, "step": 1660 }, { "epoch": 1.7266112266112266, "grad_norm": 7.908683776855469, "learning_rate": 7.276507276507277e-06, "loss": 0.7164, "step": 1661 }, { "epoch": 1.7276507276507278, "grad_norm": 11.915677070617676, "learning_rate": 7.274197274197275e-06, "loss": 0.8969, "step": 1662 }, { "epoch": 1.7286902286902288, "grad_norm": 8.111615180969238, "learning_rate": 7.271887271887272e-06, "loss": 0.3777, "step": 1663 }, { "epoch": 1.7297297297297298, "grad_norm": 0.46552711725234985, "learning_rate": 7.26957726957727e-06, "loss": 0.0113, "step": 1664 }, { "epoch": 1.7307692307692308, "grad_norm": 4.33073616027832, "learning_rate": 7.267267267267268e-06, "loss": 0.2095, "step": 1665 }, { "epoch": 1.7318087318087318, "grad_norm": 17.092309951782227, "learning_rate": 7.264957264957266e-06, "loss": 0.7215, "step": 1666 }, { "epoch": 1.7328482328482329, "grad_norm": 1.5251858234405518, "learning_rate": 7.262647262647263e-06, "loss": 0.0386, "step": 1667 }, { "epoch": 1.7338877338877339, "grad_norm": 0.3032921254634857, "learning_rate": 7.260337260337261e-06, "loss": 0.0099, "step": 1668 }, { "epoch": 1.7349272349272349, "grad_norm": 11.229517936706543, "learning_rate": 7.2580272580272586e-06, "loss": 0.7061, "step": 1669 }, { "epoch": 1.735966735966736, "grad_norm": 6.576710224151611, "learning_rate": 7.255717255717256e-06, "loss": 0.3782, "step": 1670 }, { "epoch": 1.737006237006237, "grad_norm": 
1.0471800565719604, "learning_rate": 7.253407253407254e-06, "loss": 0.0211, "step": 1671 }, { "epoch": 1.738045738045738, "grad_norm": 4.3084306716918945, "learning_rate": 7.251097251097252e-06, "loss": 0.2368, "step": 1672 }, { "epoch": 1.739085239085239, "grad_norm": 1.829033613204956, "learning_rate": 7.248787248787249e-06, "loss": 0.0723, "step": 1673 }, { "epoch": 1.7401247401247402, "grad_norm": 6.535826683044434, "learning_rate": 7.246477246477247e-06, "loss": 0.4326, "step": 1674 }, { "epoch": 1.7411642411642412, "grad_norm": 0.2173418551683426, "learning_rate": 7.244167244167245e-06, "loss": 0.0058, "step": 1675 }, { "epoch": 1.7422037422037422, "grad_norm": 1.3365967273712158, "learning_rate": 7.241857241857242e-06, "loss": 0.0298, "step": 1676 }, { "epoch": 1.7432432432432432, "grad_norm": 0.48751041293144226, "learning_rate": 7.23954723954724e-06, "loss": 0.0104, "step": 1677 }, { "epoch": 1.7442827442827444, "grad_norm": 0.29479509592056274, "learning_rate": 7.237237237237238e-06, "loss": 0.0054, "step": 1678 }, { "epoch": 1.7453222453222454, "grad_norm": 0.020942412316799164, "learning_rate": 7.234927234927236e-06, "loss": 0.0007, "step": 1679 }, { "epoch": 1.7463617463617465, "grad_norm": 13.055707931518555, "learning_rate": 7.232617232617233e-06, "loss": 0.7853, "step": 1680 }, { "epoch": 1.7474012474012475, "grad_norm": 0.0012717257486656308, "learning_rate": 7.230307230307231e-06, "loss": 0.0, "step": 1681 }, { "epoch": 1.7484407484407485, "grad_norm": 0.08711827546358109, "learning_rate": 7.227997227997229e-06, "loss": 0.0018, "step": 1682 }, { "epoch": 1.7494802494802495, "grad_norm": 4.8014326095581055, "learning_rate": 7.2256872256872265e-06, "loss": 0.1296, "step": 1683 }, { "epoch": 1.7505197505197505, "grad_norm": 6.449215888977051, "learning_rate": 7.223377223377224e-06, "loss": 0.2195, "step": 1684 }, { "epoch": 1.7515592515592515, "grad_norm": 8.582338333129883, "learning_rate": 7.221067221067222e-06, "loss": 0.261, "step": 1685 }, { 
"epoch": 1.7525987525987525, "grad_norm": 1.6855406761169434, "learning_rate": 7.218757218757219e-06, "loss": 0.0462, "step": 1686 }, { "epoch": 1.7536382536382535, "grad_norm": 0.14350074529647827, "learning_rate": 7.216447216447217e-06, "loss": 0.002, "step": 1687 }, { "epoch": 1.7546777546777546, "grad_norm": 7.426890850067139, "learning_rate": 7.214137214137215e-06, "loss": 0.2078, "step": 1688 }, { "epoch": 1.7557172557172556, "grad_norm": 7.843624591827393, "learning_rate": 7.211827211827212e-06, "loss": 0.4337, "step": 1689 }, { "epoch": 1.7567567567567568, "grad_norm": 2.7187883853912354, "learning_rate": 7.20951720951721e-06, "loss": 0.032, "step": 1690 }, { "epoch": 1.7577962577962578, "grad_norm": 9.4097261428833, "learning_rate": 7.207207207207208e-06, "loss": 0.5517, "step": 1691 }, { "epoch": 1.7588357588357588, "grad_norm": 0.20567497611045837, "learning_rate": 7.204897204897205e-06, "loss": 0.0046, "step": 1692 }, { "epoch": 1.7598752598752598, "grad_norm": 3.7219395637512207, "learning_rate": 7.202587202587203e-06, "loss": 0.0952, "step": 1693 }, { "epoch": 1.760914760914761, "grad_norm": 0.9259941577911377, "learning_rate": 7.200277200277201e-06, "loss": 0.0224, "step": 1694 }, { "epoch": 1.761954261954262, "grad_norm": 0.19360046088695526, "learning_rate": 7.197967197967198e-06, "loss": 0.0032, "step": 1695 }, { "epoch": 1.762993762993763, "grad_norm": 9.575984954833984, "learning_rate": 7.195657195657196e-06, "loss": 0.9476, "step": 1696 }, { "epoch": 1.764033264033264, "grad_norm": 0.27635741233825684, "learning_rate": 7.193347193347194e-06, "loss": 0.0053, "step": 1697 }, { "epoch": 1.7650727650727651, "grad_norm": 5.570565700531006, "learning_rate": 7.191037191037191e-06, "loss": 0.1848, "step": 1698 }, { "epoch": 1.7661122661122661, "grad_norm": 8.489222526550293, "learning_rate": 7.188727188727189e-06, "loss": 0.327, "step": 1699 }, { "epoch": 1.7671517671517671, "grad_norm": 0.45986512303352356, "learning_rate": 7.186417186417187e-06, 
"loss": 0.0076, "step": 1700 }, { "epoch": 1.7681912681912682, "grad_norm": 1.1088552474975586, "learning_rate": 7.1841071841071854e-06, "loss": 0.0475, "step": 1701 }, { "epoch": 1.7692307692307692, "grad_norm": 13.177712440490723, "learning_rate": 7.181797181797182e-06, "loss": 0.203, "step": 1702 }, { "epoch": 1.7702702702702702, "grad_norm": 7.197229862213135, "learning_rate": 7.17948717948718e-06, "loss": 0.4223, "step": 1703 }, { "epoch": 1.7713097713097712, "grad_norm": 3.42079496383667, "learning_rate": 7.177177177177178e-06, "loss": 0.0728, "step": 1704 }, { "epoch": 1.7723492723492722, "grad_norm": 0.43769392371177673, "learning_rate": 7.174867174867175e-06, "loss": 0.0143, "step": 1705 }, { "epoch": 1.7733887733887734, "grad_norm": 0.10485343635082245, "learning_rate": 7.172557172557173e-06, "loss": 0.0023, "step": 1706 }, { "epoch": 1.7744282744282744, "grad_norm": 9.893296241760254, "learning_rate": 7.170247170247171e-06, "loss": 0.8923, "step": 1707 }, { "epoch": 1.7754677754677755, "grad_norm": 4.010193347930908, "learning_rate": 7.167937167937169e-06, "loss": 0.1206, "step": 1708 }, { "epoch": 1.7765072765072765, "grad_norm": 0.3834474980831146, "learning_rate": 7.165627165627166e-06, "loss": 0.0048, "step": 1709 }, { "epoch": 1.7775467775467777, "grad_norm": 3.8938255310058594, "learning_rate": 7.163317163317164e-06, "loss": 0.1693, "step": 1710 }, { "epoch": 1.7785862785862787, "grad_norm": 11.818151473999023, "learning_rate": 7.161007161007162e-06, "loss": 0.4191, "step": 1711 }, { "epoch": 1.7796257796257797, "grad_norm": 2.825345039367676, "learning_rate": 7.158697158697159e-06, "loss": 0.0724, "step": 1712 }, { "epoch": 1.7806652806652807, "grad_norm": 0.23397642374038696, "learning_rate": 7.156387156387157e-06, "loss": 0.0085, "step": 1713 }, { "epoch": 1.7817047817047817, "grad_norm": 2.670799970626831, "learning_rate": 7.154077154077155e-06, "loss": 0.0735, "step": 1714 }, { "epoch": 1.7827442827442828, "grad_norm": 6.973132133483887, 
"learning_rate": 7.1517671517671525e-06, "loss": 0.3394, "step": 1715 }, { "epoch": 1.7837837837837838, "grad_norm": 5.924585342407227, "learning_rate": 7.14945714945715e-06, "loss": 0.3206, "step": 1716 }, { "epoch": 1.7848232848232848, "grad_norm": 7.079158306121826, "learning_rate": 7.147147147147148e-06, "loss": 0.9937, "step": 1717 }, { "epoch": 1.7858627858627858, "grad_norm": 0.8085556626319885, "learning_rate": 7.144837144837145e-06, "loss": 0.0233, "step": 1718 }, { "epoch": 1.7869022869022868, "grad_norm": 0.26186278462409973, "learning_rate": 7.1425271425271435e-06, "loss": 0.0043, "step": 1719 }, { "epoch": 1.7879417879417878, "grad_norm": 3.6057870388031006, "learning_rate": 7.140217140217141e-06, "loss": 0.1303, "step": 1720 }, { "epoch": 1.7889812889812888, "grad_norm": 7.762540340423584, "learning_rate": 7.137907137907138e-06, "loss": 0.6902, "step": 1721 }, { "epoch": 1.79002079002079, "grad_norm": 14.205892562866211, "learning_rate": 7.1355971355971364e-06, "loss": 2.8257, "step": 1722 }, { "epoch": 1.791060291060291, "grad_norm": 11.998815536499023, "learning_rate": 7.133287133287134e-06, "loss": 1.8192, "step": 1723 }, { "epoch": 1.792099792099792, "grad_norm": 4.715017318725586, "learning_rate": 7.130977130977131e-06, "loss": 0.1902, "step": 1724 }, { "epoch": 1.793139293139293, "grad_norm": 6.3350396156311035, "learning_rate": 7.128667128667129e-06, "loss": 0.1873, "step": 1725 }, { "epoch": 1.7941787941787943, "grad_norm": 9.118733406066895, "learning_rate": 7.1263571263571275e-06, "loss": 0.56, "step": 1726 }, { "epoch": 1.7952182952182953, "grad_norm": 5.499832630157471, "learning_rate": 7.124047124047124e-06, "loss": 0.3405, "step": 1727 }, { "epoch": 1.7962577962577964, "grad_norm": 1.0076048374176025, "learning_rate": 7.121737121737122e-06, "loss": 0.0391, "step": 1728 }, { "epoch": 1.7972972972972974, "grad_norm": 9.98312759399414, "learning_rate": 7.11942711942712e-06, "loss": 1.3863, "step": 1729 }, { "epoch": 1.7983367983367984, 
"grad_norm": 6.568471908569336, "learning_rate": 7.117117117117117e-06, "loss": 0.356, "step": 1730 }, { "epoch": 1.7993762993762994, "grad_norm": 8.882364273071289, "learning_rate": 7.114807114807115e-06, "loss": 0.5891, "step": 1731 }, { "epoch": 1.8004158004158004, "grad_norm": 0.11736893653869629, "learning_rate": 7.112497112497113e-06, "loss": 0.003, "step": 1732 }, { "epoch": 1.8014553014553014, "grad_norm": 11.34277057647705, "learning_rate": 7.1101871101871114e-06, "loss": 0.0025, "step": 1733 }, { "epoch": 1.8024948024948024, "grad_norm": 0.47979533672332764, "learning_rate": 7.107877107877108e-06, "loss": 0.0094, "step": 1734 }, { "epoch": 1.8035343035343034, "grad_norm": 7.334555149078369, "learning_rate": 7.105567105567106e-06, "loss": 0.3448, "step": 1735 }, { "epoch": 1.8045738045738045, "grad_norm": 7.494808197021484, "learning_rate": 7.103257103257104e-06, "loss": 0.8725, "step": 1736 }, { "epoch": 1.8056133056133055, "grad_norm": 4.283798694610596, "learning_rate": 7.100947100947102e-06, "loss": 0.1439, "step": 1737 }, { "epoch": 1.8066528066528067, "grad_norm": 1.145999789237976, "learning_rate": 7.098637098637099e-06, "loss": 0.025, "step": 1738 }, { "epoch": 1.8076923076923077, "grad_norm": 0.05247219651937485, "learning_rate": 7.096327096327097e-06, "loss": 0.001, "step": 1739 }, { "epoch": 1.8087318087318087, "grad_norm": 2.770738124847412, "learning_rate": 7.0940170940170945e-06, "loss": 0.1025, "step": 1740 }, { "epoch": 1.8097713097713097, "grad_norm": 1.1891086101531982, "learning_rate": 7.091707091707092e-06, "loss": 0.0212, "step": 1741 }, { "epoch": 1.810810810810811, "grad_norm": 1.8017946481704712, "learning_rate": 7.08939708939709e-06, "loss": 0.0425, "step": 1742 }, { "epoch": 1.811850311850312, "grad_norm": 3.020580530166626, "learning_rate": 7.087087087087087e-06, "loss": 0.0616, "step": 1743 }, { "epoch": 1.812889812889813, "grad_norm": 3.8289687633514404, "learning_rate": 7.084777084777086e-06, "loss": 0.155, "step": 1744 }, { 
"epoch": 1.813929313929314, "grad_norm": 0.27335742115974426, "learning_rate": 7.082467082467083e-06, "loss": 0.0069, "step": 1745 }, { "epoch": 1.814968814968815, "grad_norm": 9.197246551513672, "learning_rate": 7.08015708015708e-06, "loss": 0.1974, "step": 1746 }, { "epoch": 1.816008316008316, "grad_norm": 4.551156997680664, "learning_rate": 7.0778470778470785e-06, "loss": 0.0991, "step": 1747 }, { "epoch": 1.817047817047817, "grad_norm": 1.667893409729004, "learning_rate": 7.075537075537076e-06, "loss": 0.0315, "step": 1748 }, { "epoch": 1.818087318087318, "grad_norm": 15.546823501586914, "learning_rate": 7.073227073227073e-06, "loss": 1.4498, "step": 1749 }, { "epoch": 1.819126819126819, "grad_norm": 6.499513626098633, "learning_rate": 7.070917070917071e-06, "loss": 0.1394, "step": 1750 }, { "epoch": 1.82016632016632, "grad_norm": 3.992257833480835, "learning_rate": 7.0686070686070696e-06, "loss": 0.1193, "step": 1751 }, { "epoch": 1.821205821205821, "grad_norm": 2.7317323684692383, "learning_rate": 7.066297066297066e-06, "loss": 0.0496, "step": 1752 }, { "epoch": 1.822245322245322, "grad_norm": 0.6675400137901306, "learning_rate": 7.063987063987064e-06, "loss": 0.0152, "step": 1753 }, { "epoch": 1.8232848232848233, "grad_norm": 0.47929680347442627, "learning_rate": 7.0616770616770624e-06, "loss": 0.0178, "step": 1754 }, { "epoch": 1.8243243243243243, "grad_norm": 5.309060573577881, "learning_rate": 7.059367059367061e-06, "loss": 0.1906, "step": 1755 }, { "epoch": 1.8253638253638254, "grad_norm": 0.5200766324996948, "learning_rate": 7.057057057057057e-06, "loss": 0.0098, "step": 1756 }, { "epoch": 1.8264033264033264, "grad_norm": 3.534630060195923, "learning_rate": 7.054747054747055e-06, "loss": 0.1054, "step": 1757 }, { "epoch": 1.8274428274428276, "grad_norm": 8.29762077331543, "learning_rate": 7.0524370524370535e-06, "loss": 0.3073, "step": 1758 }, { "epoch": 1.8284823284823286, "grad_norm": 0.8621770143508911, "learning_rate": 7.05012705012705e-06, "loss": 
0.0204, "step": 1759 }, { "epoch": 1.8295218295218296, "grad_norm": 0.23380033671855927, "learning_rate": 7.047817047817048e-06, "loss": 0.0035, "step": 1760 }, { "epoch": 1.8305613305613306, "grad_norm": 5.623581409454346, "learning_rate": 7.045507045507046e-06, "loss": 0.2227, "step": 1761 }, { "epoch": 1.8316008316008316, "grad_norm": 2.2980833053588867, "learning_rate": 7.0431970431970446e-06, "loss": 0.0643, "step": 1762 }, { "epoch": 1.8326403326403327, "grad_norm": 12.151287078857422, "learning_rate": 7.040887040887041e-06, "loss": 0.2851, "step": 1763 }, { "epoch": 1.8336798336798337, "grad_norm": 0.5834892392158508, "learning_rate": 7.038577038577039e-06, "loss": 0.0108, "step": 1764 }, { "epoch": 1.8347193347193347, "grad_norm": 0.06696605682373047, "learning_rate": 7.0362670362670374e-06, "loss": 0.0014, "step": 1765 }, { "epoch": 1.8357588357588357, "grad_norm": 4.016539096832275, "learning_rate": 7.033957033957034e-06, "loss": 0.1166, "step": 1766 }, { "epoch": 1.8367983367983367, "grad_norm": 1.0611064434051514, "learning_rate": 7.031647031647032e-06, "loss": 0.041, "step": 1767 }, { "epoch": 1.8378378378378377, "grad_norm": 0.7719981074333191, "learning_rate": 7.02933702933703e-06, "loss": 0.0163, "step": 1768 }, { "epoch": 1.8388773388773387, "grad_norm": 3.026901960372925, "learning_rate": 7.027027027027028e-06, "loss": 0.0903, "step": 1769 }, { "epoch": 1.83991683991684, "grad_norm": 3.5234310626983643, "learning_rate": 7.024717024717025e-06, "loss": 0.111, "step": 1770 }, { "epoch": 1.840956340956341, "grad_norm": 0.18749161064624786, "learning_rate": 7.022407022407023e-06, "loss": 0.003, "step": 1771 }, { "epoch": 1.841995841995842, "grad_norm": 1.624272108078003, "learning_rate": 7.0200970200970205e-06, "loss": 0.0276, "step": 1772 }, { "epoch": 1.843035343035343, "grad_norm": 0.39288201928138733, "learning_rate": 7.017787017787019e-06, "loss": 0.0139, "step": 1773 }, { "epoch": 1.8440748440748442, "grad_norm": 8.802492141723633, 
"learning_rate": 7.015477015477016e-06, "loss": 0.0945, "step": 1774 }, { "epoch": 1.8451143451143452, "grad_norm": 10.572566986083984, "learning_rate": 7.0131670131670134e-06, "loss": 0.399, "step": 1775 }, { "epoch": 1.8461538461538463, "grad_norm": 6.584804058074951, "learning_rate": 7.010857010857012e-06, "loss": 0.3771, "step": 1776 }, { "epoch": 1.8471933471933473, "grad_norm": 2.911133289337158, "learning_rate": 7.008547008547009e-06, "loss": 0.0747, "step": 1777 }, { "epoch": 1.8482328482328483, "grad_norm": 0.4883025288581848, "learning_rate": 7.006237006237006e-06, "loss": 0.0088, "step": 1778 }, { "epoch": 1.8492723492723493, "grad_norm": 10.067765235900879, "learning_rate": 7.0039270039270045e-06, "loss": 0.6624, "step": 1779 }, { "epoch": 1.8503118503118503, "grad_norm": 1.677093505859375, "learning_rate": 7.001617001617003e-06, "loss": 0.0389, "step": 1780 }, { "epoch": 1.8513513513513513, "grad_norm": 6.398299217224121, "learning_rate": 6.999306999306999e-06, "loss": 0.365, "step": 1781 }, { "epoch": 1.8523908523908523, "grad_norm": 0.02928902953863144, "learning_rate": 6.996996996996997e-06, "loss": 0.001, "step": 1782 }, { "epoch": 1.8534303534303533, "grad_norm": 13.064369201660156, "learning_rate": 6.9946869946869956e-06, "loss": 0.9939, "step": 1783 }, { "epoch": 1.8544698544698544, "grad_norm": 0.872677743434906, "learning_rate": 6.992376992376992e-06, "loss": 0.0126, "step": 1784 }, { "epoch": 1.8555093555093554, "grad_norm": 4.7186198234558105, "learning_rate": 6.99006699006699e-06, "loss": 0.2806, "step": 1785 }, { "epoch": 1.8565488565488566, "grad_norm": 2.009587287902832, "learning_rate": 6.9877569877569884e-06, "loss": 0.1332, "step": 1786 }, { "epoch": 1.8575883575883576, "grad_norm": 10.572334289550781, "learning_rate": 6.985446985446987e-06, "loss": 0.6191, "step": 1787 }, { "epoch": 1.8586278586278586, "grad_norm": 2.9769487380981445, "learning_rate": 6.983136983136983e-06, "loss": 0.0616, "step": 1788 }, { "epoch": 
1.8596673596673596, "grad_norm": 6.331596374511719, "learning_rate": 6.980826980826981e-06, "loss": 0.2601, "step": 1789 }, { "epoch": 1.8607068607068609, "grad_norm": 0.20288600027561188, "learning_rate": 6.9785169785169795e-06, "loss": 0.0061, "step": 1790 }, { "epoch": 1.8617463617463619, "grad_norm": 8.505599021911621, "learning_rate": 6.976206976206978e-06, "loss": 0.0917, "step": 1791 }, { "epoch": 1.862785862785863, "grad_norm": 1.9080631732940674, "learning_rate": 6.973896973896974e-06, "loss": 0.0389, "step": 1792 }, { "epoch": 1.863825363825364, "grad_norm": 1.8575496673583984, "learning_rate": 6.971586971586972e-06, "loss": 0.039, "step": 1793 }, { "epoch": 1.864864864864865, "grad_norm": 2.9888617992401123, "learning_rate": 6.969276969276971e-06, "loss": 0.0925, "step": 1794 }, { "epoch": 1.865904365904366, "grad_norm": 13.349923133850098, "learning_rate": 6.966966966966967e-06, "loss": 0.9717, "step": 1795 }, { "epoch": 1.866943866943867, "grad_norm": 6.0613861083984375, "learning_rate": 6.964656964656965e-06, "loss": 0.2833, "step": 1796 }, { "epoch": 1.867983367983368, "grad_norm": 2.117635726928711, "learning_rate": 6.9623469623469635e-06, "loss": 0.0424, "step": 1797 }, { "epoch": 1.869022869022869, "grad_norm": 2.256873607635498, "learning_rate": 6.960036960036961e-06, "loss": 0.0965, "step": 1798 }, { "epoch": 1.87006237006237, "grad_norm": 0.33173665404319763, "learning_rate": 6.957726957726958e-06, "loss": 0.0106, "step": 1799 }, { "epoch": 1.871101871101871, "grad_norm": 4.786936283111572, "learning_rate": 6.955416955416956e-06, "loss": 0.1814, "step": 1800 }, { "epoch": 1.872141372141372, "grad_norm": 0.11546695232391357, "learning_rate": 6.953106953106954e-06, "loss": 0.0017, "step": 1801 }, { "epoch": 1.8731808731808732, "grad_norm": 5.822028160095215, "learning_rate": 6.950796950796951e-06, "loss": 0.132, "step": 1802 }, { "epoch": 1.8742203742203742, "grad_norm": 29.958877563476562, "learning_rate": 6.948486948486949e-06, "loss": 0.6952, 
"step": 1803 }, { "epoch": 1.8752598752598753, "grad_norm": 17.843074798583984, "learning_rate": 6.9461769461769466e-06, "loss": 1.4509, "step": 1804 }, { "epoch": 1.8762993762993763, "grad_norm": 7.777376174926758, "learning_rate": 6.943866943866945e-06, "loss": 0.2881, "step": 1805 }, { "epoch": 1.8773388773388775, "grad_norm": 3.4465243816375732, "learning_rate": 6.941556941556942e-06, "loss": 0.0959, "step": 1806 }, { "epoch": 1.8783783783783785, "grad_norm": 0.019290627911686897, "learning_rate": 6.9392469392469394e-06, "loss": 0.0005, "step": 1807 }, { "epoch": 1.8794178794178795, "grad_norm": 0.029469970613718033, "learning_rate": 6.936936936936938e-06, "loss": 0.0007, "step": 1808 }, { "epoch": 1.8804573804573805, "grad_norm": 7.679044723510742, "learning_rate": 6.934626934626936e-06, "loss": 0.4403, "step": 1809 }, { "epoch": 1.8814968814968815, "grad_norm": 0.09692568331956863, "learning_rate": 6.932316932316932e-06, "loss": 0.0024, "step": 1810 }, { "epoch": 1.8825363825363826, "grad_norm": 5.558783054351807, "learning_rate": 6.9300069300069305e-06, "loss": 0.1236, "step": 1811 }, { "epoch": 1.8835758835758836, "grad_norm": 9.43947982788086, "learning_rate": 6.927696927696929e-06, "loss": 0.4962, "step": 1812 }, { "epoch": 1.8846153846153846, "grad_norm": 1.136598825454712, "learning_rate": 6.925386925386925e-06, "loss": 0.0386, "step": 1813 }, { "epoch": 1.8856548856548856, "grad_norm": 3.2195546627044678, "learning_rate": 6.923076923076923e-06, "loss": 0.0834, "step": 1814 }, { "epoch": 1.8866943866943866, "grad_norm": 6.025984287261963, "learning_rate": 6.9207669207669216e-06, "loss": 0.3229, "step": 1815 }, { "epoch": 1.8877338877338876, "grad_norm": 1.9402605295181274, "learning_rate": 6.91845691845692e-06, "loss": 0.0161, "step": 1816 }, { "epoch": 1.8887733887733886, "grad_norm": 1.3776854276657104, "learning_rate": 6.916146916146916e-06, "loss": 0.0468, "step": 1817 }, { "epoch": 1.8898128898128899, "grad_norm": 0.0545632541179657, 
"learning_rate": 6.9138369138369145e-06, "loss": 0.0011, "step": 1818 }, { "epoch": 1.8908523908523909, "grad_norm": 2.274041175842285, "learning_rate": 6.911526911526913e-06, "loss": 0.0631, "step": 1819 }, { "epoch": 1.8918918918918919, "grad_norm": 2.384172201156616, "learning_rate": 6.909216909216909e-06, "loss": 0.0757, "step": 1820 }, { "epoch": 1.892931392931393, "grad_norm": 8.121810913085938, "learning_rate": 6.906906906906907e-06, "loss": 0.2988, "step": 1821 }, { "epoch": 1.8939708939708941, "grad_norm": 7.533204078674316, "learning_rate": 6.9045969045969055e-06, "loss": 0.7463, "step": 1822 }, { "epoch": 1.8950103950103951, "grad_norm": 7.793642520904541, "learning_rate": 6.902286902286903e-06, "loss": 0.3933, "step": 1823 }, { "epoch": 1.8960498960498962, "grad_norm": 6.824788570404053, "learning_rate": 6.8999768999769e-06, "loss": 0.1922, "step": 1824 }, { "epoch": 1.8970893970893972, "grad_norm": 4.8206915855407715, "learning_rate": 6.897666897666898e-06, "loss": 0.3425, "step": 1825 }, { "epoch": 1.8981288981288982, "grad_norm": 12.999703407287598, "learning_rate": 6.895356895356896e-06, "loss": 0.4453, "step": 1826 }, { "epoch": 1.8991683991683992, "grad_norm": 7.11517333984375, "learning_rate": 6.893046893046894e-06, "loss": 0.343, "step": 1827 }, { "epoch": 1.9002079002079002, "grad_norm": 0.7533423900604248, "learning_rate": 6.890736890736891e-06, "loss": 0.0221, "step": 1828 }, { "epoch": 1.9012474012474012, "grad_norm": 8.051311492919922, "learning_rate": 6.888426888426889e-06, "loss": 0.6874, "step": 1829 }, { "epoch": 1.9022869022869022, "grad_norm": 1.4836094379425049, "learning_rate": 6.886116886116887e-06, "loss": 0.0364, "step": 1830 }, { "epoch": 1.9033264033264032, "grad_norm": 9.048811912536621, "learning_rate": 6.883806883806884e-06, "loss": 0.4148, "step": 1831 }, { "epoch": 1.9043659043659042, "grad_norm": 0.09245787560939789, "learning_rate": 6.8814968814968815e-06, "loss": 0.0034, "step": 1832 }, { "epoch": 1.9054054054054053, 
"grad_norm": 5.148186683654785, "learning_rate": 6.87918687918688e-06, "loss": 0.1343, "step": 1833 }, { "epoch": 1.9064449064449065, "grad_norm": 0.022289017215371132, "learning_rate": 6.876876876876878e-06, "loss": 0.0005, "step": 1834 }, { "epoch": 1.9074844074844075, "grad_norm": 15.062718391418457, "learning_rate": 6.874566874566874e-06, "loss": 0.6481, "step": 1835 }, { "epoch": 1.9085239085239085, "grad_norm": 0.6894482374191284, "learning_rate": 6.8722568722568726e-06, "loss": 0.0068, "step": 1836 }, { "epoch": 1.9095634095634095, "grad_norm": 7.909749984741211, "learning_rate": 6.869946869946871e-06, "loss": 0.5693, "step": 1837 }, { "epoch": 1.9106029106029108, "grad_norm": 5.598539352416992, "learning_rate": 6.867636867636867e-06, "loss": 0.2707, "step": 1838 }, { "epoch": 1.9116424116424118, "grad_norm": 5.760708332061768, "learning_rate": 6.8653268653268654e-06, "loss": 0.1846, "step": 1839 }, { "epoch": 1.9126819126819128, "grad_norm": 3.930959463119507, "learning_rate": 6.863016863016864e-06, "loss": 0.0626, "step": 1840 }, { "epoch": 1.9137214137214138, "grad_norm": 0.062302954494953156, "learning_rate": 6.860706860706862e-06, "loss": 0.0013, "step": 1841 }, { "epoch": 1.9147609147609148, "grad_norm": 2.7454307079315186, "learning_rate": 6.858396858396858e-06, "loss": 0.0692, "step": 1842 }, { "epoch": 1.9158004158004158, "grad_norm": 3.4849820137023926, "learning_rate": 6.8560868560868565e-06, "loss": 0.1363, "step": 1843 }, { "epoch": 1.9168399168399168, "grad_norm": 8.270879745483398, "learning_rate": 6.853776853776855e-06, "loss": 0.4004, "step": 1844 }, { "epoch": 1.9178794178794178, "grad_norm": 1.630702018737793, "learning_rate": 6.851466851466853e-06, "loss": 0.046, "step": 1845 }, { "epoch": 1.9189189189189189, "grad_norm": 0.4970608353614807, "learning_rate": 6.849156849156849e-06, "loss": 0.009, "step": 1846 }, { "epoch": 1.9199584199584199, "grad_norm": 0.20573386549949646, "learning_rate": 6.846846846846848e-06, "loss": 0.0041, "step": 
1847 }, { "epoch": 1.9209979209979209, "grad_norm": 1.271240472793579, "learning_rate": 6.844536844536846e-06, "loss": 0.0353, "step": 1848 }, { "epoch": 1.922037422037422, "grad_norm": 0.43870508670806885, "learning_rate": 6.842226842226842e-06, "loss": 0.0148, "step": 1849 }, { "epoch": 1.9230769230769231, "grad_norm": 7.131784439086914, "learning_rate": 6.8399168399168405e-06, "loss": 0.3034, "step": 1850 }, { "epoch": 1.9241164241164241, "grad_norm": 5.150654315948486, "learning_rate": 6.837606837606839e-06, "loss": 0.1394, "step": 1851 }, { "epoch": 1.9251559251559252, "grad_norm": 13.46895694732666, "learning_rate": 6.835296835296836e-06, "loss": 1.7075, "step": 1852 }, { "epoch": 1.9261954261954262, "grad_norm": 0.38688424229621887, "learning_rate": 6.832986832986833e-06, "loss": 0.0091, "step": 1853 }, { "epoch": 1.9272349272349274, "grad_norm": 6.538180828094482, "learning_rate": 6.8306768306768315e-06, "loss": 0.7274, "step": 1854 }, { "epoch": 1.9282744282744284, "grad_norm": 5.841011047363281, "learning_rate": 6.828366828366829e-06, "loss": 0.1061, "step": 1855 }, { "epoch": 1.9293139293139294, "grad_norm": 10.935325622558594, "learning_rate": 6.826056826056826e-06, "loss": 1.1941, "step": 1856 }, { "epoch": 1.9303534303534304, "grad_norm": 10.663113594055176, "learning_rate": 6.823746823746824e-06, "loss": 0.792, "step": 1857 }, { "epoch": 1.9313929313929314, "grad_norm": 3.2219295501708984, "learning_rate": 6.821436821436822e-06, "loss": 0.1102, "step": 1858 }, { "epoch": 1.9324324324324325, "grad_norm": 0.5760859847068787, "learning_rate": 6.81912681912682e-06, "loss": 0.0083, "step": 1859 }, { "epoch": 1.9334719334719335, "grad_norm": 4.954216003417969, "learning_rate": 6.816816816816817e-06, "loss": 0.1843, "step": 1860 }, { "epoch": 1.9345114345114345, "grad_norm": 10.934431076049805, "learning_rate": 6.814506814506815e-06, "loss": 0.5659, "step": 1861 }, { "epoch": 1.9355509355509355, "grad_norm": 6.139547824859619, "learning_rate": 
6.812196812196813e-06, "loss": 0.3543, "step": 1862 }, { "epoch": 1.9365904365904365, "grad_norm": 4.316939830780029, "learning_rate": 6.809886809886811e-06, "loss": 0.1155, "step": 1863 }, { "epoch": 1.9376299376299375, "grad_norm": 9.48974895477295, "learning_rate": 6.8075768075768075e-06, "loss": 0.64, "step": 1864 }, { "epoch": 1.9386694386694385, "grad_norm": 1.2552036046981812, "learning_rate": 6.805266805266806e-06, "loss": 0.0173, "step": 1865 }, { "epoch": 1.9397089397089398, "grad_norm": 0.9558032155036926, "learning_rate": 6.802956802956804e-06, "loss": 0.0376, "step": 1866 }, { "epoch": 1.9407484407484408, "grad_norm": 1.8135472536087036, "learning_rate": 6.8006468006468e-06, "loss": 0.0469, "step": 1867 }, { "epoch": 1.9417879417879418, "grad_norm": 7.1318511962890625, "learning_rate": 6.7983367983367986e-06, "loss": 0.7816, "step": 1868 }, { "epoch": 1.9428274428274428, "grad_norm": 0.13948020339012146, "learning_rate": 6.796026796026797e-06, "loss": 0.0017, "step": 1869 }, { "epoch": 1.943866943866944, "grad_norm": 0.0776764452457428, "learning_rate": 6.793716793716795e-06, "loss": 0.0011, "step": 1870 }, { "epoch": 1.944906444906445, "grad_norm": 0.2135138213634491, "learning_rate": 6.7914067914067915e-06, "loss": 0.0061, "step": 1871 }, { "epoch": 1.945945945945946, "grad_norm": 0.08584604412317276, "learning_rate": 6.78909678909679e-06, "loss": 0.0027, "step": 1872 }, { "epoch": 1.946985446985447, "grad_norm": 4.477102279663086, "learning_rate": 6.786786786786788e-06, "loss": 0.1093, "step": 1873 }, { "epoch": 1.948024948024948, "grad_norm": 0.3456413745880127, "learning_rate": 6.784476784476784e-06, "loss": 0.0076, "step": 1874 }, { "epoch": 1.949064449064449, "grad_norm": 4.4420013427734375, "learning_rate": 6.7821667821667825e-06, "loss": 0.2816, "step": 1875 }, { "epoch": 1.95010395010395, "grad_norm": 9.357203483581543, "learning_rate": 6.779856779856781e-06, "loss": 0.3661, "step": 1876 }, { "epoch": 1.9511434511434511, "grad_norm": 
11.721562385559082, "learning_rate": 6.777546777546778e-06, "loss": 0.8204, "step": 1877 }, { "epoch": 1.9521829521829521, "grad_norm": 0.9358542561531067, "learning_rate": 6.775236775236775e-06, "loss": 0.0355, "step": 1878 }, { "epoch": 1.9532224532224531, "grad_norm": 10.567963600158691, "learning_rate": 6.772926772926774e-06, "loss": 0.4664, "step": 1879 }, { "epoch": 1.9542619542619541, "grad_norm": 3.096851348876953, "learning_rate": 6.770616770616772e-06, "loss": 0.0477, "step": 1880 }, { "epoch": 1.9553014553014552, "grad_norm": 6.202308177947998, "learning_rate": 6.768306768306769e-06, "loss": 0.4278, "step": 1881 }, { "epoch": 1.9563409563409564, "grad_norm": 1.1024806499481201, "learning_rate": 6.7659967659967665e-06, "loss": 0.0193, "step": 1882 }, { "epoch": 1.9573804573804574, "grad_norm": 0.1680949330329895, "learning_rate": 6.763686763686765e-06, "loss": 0.0034, "step": 1883 }, { "epoch": 1.9584199584199584, "grad_norm": 0.901910126209259, "learning_rate": 6.761376761376762e-06, "loss": 0.0241, "step": 1884 }, { "epoch": 1.9594594594594594, "grad_norm": 2.4601571559906006, "learning_rate": 6.759066759066759e-06, "loss": 0.0994, "step": 1885 }, { "epoch": 1.9604989604989607, "grad_norm": 4.021463871002197, "learning_rate": 6.7567567567567575e-06, "loss": 0.07, "step": 1886 }, { "epoch": 1.9615384615384617, "grad_norm": 0.061176471412181854, "learning_rate": 6.754446754446755e-06, "loss": 0.0015, "step": 1887 }, { "epoch": 1.9625779625779627, "grad_norm": 4.875643730163574, "learning_rate": 6.752136752136753e-06, "loss": 0.251, "step": 1888 }, { "epoch": 1.9636174636174637, "grad_norm": 2.630978584289551, "learning_rate": 6.74982674982675e-06, "loss": 0.0967, "step": 1889 }, { "epoch": 1.9646569646569647, "grad_norm": 1.9153410196304321, "learning_rate": 6.747516747516748e-06, "loss": 0.054, "step": 1890 }, { "epoch": 1.9656964656964657, "grad_norm": 0.2559313178062439, "learning_rate": 6.745206745206746e-06, "loss": 0.0044, "step": 1891 }, { "epoch": 
1.9667359667359667, "grad_norm": 6.092691421508789, "learning_rate": 6.742896742896743e-06, "loss": 0.2965, "step": 1892 }, { "epoch": 1.9677754677754677, "grad_norm": 2.28495454788208, "learning_rate": 6.740586740586741e-06, "loss": 0.0319, "step": 1893 }, { "epoch": 1.9688149688149688, "grad_norm": 0.785499095916748, "learning_rate": 6.738276738276739e-06, "loss": 0.0205, "step": 1894 }, { "epoch": 1.9698544698544698, "grad_norm": 7.484012603759766, "learning_rate": 6.735966735966737e-06, "loss": 0.3765, "step": 1895 }, { "epoch": 1.9708939708939708, "grad_norm": 0.0553046315908432, "learning_rate": 6.7336567336567335e-06, "loss": 0.0015, "step": 1896 }, { "epoch": 1.9719334719334718, "grad_norm": 3.8778178691864014, "learning_rate": 6.731346731346732e-06, "loss": 0.2224, "step": 1897 }, { "epoch": 1.972972972972973, "grad_norm": 3.9583427906036377, "learning_rate": 6.72903672903673e-06, "loss": 0.1476, "step": 1898 }, { "epoch": 1.974012474012474, "grad_norm": 2.645249605178833, "learning_rate": 6.726726726726728e-06, "loss": 0.1043, "step": 1899 }, { "epoch": 1.975051975051975, "grad_norm": 0.6136985421180725, "learning_rate": 6.724416724416725e-06, "loss": 0.0089, "step": 1900 }, { "epoch": 1.976091476091476, "grad_norm": 10.764155387878418, "learning_rate": 6.722106722106723e-06, "loss": 0.6656, "step": 1901 }, { "epoch": 1.9771309771309773, "grad_norm": 1.2077161073684692, "learning_rate": 6.719796719796721e-06, "loss": 0.0225, "step": 1902 }, { "epoch": 1.9781704781704783, "grad_norm": 7.769934177398682, "learning_rate": 6.7174867174867175e-06, "loss": 0.2133, "step": 1903 }, { "epoch": 1.9792099792099793, "grad_norm": 10.644739151000977, "learning_rate": 6.715176715176716e-06, "loss": 1.0806, "step": 1904 }, { "epoch": 1.9802494802494803, "grad_norm": 4.899578094482422, "learning_rate": 6.712866712866714e-06, "loss": 0.3573, "step": 1905 }, { "epoch": 1.9812889812889813, "grad_norm": 8.201702117919922, "learning_rate": 6.710556710556711e-06, "loss": 
0.2393, "step": 1906 }, { "epoch": 1.9823284823284824, "grad_norm": 2.221436023712158, "learning_rate": 6.7082467082467085e-06, "loss": 0.0599, "step": 1907 }, { "epoch": 1.9833679833679834, "grad_norm": 1.037449836730957, "learning_rate": 6.705936705936707e-06, "loss": 0.0224, "step": 1908 }, { "epoch": 1.9844074844074844, "grad_norm": 0.08712287992238998, "learning_rate": 6.703626703626704e-06, "loss": 0.0016, "step": 1909 }, { "epoch": 1.9854469854469854, "grad_norm": 3.80332350730896, "learning_rate": 6.701316701316701e-06, "loss": 0.1323, "step": 1910 }, { "epoch": 1.9864864864864864, "grad_norm": 0.1044154092669487, "learning_rate": 6.6990066990067e-06, "loss": 0.0022, "step": 1911 }, { "epoch": 1.9875259875259874, "grad_norm": 0.1784604787826538, "learning_rate": 6.696696696696697e-06, "loss": 0.0067, "step": 1912 }, { "epoch": 1.9885654885654884, "grad_norm": 0.02075844444334507, "learning_rate": 6.694386694386695e-06, "loss": 0.0004, "step": 1913 }, { "epoch": 1.9896049896049897, "grad_norm": 10.95056438446045, "learning_rate": 6.6920766920766925e-06, "loss": 1.0339, "step": 1914 }, { "epoch": 1.9906444906444907, "grad_norm": 12.065203666687012, "learning_rate": 6.68976668976669e-06, "loss": 1.3071, "step": 1915 }, { "epoch": 1.9916839916839917, "grad_norm": 7.796991348266602, "learning_rate": 6.687456687456688e-06, "loss": 0.4829, "step": 1916 }, { "epoch": 1.9927234927234927, "grad_norm": 6.960740566253662, "learning_rate": 6.685146685146686e-06, "loss": 0.1915, "step": 1917 }, { "epoch": 1.993762993762994, "grad_norm": 11.084587097167969, "learning_rate": 6.682836682836683e-06, "loss": 1.2358, "step": 1918 }, { "epoch": 1.994802494802495, "grad_norm": 4.746704578399658, "learning_rate": 6.680526680526681e-06, "loss": 0.3151, "step": 1919 }, { "epoch": 1.995841995841996, "grad_norm": 8.840492248535156, "learning_rate": 6.678216678216679e-06, "loss": 0.8359, "step": 1920 }, { "epoch": 1.996881496881497, "grad_norm": 2.5447044372558594, "learning_rate": 
6.675906675906676e-06, "loss": 0.0656, "step": 1921 }, { "epoch": 1.997920997920998, "grad_norm": 0.010701720602810383, "learning_rate": 6.673596673596674e-06, "loss": 0.0002, "step": 1922 }, { "epoch": 1.998960498960499, "grad_norm": 9.35302448272705, "learning_rate": 6.671286671286672e-06, "loss": 0.5405, "step": 1923 }, { "epoch": 2.0, "grad_norm": 31.449922561645508, "learning_rate": 6.66897666897667e-06, "loss": 1.1776, "step": 1924 }, { "epoch": 2.001039501039501, "grad_norm": 14.181282043457031, "learning_rate": 6.666666666666667e-06, "loss": 1.1872, "step": 1925 }, { "epoch": 2.002079002079002, "grad_norm": 4.408305644989014, "learning_rate": 6.664356664356665e-06, "loss": 0.2631, "step": 1926 }, { "epoch": 2.003118503118503, "grad_norm": 1.584025502204895, "learning_rate": 6.662046662046663e-06, "loss": 0.0318, "step": 1927 }, { "epoch": 2.004158004158004, "grad_norm": 0.3120782673358917, "learning_rate": 6.659736659736661e-06, "loss": 0.0054, "step": 1928 }, { "epoch": 2.005197505197505, "grad_norm": 0.15407142043113708, "learning_rate": 6.657426657426658e-06, "loss": 0.0044, "step": 1929 }, { "epoch": 2.006237006237006, "grad_norm": 6.331607341766357, "learning_rate": 6.655116655116656e-06, "loss": 0.1093, "step": 1930 }, { "epoch": 2.007276507276507, "grad_norm": 6.542793273925781, "learning_rate": 6.652806652806654e-06, "loss": 0.2508, "step": 1931 }, { "epoch": 2.008316008316008, "grad_norm": 1.4102624654769897, "learning_rate": 6.650496650496651e-06, "loss": 0.0119, "step": 1932 }, { "epoch": 2.0093555093555096, "grad_norm": 0.21904535591602325, "learning_rate": 6.648186648186649e-06, "loss": 0.0042, "step": 1933 }, { "epoch": 2.0103950103950106, "grad_norm": 0.6024210453033447, "learning_rate": 6.645876645876647e-06, "loss": 0.0171, "step": 1934 }, { "epoch": 2.0114345114345116, "grad_norm": 3.4335834980010986, "learning_rate": 6.643566643566644e-06, "loss": 0.0419, "step": 1935 }, { "epoch": 2.0124740124740126, "grad_norm": 25.000988006591797, 
"learning_rate": 6.641256641256642e-06, "loss": 0.9182, "step": 1936 }, { "epoch": 2.0135135135135136, "grad_norm": 3.822493314743042, "learning_rate": 6.63894663894664e-06, "loss": 0.2063, "step": 1937 }, { "epoch": 2.0145530145530146, "grad_norm": 0.0064015756361186504, "learning_rate": 6.636636636636637e-06, "loss": 0.0001, "step": 1938 }, { "epoch": 2.0155925155925156, "grad_norm": 2.934967279434204, "learning_rate": 6.6343266343266345e-06, "loss": 0.0684, "step": 1939 }, { "epoch": 2.0166320166320166, "grad_norm": 3.974259614944458, "learning_rate": 6.632016632016633e-06, "loss": 0.0803, "step": 1940 }, { "epoch": 2.0176715176715176, "grad_norm": 8.846273422241211, "learning_rate": 6.62970662970663e-06, "loss": 0.2658, "step": 1941 }, { "epoch": 2.0187110187110187, "grad_norm": 0.822585940361023, "learning_rate": 6.627396627396628e-06, "loss": 0.0143, "step": 1942 }, { "epoch": 2.0197505197505197, "grad_norm": 1.864228367805481, "learning_rate": 6.625086625086626e-06, "loss": 0.0349, "step": 1943 }, { "epoch": 2.0207900207900207, "grad_norm": 0.940016508102417, "learning_rate": 6.622776622776623e-06, "loss": 0.0192, "step": 1944 }, { "epoch": 2.0218295218295217, "grad_norm": 5.850529193878174, "learning_rate": 6.620466620466621e-06, "loss": 0.3462, "step": 1945 }, { "epoch": 2.0228690228690227, "grad_norm": 3.6243691444396973, "learning_rate": 6.618156618156619e-06, "loss": 0.0886, "step": 1946 }, { "epoch": 2.0239085239085237, "grad_norm": 0.3001081645488739, "learning_rate": 6.615846615846616e-06, "loss": 0.0076, "step": 1947 }, { "epoch": 2.024948024948025, "grad_norm": 4.391566276550293, "learning_rate": 6.613536613536614e-06, "loss": 0.125, "step": 1948 }, { "epoch": 2.025987525987526, "grad_norm": 0.49894315004348755, "learning_rate": 6.611226611226612e-06, "loss": 0.0112, "step": 1949 }, { "epoch": 2.027027027027027, "grad_norm": 8.597855567932129, "learning_rate": 6.608916608916609e-06, "loss": 0.4718, "step": 1950 }, { "epoch": 2.028066528066528, 
"grad_norm": 7.421142101287842, "learning_rate": 6.606606606606607e-06, "loss": 0.1622, "step": 1951 }, { "epoch": 2.029106029106029, "grad_norm": 0.04691620543599129, "learning_rate": 6.604296604296605e-06, "loss": 0.0009, "step": 1952 }, { "epoch": 2.0301455301455302, "grad_norm": 0.14164651930332184, "learning_rate": 6.601986601986603e-06, "loss": 0.0031, "step": 1953 }, { "epoch": 2.0311850311850312, "grad_norm": 1.1138488054275513, "learning_rate": 6.5996765996766e-06, "loss": 0.0305, "step": 1954 }, { "epoch": 2.0322245322245323, "grad_norm": 0.37105488777160645, "learning_rate": 6.597366597366598e-06, "loss": 0.0055, "step": 1955 }, { "epoch": 2.0332640332640333, "grad_norm": 6.09819221496582, "learning_rate": 6.595056595056596e-06, "loss": 0.3586, "step": 1956 }, { "epoch": 2.0343035343035343, "grad_norm": 0.18604886531829834, "learning_rate": 6.592746592746593e-06, "loss": 0.0034, "step": 1957 }, { "epoch": 2.0353430353430353, "grad_norm": 5.702450752258301, "learning_rate": 6.590436590436591e-06, "loss": 0.1071, "step": 1958 }, { "epoch": 2.0363825363825363, "grad_norm": 0.5846487283706665, "learning_rate": 6.588126588126589e-06, "loss": 0.0127, "step": 1959 }, { "epoch": 2.0374220374220373, "grad_norm": 11.294425964355469, "learning_rate": 6.585816585816586e-06, "loss": 0.682, "step": 1960 }, { "epoch": 2.0384615384615383, "grad_norm": 0.3569445013999939, "learning_rate": 6.583506583506584e-06, "loss": 0.0099, "step": 1961 }, { "epoch": 2.0395010395010393, "grad_norm": 4.077264785766602, "learning_rate": 6.581196581196582e-06, "loss": 0.1448, "step": 1962 }, { "epoch": 2.0405405405405403, "grad_norm": 3.6174886226654053, "learning_rate": 6.578886578886579e-06, "loss": 0.1382, "step": 1963 }, { "epoch": 2.0415800415800414, "grad_norm": 0.5434964299201965, "learning_rate": 6.5765765765765775e-06, "loss": 0.0201, "step": 1964 }, { "epoch": 2.042619542619543, "grad_norm": 0.6819065809249878, "learning_rate": 6.574266574266575e-06, "loss": 0.0219, "step": 
1965 }, { "epoch": 2.043659043659044, "grad_norm": 14.37651538848877, "learning_rate": 6.571956571956572e-06, "loss": 1.741, "step": 1966 }, { "epoch": 2.044698544698545, "grad_norm": 8.81903076171875, "learning_rate": 6.56964656964657e-06, "loss": 0.2543, "step": 1967 }, { "epoch": 2.045738045738046, "grad_norm": 6.487217903137207, "learning_rate": 6.567336567336568e-06, "loss": 0.2681, "step": 1968 }, { "epoch": 2.046777546777547, "grad_norm": 9.601208686828613, "learning_rate": 6.565026565026565e-06, "loss": 0.5295, "step": 1969 }, { "epoch": 2.047817047817048, "grad_norm": 14.670531272888184, "learning_rate": 6.562716562716563e-06, "loss": 2.0842, "step": 1970 }, { "epoch": 2.048856548856549, "grad_norm": 11.62336540222168, "learning_rate": 6.560406560406561e-06, "loss": 0.4462, "step": 1971 }, { "epoch": 2.04989604989605, "grad_norm": 0.23016346991062164, "learning_rate": 6.558096558096559e-06, "loss": 0.0071, "step": 1972 }, { "epoch": 2.050935550935551, "grad_norm": 5.836270332336426, "learning_rate": 6.555786555786556e-06, "loss": 0.3069, "step": 1973 }, { "epoch": 2.051975051975052, "grad_norm": 0.14788639545440674, "learning_rate": 6.553476553476554e-06, "loss": 0.003, "step": 1974 }, { "epoch": 2.053014553014553, "grad_norm": 5.114451885223389, "learning_rate": 6.551166551166552e-06, "loss": 0.3132, "step": 1975 }, { "epoch": 2.054054054054054, "grad_norm": 0.022919783368706703, "learning_rate": 6.548856548856549e-06, "loss": 0.0006, "step": 1976 }, { "epoch": 2.055093555093555, "grad_norm": 10.959732055664062, "learning_rate": 6.546546546546547e-06, "loss": 0.6432, "step": 1977 }, { "epoch": 2.056133056133056, "grad_norm": 0.09819675981998444, "learning_rate": 6.544236544236545e-06, "loss": 0.002, "step": 1978 }, { "epoch": 2.057172557172557, "grad_norm": 17.36860466003418, "learning_rate": 6.541926541926542e-06, "loss": 1.5427, "step": 1979 }, { "epoch": 2.0582120582120584, "grad_norm": 2.121630907058716, "learning_rate": 6.53961653961654e-06, "loss": 
0.0472, "step": 1980 }, { "epoch": 2.0592515592515594, "grad_norm": 12.962278366088867, "learning_rate": 6.537306537306538e-06, "loss": 1.041, "step": 1981 }, { "epoch": 2.0602910602910605, "grad_norm": 8.771272659301758, "learning_rate": 6.534996534996536e-06, "loss": 0.389, "step": 1982 }, { "epoch": 2.0613305613305615, "grad_norm": 8.74985408782959, "learning_rate": 6.532686532686533e-06, "loss": 0.5721, "step": 1983 }, { "epoch": 2.0623700623700625, "grad_norm": 5.376676559448242, "learning_rate": 6.530376530376531e-06, "loss": 0.2728, "step": 1984 }, { "epoch": 2.0634095634095635, "grad_norm": 6.670430660247803, "learning_rate": 6.528066528066529e-06, "loss": 0.6577, "step": 1985 }, { "epoch": 2.0644490644490645, "grad_norm": 6.218632698059082, "learning_rate": 6.525756525756526e-06, "loss": 0.1806, "step": 1986 }, { "epoch": 2.0654885654885655, "grad_norm": 0.47197651863098145, "learning_rate": 6.523446523446524e-06, "loss": 0.0076, "step": 1987 }, { "epoch": 2.0665280665280665, "grad_norm": 3.8785648345947266, "learning_rate": 6.521136521136522e-06, "loss": 0.0395, "step": 1988 }, { "epoch": 2.0675675675675675, "grad_norm": 0.2819497883319855, "learning_rate": 6.5188265188265195e-06, "loss": 0.0078, "step": 1989 }, { "epoch": 2.0686070686070686, "grad_norm": 2.064453125, "learning_rate": 6.516516516516517e-06, "loss": 0.0416, "step": 1990 }, { "epoch": 2.0696465696465696, "grad_norm": 0.17872098088264465, "learning_rate": 6.514206514206515e-06, "loss": 0.007, "step": 1991 }, { "epoch": 2.0706860706860706, "grad_norm": 4.090879440307617, "learning_rate": 6.511896511896512e-06, "loss": 0.1246, "step": 1992 }, { "epoch": 2.0717255717255716, "grad_norm": 9.653943061828613, "learning_rate": 6.50958650958651e-06, "loss": 1.0796, "step": 1993 }, { "epoch": 2.0727650727650726, "grad_norm": 5.251039981842041, "learning_rate": 6.507276507276508e-06, "loss": 0.2909, "step": 1994 }, { "epoch": 2.0738045738045736, "grad_norm": 1.227087378501892, "learning_rate": 
6.504966504966505e-06, "loss": 0.049, "step": 1995 }, { "epoch": 2.0748440748440746, "grad_norm": 8.762991905212402, "learning_rate": 6.5026565026565035e-06, "loss": 0.8551, "step": 1996 }, { "epoch": 2.075883575883576, "grad_norm": 2.4482312202453613, "learning_rate": 6.500346500346501e-06, "loss": 0.0475, "step": 1997 }, { "epoch": 2.076923076923077, "grad_norm": 9.400773048400879, "learning_rate": 6.498036498036498e-06, "loss": 0.9039, "step": 1998 }, { "epoch": 2.077962577962578, "grad_norm": 9.062102317810059, "learning_rate": 6.495726495726496e-06, "loss": 0.4474, "step": 1999 }, { "epoch": 2.079002079002079, "grad_norm": 6.728084564208984, "learning_rate": 6.4934164934164945e-06, "loss": 0.417, "step": 2000 }, { "epoch": 2.08004158004158, "grad_norm": 5.888612270355225, "learning_rate": 6.491106491106491e-06, "loss": 0.1305, "step": 2001 }, { "epoch": 2.081081081081081, "grad_norm": 1.8210113048553467, "learning_rate": 6.488796488796489e-06, "loss": 0.0545, "step": 2002 }, { "epoch": 2.082120582120582, "grad_norm": 3.953402280807495, "learning_rate": 6.486486486486487e-06, "loss": 0.156, "step": 2003 }, { "epoch": 2.083160083160083, "grad_norm": 1.5353587865829468, "learning_rate": 6.484176484176484e-06, "loss": 0.037, "step": 2004 }, { "epoch": 2.084199584199584, "grad_norm": 6.91809606552124, "learning_rate": 6.481866481866482e-06, "loss": 0.8773, "step": 2005 }, { "epoch": 2.085239085239085, "grad_norm": 4.051370620727539, "learning_rate": 6.47955647955648e-06, "loss": 0.2595, "step": 2006 }, { "epoch": 2.086278586278586, "grad_norm": 10.553234100341797, "learning_rate": 6.4772464772464785e-06, "loss": 0.7355, "step": 2007 }, { "epoch": 2.087318087318087, "grad_norm": 0.09537404030561447, "learning_rate": 6.474936474936475e-06, "loss": 0.0029, "step": 2008 }, { "epoch": 2.0883575883575882, "grad_norm": 11.784072875976562, "learning_rate": 6.472626472626473e-06, "loss": 0.1916, "step": 2009 }, { "epoch": 2.0893970893970892, "grad_norm": 1.224432349205017, 
"learning_rate": 6.470316470316471e-06, "loss": 0.0435, "step": 2010 }, { "epoch": 2.0904365904365902, "grad_norm": 3.884338617324829, "learning_rate": 6.468006468006468e-06, "loss": 0.0429, "step": 2011 }, { "epoch": 2.0914760914760917, "grad_norm": 2.474146842956543, "learning_rate": 6.465696465696466e-06, "loss": 0.062, "step": 2012 }, { "epoch": 2.0925155925155927, "grad_norm": 1.0791113376617432, "learning_rate": 6.463386463386464e-06, "loss": 0.0222, "step": 2013 }, { "epoch": 2.0935550935550937, "grad_norm": 9.401138305664062, "learning_rate": 6.461076461076462e-06, "loss": 0.6004, "step": 2014 }, { "epoch": 2.0945945945945947, "grad_norm": 1.2327183485031128, "learning_rate": 6.458766458766459e-06, "loss": 0.0233, "step": 2015 }, { "epoch": 2.0956340956340958, "grad_norm": 0.08808104693889618, "learning_rate": 6.456456456456457e-06, "loss": 0.0018, "step": 2016 }, { "epoch": 2.0966735966735968, "grad_norm": 4.59703254699707, "learning_rate": 6.454146454146455e-06, "loss": 0.0998, "step": 2017 }, { "epoch": 2.0977130977130978, "grad_norm": 8.11798095703125, "learning_rate": 6.451836451836453e-06, "loss": 0.5019, "step": 2018 }, { "epoch": 2.098752598752599, "grad_norm": 2.0150322914123535, "learning_rate": 6.44952644952645e-06, "loss": 0.0466, "step": 2019 }, { "epoch": 2.0997920997921, "grad_norm": 4.207459926605225, "learning_rate": 6.447216447216448e-06, "loss": 0.1545, "step": 2020 }, { "epoch": 2.100831600831601, "grad_norm": 2.6554224491119385, "learning_rate": 6.4449064449064455e-06, "loss": 0.0605, "step": 2021 }, { "epoch": 2.101871101871102, "grad_norm": 1.8698917627334595, "learning_rate": 6.442596442596443e-06, "loss": 0.0471, "step": 2022 }, { "epoch": 2.102910602910603, "grad_norm": 6.580633640289307, "learning_rate": 6.440286440286441e-06, "loss": 0.1459, "step": 2023 }, { "epoch": 2.103950103950104, "grad_norm": 8.103753089904785, "learning_rate": 6.437976437976438e-06, "loss": 0.2792, "step": 2024 }, { "epoch": 2.104989604989605, 
"grad_norm": 0.27463245391845703, "learning_rate": 6.435666435666437e-06, "loss": 0.0083, "step": 2025 }, { "epoch": 2.106029106029106, "grad_norm": 1.2727649211883545, "learning_rate": 6.433356433356434e-06, "loss": 0.0291, "step": 2026 }, { "epoch": 2.107068607068607, "grad_norm": 8.371627807617188, "learning_rate": 6.431046431046431e-06, "loss": 0.2413, "step": 2027 }, { "epoch": 2.108108108108108, "grad_norm": 7.761322021484375, "learning_rate": 6.4287364287364295e-06, "loss": 0.474, "step": 2028 }, { "epoch": 2.1091476091476093, "grad_norm": 3.2681360244750977, "learning_rate": 6.426426426426427e-06, "loss": 0.1832, "step": 2029 }, { "epoch": 2.1101871101871104, "grad_norm": 4.478689193725586, "learning_rate": 6.424116424116424e-06, "loss": 0.3446, "step": 2030 }, { "epoch": 2.1112266112266114, "grad_norm": 7.7292985916137695, "learning_rate": 6.421806421806422e-06, "loss": 0.2425, "step": 2031 }, { "epoch": 2.1122661122661124, "grad_norm": 3.809605598449707, "learning_rate": 6.4194964194964205e-06, "loss": 0.0945, "step": 2032 }, { "epoch": 2.1133056133056134, "grad_norm": 0.0223262719810009, "learning_rate": 6.417186417186417e-06, "loss": 0.0005, "step": 2033 }, { "epoch": 2.1143451143451144, "grad_norm": 3.581709861755371, "learning_rate": 6.414876414876415e-06, "loss": 0.0982, "step": 2034 }, { "epoch": 2.1153846153846154, "grad_norm": 1.2276995182037354, "learning_rate": 6.412566412566413e-06, "loss": 0.02, "step": 2035 }, { "epoch": 2.1164241164241164, "grad_norm": 11.483390808105469, "learning_rate": 6.410256410256412e-06, "loss": 1.2713, "step": 2036 }, { "epoch": 2.1174636174636174, "grad_norm": 0.03083518147468567, "learning_rate": 6.407946407946408e-06, "loss": 0.0009, "step": 2037 }, { "epoch": 2.1185031185031185, "grad_norm": 6.554765701293945, "learning_rate": 6.405636405636406e-06, "loss": 0.3977, "step": 2038 }, { "epoch": 2.1195426195426195, "grad_norm": 1.5237971544265747, "learning_rate": 6.4033264033264045e-06, "loss": 0.0328, "step": 2039 
}, { "epoch": 2.1205821205821205, "grad_norm": 9.13638687133789, "learning_rate": 6.401016401016401e-06, "loss": 0.679, "step": 2040 }, { "epoch": 2.1216216216216215, "grad_norm": 6.517833709716797, "learning_rate": 6.398706398706399e-06, "loss": 0.2226, "step": 2041 }, { "epoch": 2.1226611226611225, "grad_norm": 0.15088814496994019, "learning_rate": 6.396396396396397e-06, "loss": 0.0047, "step": 2042 }, { "epoch": 2.1237006237006235, "grad_norm": 7.6674723625183105, "learning_rate": 6.394086394086395e-06, "loss": 0.4219, "step": 2043 }, { "epoch": 2.124740124740125, "grad_norm": 11.87431526184082, "learning_rate": 6.391776391776392e-06, "loss": 0.0741, "step": 2044 }, { "epoch": 2.125779625779626, "grad_norm": 9.523569107055664, "learning_rate": 6.38946638946639e-06, "loss": 0.6066, "step": 2045 }, { "epoch": 2.126819126819127, "grad_norm": 7.635879039764404, "learning_rate": 6.387156387156388e-06, "loss": 0.6455, "step": 2046 }, { "epoch": 2.127858627858628, "grad_norm": 1.5417159795761108, "learning_rate": 6.384846384846385e-06, "loss": 0.0354, "step": 2047 }, { "epoch": 2.128898128898129, "grad_norm": 5.412545204162598, "learning_rate": 6.382536382536383e-06, "loss": 0.1233, "step": 2048 }, { "epoch": 2.12993762993763, "grad_norm": 3.4599087238311768, "learning_rate": 6.3802263802263805e-06, "loss": 0.074, "step": 2049 }, { "epoch": 2.130977130977131, "grad_norm": 0.6040581464767456, "learning_rate": 6.377916377916379e-06, "loss": 0.0182, "step": 2050 }, { "epoch": 2.132016632016632, "grad_norm": 7.773816108703613, "learning_rate": 6.375606375606376e-06, "loss": 0.5669, "step": 2051 }, { "epoch": 2.133056133056133, "grad_norm": 0.4341773986816406, "learning_rate": 6.373296373296373e-06, "loss": 0.0112, "step": 2052 }, { "epoch": 2.134095634095634, "grad_norm": null, "learning_rate": 6.3709863709863715e-06, "loss": 0.2966, "step": 2053 }, { "epoch": 2.135135135135135, "grad_norm": 2.1740145683288574, "learning_rate": 6.36867636867637e-06, "loss": 0.0534, "step": 
2054 }, { "epoch": 2.136174636174636, "grad_norm": 0.956126868724823, "learning_rate": 6.366366366366366e-06, "loss": 0.0263, "step": 2055 }, { "epoch": 2.137214137214137, "grad_norm": 2.815540075302124, "learning_rate": 6.364056364056364e-06, "loss": 0.0602, "step": 2056 }, { "epoch": 2.138253638253638, "grad_norm": 5.632743835449219, "learning_rate": 6.361746361746363e-06, "loss": 0.1353, "step": 2057 }, { "epoch": 2.139293139293139, "grad_norm": 7.705263614654541, "learning_rate": 6.359436359436359e-06, "loss": 1.0678, "step": 2058 }, { "epoch": 2.14033264033264, "grad_norm": 10.012617111206055, "learning_rate": 6.357126357126357e-06, "loss": 0.8383, "step": 2059 }, { "epoch": 2.141372141372141, "grad_norm": 11.808959007263184, "learning_rate": 6.3548163548163555e-06, "loss": 0.9213, "step": 2060 }, { "epoch": 2.1424116424116426, "grad_norm": 6.768054008483887, "learning_rate": 6.352506352506354e-06, "loss": 0.7591, "step": 2061 }, { "epoch": 2.1434511434511436, "grad_norm": 8.046747207641602, "learning_rate": 6.35019635019635e-06, "loss": 0.8153, "step": 2062 }, { "epoch": 2.1444906444906446, "grad_norm": 4.566589832305908, "learning_rate": 6.347886347886348e-06, "loss": 0.3171, "step": 2063 }, { "epoch": 2.1455301455301456, "grad_norm": 7.478272914886475, "learning_rate": 6.3455763455763465e-06, "loss": 0.5232, "step": 2064 }, { "epoch": 2.1465696465696467, "grad_norm": 9.267080307006836, "learning_rate": 6.343266343266343e-06, "loss": 0.2926, "step": 2065 }, { "epoch": 2.1476091476091477, "grad_norm": 12.988465309143066, "learning_rate": 6.340956340956341e-06, "loss": 0.3343, "step": 2066 }, { "epoch": 2.1486486486486487, "grad_norm": 7.443843841552734, "learning_rate": 6.3386463386463394e-06, "loss": 0.4648, "step": 2067 }, { "epoch": 2.1496881496881497, "grad_norm": 1.7085425853729248, "learning_rate": 6.336336336336338e-06, "loss": 0.0593, "step": 2068 }, { "epoch": 2.1507276507276507, "grad_norm": 1.6968820095062256, "learning_rate": 
6.334026334026334e-06, "loss": 0.0684, "step": 2069 }, { "epoch": 2.1517671517671517, "grad_norm": 2.399233102798462, "learning_rate": 6.331716331716332e-06, "loss": 0.0861, "step": 2070 }, { "epoch": 2.1528066528066527, "grad_norm": 4.714275360107422, "learning_rate": 6.3294063294063305e-06, "loss": 0.2724, "step": 2071 }, { "epoch": 2.1538461538461537, "grad_norm": 8.669092178344727, "learning_rate": 6.327096327096328e-06, "loss": 0.5226, "step": 2072 }, { "epoch": 2.1548856548856548, "grad_norm": 0.6544547080993652, "learning_rate": 6.324786324786325e-06, "loss": 0.0181, "step": 2073 }, { "epoch": 2.1559251559251558, "grad_norm": 5.61388635635376, "learning_rate": 6.322476322476323e-06, "loss": 0.3196, "step": 2074 }, { "epoch": 2.156964656964657, "grad_norm": 11.46261215209961, "learning_rate": 6.320166320166321e-06, "loss": 0.6334, "step": 2075 }, { "epoch": 2.1580041580041582, "grad_norm": 1.828363060951233, "learning_rate": 6.317856317856318e-06, "loss": 0.062, "step": 2076 }, { "epoch": 2.1590436590436592, "grad_norm": 4.612522125244141, "learning_rate": 6.315546315546316e-06, "loss": 0.2484, "step": 2077 }, { "epoch": 2.1600831600831603, "grad_norm": 3.598356246948242, "learning_rate": 6.313236313236314e-06, "loss": 0.0767, "step": 2078 }, { "epoch": 2.1611226611226613, "grad_norm": 7.803494453430176, "learning_rate": 6.310926310926312e-06, "loss": 0.8641, "step": 2079 }, { "epoch": 2.1621621621621623, "grad_norm": 4.039019584655762, "learning_rate": 6.308616308616309e-06, "loss": 0.3692, "step": 2080 }, { "epoch": 2.1632016632016633, "grad_norm": 0.3767847716808319, "learning_rate": 6.3063063063063065e-06, "loss": 0.0117, "step": 2081 }, { "epoch": 2.1642411642411643, "grad_norm": 1.3667638301849365, "learning_rate": 6.303996303996305e-06, "loss": 0.0471, "step": 2082 }, { "epoch": 2.1652806652806653, "grad_norm": 0.47995445132255554, "learning_rate": 6.301686301686302e-06, "loss": 0.0116, "step": 2083 }, { "epoch": 2.1663201663201663, "grad_norm": 
1.2004308700561523, "learning_rate": 6.299376299376299e-06, "loss": 0.0399, "step": 2084 }, { "epoch": 2.1673596673596673, "grad_norm": 5.898972988128662, "learning_rate": 6.2970662970662975e-06, "loss": 0.1451, "step": 2085 }, { "epoch": 2.1683991683991684, "grad_norm": 7.3125386238098145, "learning_rate": 6.294756294756296e-06, "loss": 0.4661, "step": 2086 }, { "epoch": 2.1694386694386694, "grad_norm": 6.4429521560668945, "learning_rate": 6.292446292446292e-06, "loss": 0.2921, "step": 2087 }, { "epoch": 2.1704781704781704, "grad_norm": 1.5604861974716187, "learning_rate": 6.29013629013629e-06, "loss": 0.0699, "step": 2088 }, { "epoch": 2.1715176715176714, "grad_norm": 5.064148902893066, "learning_rate": 6.287826287826289e-06, "loss": 0.2257, "step": 2089 }, { "epoch": 2.1725571725571724, "grad_norm": 0.1482357233762741, "learning_rate": 6.285516285516287e-06, "loss": 0.0038, "step": 2090 }, { "epoch": 2.1735966735966734, "grad_norm": 0.17243508994579315, "learning_rate": 6.283206283206283e-06, "loss": 0.0035, "step": 2091 }, { "epoch": 2.1746361746361744, "grad_norm": 5.148295879364014, "learning_rate": 6.2808962808962815e-06, "loss": 0.1856, "step": 2092 }, { "epoch": 2.175675675675676, "grad_norm": 1.1720030307769775, "learning_rate": 6.27858627858628e-06, "loss": 0.0359, "step": 2093 }, { "epoch": 2.176715176715177, "grad_norm": 0.01272608246654272, "learning_rate": 6.276276276276276e-06, "loss": 0.0003, "step": 2094 }, { "epoch": 2.177754677754678, "grad_norm": 0.7603575587272644, "learning_rate": 6.273966273966274e-06, "loss": 0.0186, "step": 2095 }, { "epoch": 2.178794178794179, "grad_norm": 6.515388011932373, "learning_rate": 6.2716562716562726e-06, "loss": 0.331, "step": 2096 }, { "epoch": 2.17983367983368, "grad_norm": 8.437469482421875, "learning_rate": 6.269346269346271e-06, "loss": 0.7164, "step": 2097 }, { "epoch": 2.180873180873181, "grad_norm": 5.623338222503662, "learning_rate": 6.267036267036267e-06, "loss": 0.5556, "step": 2098 }, { "epoch": 
2.181912681912682, "grad_norm": 8.537968635559082, "learning_rate": 6.2647262647262654e-06, "loss": 0.2938, "step": 2099 }, { "epoch": 2.182952182952183, "grad_norm": 0.11690367013216019, "learning_rate": 6.262416262416264e-06, "loss": 0.0028, "step": 2100 }, { "epoch": 2.183991683991684, "grad_norm": 7.875816345214844, "learning_rate": 6.26010626010626e-06, "loss": 0.7371, "step": 2101 }, { "epoch": 2.185031185031185, "grad_norm": 10.33857250213623, "learning_rate": 6.257796257796258e-06, "loss": 0.4181, "step": 2102 }, { "epoch": 2.186070686070686, "grad_norm": 7.2350006103515625, "learning_rate": 6.2554862554862565e-06, "loss": 0.2492, "step": 2103 }, { "epoch": 2.187110187110187, "grad_norm": 0.10823694616556168, "learning_rate": 6.253176253176254e-06, "loss": 0.0022, "step": 2104 }, { "epoch": 2.188149688149688, "grad_norm": 9.957201957702637, "learning_rate": 6.250866250866251e-06, "loss": 0.8395, "step": 2105 }, { "epoch": 2.189189189189189, "grad_norm": 5.966064453125, "learning_rate": 6.248556248556249e-06, "loss": 0.1804, "step": 2106 }, { "epoch": 2.19022869022869, "grad_norm": 10.796107292175293, "learning_rate": 6.246246246246247e-06, "loss": 0.8583, "step": 2107 }, { "epoch": 2.1912681912681915, "grad_norm": 2.4162845611572266, "learning_rate": 6.243936243936245e-06, "loss": 0.0993, "step": 2108 }, { "epoch": 2.1923076923076925, "grad_norm": 2.9660913944244385, "learning_rate": 6.241626241626242e-06, "loss": 0.1438, "step": 2109 }, { "epoch": 2.1933471933471935, "grad_norm": 0.12198910117149353, "learning_rate": 6.23931623931624e-06, "loss": 0.0041, "step": 2110 }, { "epoch": 2.1943866943866945, "grad_norm": 7.1431193351745605, "learning_rate": 6.237006237006238e-06, "loss": 0.6707, "step": 2111 }, { "epoch": 2.1954261954261955, "grad_norm": 8.095113754272461, "learning_rate": 6.234696234696235e-06, "loss": 0.8671, "step": 2112 }, { "epoch": 2.1964656964656966, "grad_norm": 0.2271644026041031, "learning_rate": 6.2323862323862325e-06, "loss": 0.0055, 
"step": 2113 }, { "epoch": 2.1975051975051976, "grad_norm": 2.605755567550659, "learning_rate": 6.230076230076231e-06, "loss": 0.0869, "step": 2114 }, { "epoch": 2.1985446985446986, "grad_norm": 0.5366362929344177, "learning_rate": 6.227766227766229e-06, "loss": 0.017, "step": 2115 }, { "epoch": 2.1995841995841996, "grad_norm": 3.6487009525299072, "learning_rate": 6.225456225456225e-06, "loss": 0.3753, "step": 2116 }, { "epoch": 2.2006237006237006, "grad_norm": 1.8559633493423462, "learning_rate": 6.2231462231462235e-06, "loss": 0.0677, "step": 2117 }, { "epoch": 2.2016632016632016, "grad_norm": 3.4091577529907227, "learning_rate": 6.220836220836222e-06, "loss": 0.1174, "step": 2118 }, { "epoch": 2.2027027027027026, "grad_norm": 4.887331962585449, "learning_rate": 6.218526218526218e-06, "loss": 0.1881, "step": 2119 }, { "epoch": 2.2037422037422036, "grad_norm": 6.882686614990234, "learning_rate": 6.2162162162162164e-06, "loss": 0.1934, "step": 2120 }, { "epoch": 2.2047817047817047, "grad_norm": 3.879831075668335, "learning_rate": 6.213906213906215e-06, "loss": 0.14, "step": 2121 }, { "epoch": 2.2058212058212057, "grad_norm": 0.22170592844486237, "learning_rate": 6.211596211596213e-06, "loss": 0.0072, "step": 2122 }, { "epoch": 2.2068607068607067, "grad_norm": 3.5209622383117676, "learning_rate": 6.209286209286209e-06, "loss": 0.2002, "step": 2123 }, { "epoch": 2.2079002079002077, "grad_norm": 0.3155515491962433, "learning_rate": 6.2069762069762075e-06, "loss": 0.0072, "step": 2124 }, { "epoch": 2.208939708939709, "grad_norm": 10.6986722946167, "learning_rate": 6.204666204666206e-06, "loss": 1.0673, "step": 2125 }, { "epoch": 2.20997920997921, "grad_norm": 1.0658669471740723, "learning_rate": 6.202356202356203e-06, "loss": 0.033, "step": 2126 }, { "epoch": 2.211018711018711, "grad_norm": 2.3950462341308594, "learning_rate": 6.2000462000462e-06, "loss": 0.039, "step": 2127 }, { "epoch": 2.212058212058212, "grad_norm": 0.15888486802577972, "learning_rate": 
6.1977361977361986e-06, "loss": 0.0045, "step": 2128 }, { "epoch": 2.213097713097713, "grad_norm": 0.27664533257484436, "learning_rate": 6.195426195426196e-06, "loss": 0.0083, "step": 2129 }, { "epoch": 2.214137214137214, "grad_norm": 5.933618068695068, "learning_rate": 6.193116193116193e-06, "loss": 0.0743, "step": 2130 }, { "epoch": 2.215176715176715, "grad_norm": 9.662496566772461, "learning_rate": 6.1908061908061914e-06, "loss": 1.094, "step": 2131 }, { "epoch": 2.2162162162162162, "grad_norm": 3.5638606548309326, "learning_rate": 6.188496188496189e-06, "loss": 0.109, "step": 2132 }, { "epoch": 2.2172557172557172, "grad_norm": 6.295956134796143, "learning_rate": 6.186186186186187e-06, "loss": 0.1275, "step": 2133 }, { "epoch": 2.2182952182952183, "grad_norm": 3.0353546142578125, "learning_rate": 6.183876183876184e-06, "loss": 0.0632, "step": 2134 }, { "epoch": 2.2193347193347193, "grad_norm": 0.2624787986278534, "learning_rate": 6.181566181566182e-06, "loss": 0.0086, "step": 2135 }, { "epoch": 2.2203742203742203, "grad_norm": 9.579380989074707, "learning_rate": 6.17925617925618e-06, "loss": 0.3809, "step": 2136 }, { "epoch": 2.2214137214137213, "grad_norm": 0.9549159407615662, "learning_rate": 6.176946176946177e-06, "loss": 0.0227, "step": 2137 }, { "epoch": 2.2224532224532223, "grad_norm": 10.02344036102295, "learning_rate": 6.1746361746361745e-06, "loss": 0.8545, "step": 2138 }, { "epoch": 2.2234927234927233, "grad_norm": 7.736490726470947, "learning_rate": 6.172326172326173e-06, "loss": 0.5988, "step": 2139 }, { "epoch": 2.2245322245322248, "grad_norm": 0.9537568092346191, "learning_rate": 6.170016170016171e-06, "loss": 0.0187, "step": 2140 }, { "epoch": 2.225571725571726, "grad_norm": 10.918013572692871, "learning_rate": 6.167706167706167e-06, "loss": 1.3098, "step": 2141 }, { "epoch": 2.226611226611227, "grad_norm": 3.5731489658355713, "learning_rate": 6.165396165396166e-06, "loss": 0.0904, "step": 2142 }, { "epoch": 2.227650727650728, "grad_norm": 
1.4764057397842407, "learning_rate": 6.163086163086164e-06, "loss": 0.0278, "step": 2143 }, { "epoch": 2.228690228690229, "grad_norm": 1.8778538703918457, "learning_rate": 6.160776160776162e-06, "loss": 0.0557, "step": 2144 }, { "epoch": 2.22972972972973, "grad_norm": 4.918324947357178, "learning_rate": 6.1584661584661585e-06, "loss": 0.1372, "step": 2145 }, { "epoch": 2.230769230769231, "grad_norm": 2.1157333850860596, "learning_rate": 6.156156156156157e-06, "loss": 0.0588, "step": 2146 }, { "epoch": 2.231808731808732, "grad_norm": 0.9033392071723938, "learning_rate": 6.153846153846155e-06, "loss": 0.0458, "step": 2147 }, { "epoch": 2.232848232848233, "grad_norm": 3.425045967102051, "learning_rate": 6.151536151536151e-06, "loss": 0.0722, "step": 2148 }, { "epoch": 2.233887733887734, "grad_norm": 0.7332718968391418, "learning_rate": 6.1492261492261496e-06, "loss": 0.0222, "step": 2149 }, { "epoch": 2.234927234927235, "grad_norm": 3.4322588443756104, "learning_rate": 6.146916146916148e-06, "loss": 0.144, "step": 2150 }, { "epoch": 2.235966735966736, "grad_norm": 0.6750631928443909, "learning_rate": 6.144606144606146e-06, "loss": 0.0257, "step": 2151 }, { "epoch": 2.237006237006237, "grad_norm": 10.790980339050293, "learning_rate": 6.1422961422961424e-06, "loss": 0.7807, "step": 2152 }, { "epoch": 2.238045738045738, "grad_norm": 1.1641652584075928, "learning_rate": 6.139986139986141e-06, "loss": 0.035, "step": 2153 }, { "epoch": 2.239085239085239, "grad_norm": 9.322601318359375, "learning_rate": 6.137676137676139e-06, "loss": 0.5508, "step": 2154 }, { "epoch": 2.24012474012474, "grad_norm": 1.2201451063156128, "learning_rate": 6.135366135366135e-06, "loss": 0.0281, "step": 2155 }, { "epoch": 2.241164241164241, "grad_norm": 0.011377022601664066, "learning_rate": 6.1330561330561335e-06, "loss": 0.0003, "step": 2156 }, { "epoch": 2.2422037422037424, "grad_norm": 0.3870171010494232, "learning_rate": 6.130746130746132e-06, "loss": 0.0147, "step": 2157 }, { "epoch": 
2.2432432432432434, "grad_norm": 1.231650710105896, "learning_rate": 6.128436128436129e-06, "loss": 0.0324, "step": 2158 }, { "epoch": 2.2442827442827444, "grad_norm": 2.190561532974243, "learning_rate": 6.126126126126126e-06, "loss": 0.0829, "step": 2159 }, { "epoch": 2.2453222453222454, "grad_norm": 14.648651123046875, "learning_rate": 6.1238161238161246e-06, "loss": 0.7248, "step": 2160 }, { "epoch": 2.2463617463617465, "grad_norm": 0.22202709317207336, "learning_rate": 6.121506121506122e-06, "loss": 0.0044, "step": 2161 }, { "epoch": 2.2474012474012475, "grad_norm": 15.588959693908691, "learning_rate": 6.11919611919612e-06, "loss": 0.6235, "step": 2162 }, { "epoch": 2.2484407484407485, "grad_norm": 11.007787704467773, "learning_rate": 6.1168861168861175e-06, "loss": 0.5128, "step": 2163 }, { "epoch": 2.2494802494802495, "grad_norm": 0.2996562123298645, "learning_rate": 6.114576114576115e-06, "loss": 0.0055, "step": 2164 }, { "epoch": 2.2505197505197505, "grad_norm": 3.888937473297119, "learning_rate": 6.112266112266113e-06, "loss": 0.0838, "step": 2165 }, { "epoch": 2.2515592515592515, "grad_norm": 2.6077065467834473, "learning_rate": 6.10995610995611e-06, "loss": 0.0882, "step": 2166 }, { "epoch": 2.2525987525987525, "grad_norm": 0.4822016954421997, "learning_rate": 6.107646107646108e-06, "loss": 0.0107, "step": 2167 }, { "epoch": 2.2536382536382535, "grad_norm": 0.07543380558490753, "learning_rate": 6.105336105336106e-06, "loss": 0.0027, "step": 2168 }, { "epoch": 2.2546777546777546, "grad_norm": 0.3507520258426666, "learning_rate": 6.103026103026104e-06, "loss": 0.0069, "step": 2169 }, { "epoch": 2.2557172557172556, "grad_norm": 7.969033241271973, "learning_rate": 6.1007161007161006e-06, "loss": 0.9687, "step": 2170 }, { "epoch": 2.2567567567567566, "grad_norm": 6.0069050788879395, "learning_rate": 6.098406098406099e-06, "loss": 0.1818, "step": 2171 }, { "epoch": 2.257796257796258, "grad_norm": 13.834513664245605, "learning_rate": 6.096096096096097e-06, 
"loss": 0.8807, "step": 2172 }, { "epoch": 2.258835758835759, "grad_norm": 13.67642593383789, "learning_rate": 6.0937860937860934e-06, "loss": 0.6369, "step": 2173 }, { "epoch": 2.25987525987526, "grad_norm": 5.61348295211792, "learning_rate": 6.091476091476092e-06, "loss": 0.238, "step": 2174 }, { "epoch": 2.260914760914761, "grad_norm": 0.3030984699726105, "learning_rate": 6.08916608916609e-06, "loss": 0.0069, "step": 2175 }, { "epoch": 2.261954261954262, "grad_norm": 10.117189407348633, "learning_rate": 6.086856086856088e-06, "loss": 0.8853, "step": 2176 }, { "epoch": 2.262993762993763, "grad_norm": 9.245833396911621, "learning_rate": 6.0845460845460845e-06, "loss": 0.6308, "step": 2177 }, { "epoch": 2.264033264033264, "grad_norm": 0.23321442306041718, "learning_rate": 6.082236082236083e-06, "loss": 0.008, "step": 2178 }, { "epoch": 2.265072765072765, "grad_norm": 10.990395545959473, "learning_rate": 6.079926079926081e-06, "loss": 1.062, "step": 2179 }, { "epoch": 2.266112266112266, "grad_norm": 0.5124699473381042, "learning_rate": 6.077616077616078e-06, "loss": 0.0099, "step": 2180 }, { "epoch": 2.267151767151767, "grad_norm": 0.020312217995524406, "learning_rate": 6.0753060753060756e-06, "loss": 0.0003, "step": 2181 }, { "epoch": 2.268191268191268, "grad_norm": 3.825268507003784, "learning_rate": 6.072996072996074e-06, "loss": 0.1286, "step": 2182 }, { "epoch": 2.269230769230769, "grad_norm": 0.7043087482452393, "learning_rate": 6.070686070686071e-06, "loss": 0.0146, "step": 2183 }, { "epoch": 2.27027027027027, "grad_norm": 11.096062660217285, "learning_rate": 6.0683760683760684e-06, "loss": 1.0362, "step": 2184 }, { "epoch": 2.271309771309771, "grad_norm": 7.280477523803711, "learning_rate": 6.066066066066067e-06, "loss": 0.2911, "step": 2185 }, { "epoch": 2.272349272349272, "grad_norm": 0.15028230845928192, "learning_rate": 6.063756063756065e-06, "loss": 0.005, "step": 2186 }, { "epoch": 2.273388773388773, "grad_norm": 9.149582862854004, "learning_rate": 
6.061446061446062e-06, "loss": 0.4077, "step": 2187 }, { "epoch": 2.274428274428274, "grad_norm": 2.9158825874328613, "learning_rate": 6.0591360591360595e-06, "loss": 0.0471, "step": 2188 }, { "epoch": 2.2754677754677752, "grad_norm": 0.09294558316469193, "learning_rate": 6.056826056826058e-06, "loss": 0.0015, "step": 2189 }, { "epoch": 2.2765072765072767, "grad_norm": 10.406388282775879, "learning_rate": 6.054516054516055e-06, "loss": 0.8003, "step": 2190 }, { "epoch": 2.2775467775467777, "grad_norm": 0.1650853008031845, "learning_rate": 6.052206052206052e-06, "loss": 0.0029, "step": 2191 }, { "epoch": 2.2785862785862787, "grad_norm": 7.553506374359131, "learning_rate": 6.049896049896051e-06, "loss": 0.3467, "step": 2192 }, { "epoch": 2.2796257796257797, "grad_norm": 0.027777332812547684, "learning_rate": 6.047586047586048e-06, "loss": 0.0005, "step": 2193 }, { "epoch": 2.2806652806652807, "grad_norm": 0.02463296428322792, "learning_rate": 6.045276045276046e-06, "loss": 0.0004, "step": 2194 }, { "epoch": 2.2817047817047817, "grad_norm": 15.498047828674316, "learning_rate": 6.0429660429660435e-06, "loss": 2.1882, "step": 2195 }, { "epoch": 2.2827442827442828, "grad_norm": 8.333736419677734, "learning_rate": 6.040656040656041e-06, "loss": 1.0823, "step": 2196 }, { "epoch": 2.2837837837837838, "grad_norm": 0.007465460803359747, "learning_rate": 6.038346038346039e-06, "loss": 0.0002, "step": 2197 }, { "epoch": 2.284823284823285, "grad_norm": 0.2807893753051758, "learning_rate": 6.036036036036037e-06, "loss": 0.0062, "step": 2198 }, { "epoch": 2.285862785862786, "grad_norm": 4.3804612159729, "learning_rate": 6.033726033726034e-06, "loss": 0.2496, "step": 2199 }, { "epoch": 2.286902286902287, "grad_norm": 1.8110418319702148, "learning_rate": 6.031416031416032e-06, "loss": 0.0364, "step": 2200 }, { "epoch": 2.287941787941788, "grad_norm": 1.0826025009155273, "learning_rate": 6.02910602910603e-06, "loss": 0.0204, "step": 2201 }, { "epoch": 2.288981288981289, "grad_norm": 
0.04327649995684624, "learning_rate": 6.0267960267960266e-06, "loss": 0.0013, "step": 2202 }, { "epoch": 2.29002079002079, "grad_norm": 5.8153204917907715, "learning_rate": 6.024486024486025e-06, "loss": 0.2086, "step": 2203 }, { "epoch": 2.2910602910602913, "grad_norm": 0.011570545844733715, "learning_rate": 6.022176022176023e-06, "loss": 0.0003, "step": 2204 }, { "epoch": 2.2920997920997923, "grad_norm": 0.08150535076856613, "learning_rate": 6.019866019866021e-06, "loss": 0.0018, "step": 2205 }, { "epoch": 2.2931392931392933, "grad_norm": 8.084456443786621, "learning_rate": 6.017556017556018e-06, "loss": 0.7637, "step": 2206 }, { "epoch": 2.2941787941787943, "grad_norm": 0.06949927657842636, "learning_rate": 6.015246015246016e-06, "loss": 0.0016, "step": 2207 }, { "epoch": 2.2952182952182953, "grad_norm": 4.646961212158203, "learning_rate": 6.012936012936014e-06, "loss": 0.0951, "step": 2208 }, { "epoch": 2.2962577962577964, "grad_norm": 5.929029941558838, "learning_rate": 6.0106260106260105e-06, "loss": 0.3063, "step": 2209 }, { "epoch": 2.2972972972972974, "grad_norm": 2.86704683303833, "learning_rate": 6.008316008316009e-06, "loss": 0.0558, "step": 2210 }, { "epoch": 2.2983367983367984, "grad_norm": 8.642908096313477, "learning_rate": 6.006006006006007e-06, "loss": 0.4241, "step": 2211 }, { "epoch": 2.2993762993762994, "grad_norm": 3.363530397415161, "learning_rate": 6.003696003696004e-06, "loss": 0.0692, "step": 2212 }, { "epoch": 2.3004158004158004, "grad_norm": 8.524832725524902, "learning_rate": 6.0013860013860016e-06, "loss": 0.2695, "step": 2213 }, { "epoch": 2.3014553014553014, "grad_norm": 0.012578320689499378, "learning_rate": 5.999075999076e-06, "loss": 0.0002, "step": 2214 }, { "epoch": 2.3024948024948024, "grad_norm": 13.207572937011719, "learning_rate": 5.996765996765997e-06, "loss": 1.2314, "step": 2215 }, { "epoch": 2.3035343035343034, "grad_norm": 0.14138144254684448, "learning_rate": 5.994455994455995e-06, "loss": 0.0035, "step": 2216 }, { 
"epoch": 2.3045738045738045, "grad_norm": 0.029010187834501266, "learning_rate": 5.992145992145993e-06, "loss": 0.0006, "step": 2217 }, { "epoch": 2.3056133056133055, "grad_norm": 5.9447503089904785, "learning_rate": 5.98983598983599e-06, "loss": 0.2724, "step": 2218 }, { "epoch": 2.3066528066528065, "grad_norm": 9.484906196594238, "learning_rate": 5.987525987525988e-06, "loss": 0.7582, "step": 2219 }, { "epoch": 2.3076923076923075, "grad_norm": 9.741586685180664, "learning_rate": 5.9852159852159855e-06, "loss": 0.6975, "step": 2220 }, { "epoch": 2.3087318087318085, "grad_norm": 0.3729715347290039, "learning_rate": 5.982905982905983e-06, "loss": 0.0058, "step": 2221 }, { "epoch": 2.30977130977131, "grad_norm": 0.010007351636886597, "learning_rate": 5.980595980595981e-06, "loss": 0.0002, "step": 2222 }, { "epoch": 2.310810810810811, "grad_norm": 5.923950672149658, "learning_rate": 5.978285978285979e-06, "loss": 0.3793, "step": 2223 }, { "epoch": 2.311850311850312, "grad_norm": 6.51254415512085, "learning_rate": 5.975975975975976e-06, "loss": 0.5332, "step": 2224 }, { "epoch": 2.312889812889813, "grad_norm": 5.990417003631592, "learning_rate": 5.973665973665974e-06, "loss": 0.2406, "step": 2225 }, { "epoch": 2.313929313929314, "grad_norm": 2.215427875518799, "learning_rate": 5.971355971355972e-06, "loss": 0.0655, "step": 2226 }, { "epoch": 2.314968814968815, "grad_norm": 6.656970977783203, "learning_rate": 5.969045969045969e-06, "loss": 0.2158, "step": 2227 }, { "epoch": 2.316008316008316, "grad_norm": 11.145777702331543, "learning_rate": 5.966735966735967e-06, "loss": 0.3597, "step": 2228 }, { "epoch": 2.317047817047817, "grad_norm": 7.602084159851074, "learning_rate": 5.964425964425965e-06, "loss": 0.2979, "step": 2229 }, { "epoch": 2.318087318087318, "grad_norm": 12.412657737731934, "learning_rate": 5.962115962115963e-06, "loss": 0.7629, "step": 2230 }, { "epoch": 2.319126819126819, "grad_norm": 12.661900520324707, "learning_rate": 5.95980595980596e-06, "loss": 
0.6984, "step": 2231 }, { "epoch": 2.32016632016632, "grad_norm": 5.584156513214111, "learning_rate": 5.957495957495958e-06, "loss": 0.1575, "step": 2232 }, { "epoch": 2.321205821205821, "grad_norm": 8.416341781616211, "learning_rate": 5.955185955185956e-06, "loss": 0.7292, "step": 2233 }, { "epoch": 2.322245322245322, "grad_norm": 16.33599090576172, "learning_rate": 5.952875952875954e-06, "loss": 1.921, "step": 2234 }, { "epoch": 2.323284823284823, "grad_norm": 8.614033699035645, "learning_rate": 5.950565950565951e-06, "loss": 0.269, "step": 2235 }, { "epoch": 2.3243243243243246, "grad_norm": 2.379479169845581, "learning_rate": 5.948255948255949e-06, "loss": 0.0411, "step": 2236 }, { "epoch": 2.3253638253638256, "grad_norm": 0.23624595999717712, "learning_rate": 5.945945945945947e-06, "loss": 0.0092, "step": 2237 }, { "epoch": 2.3264033264033266, "grad_norm": 2.8122398853302, "learning_rate": 5.943635943635944e-06, "loss": 0.1048, "step": 2238 }, { "epoch": 2.3274428274428276, "grad_norm": 1.7436000108718872, "learning_rate": 5.941325941325942e-06, "loss": 0.2694, "step": 2239 }, { "epoch": 2.3284823284823286, "grad_norm": 1.277492642402649, "learning_rate": 5.93901593901594e-06, "loss": 0.023, "step": 2240 }, { "epoch": 2.3295218295218296, "grad_norm": 3.3322794437408447, "learning_rate": 5.936705936705937e-06, "loss": 0.274, "step": 2241 }, { "epoch": 2.3305613305613306, "grad_norm": 5.62230110168457, "learning_rate": 5.934395934395935e-06, "loss": 0.2909, "step": 2242 }, { "epoch": 2.3316008316008316, "grad_norm": 17.122243881225586, "learning_rate": 5.932085932085933e-06, "loss": 0.2684, "step": 2243 }, { "epoch": 2.3326403326403327, "grad_norm": 0.5729025602340698, "learning_rate": 5.92977592977593e-06, "loss": 0.0081, "step": 2244 }, { "epoch": 2.3336798336798337, "grad_norm": 0.029930051416158676, "learning_rate": 5.927465927465928e-06, "loss": 0.0007, "step": 2245 }, { "epoch": 2.3347193347193347, "grad_norm": 0.48070666193962097, "learning_rate": 
5.925155925155926e-06, "loss": 0.0102, "step": 2246 }, { "epoch": 2.3357588357588357, "grad_norm": 4.338085651397705, "learning_rate": 5.922845922845923e-06, "loss": 0.0833, "step": 2247 }, { "epoch": 2.3367983367983367, "grad_norm": 2.9266693592071533, "learning_rate": 5.920535920535921e-06, "loss": 0.2003, "step": 2248 }, { "epoch": 2.3378378378378377, "grad_norm": 10.988312721252441, "learning_rate": 5.918225918225919e-06, "loss": 1.367, "step": 2249 }, { "epoch": 2.3388773388773387, "grad_norm": 2.4815878868103027, "learning_rate": 5.915915915915916e-06, "loss": 0.137, "step": 2250 }, { "epoch": 2.3399168399168397, "grad_norm": 7.028376579284668, "learning_rate": 5.913605913605914e-06, "loss": 0.3135, "step": 2251 }, { "epoch": 2.3409563409563408, "grad_norm": 9.232767105102539, "learning_rate": 5.911295911295912e-06, "loss": 0.752, "step": 2252 }, { "epoch": 2.3419958419958418, "grad_norm": 9.30560302734375, "learning_rate": 5.908985908985909e-06, "loss": 0.8332, "step": 2253 }, { "epoch": 2.343035343035343, "grad_norm": 1.2369141578674316, "learning_rate": 5.906675906675907e-06, "loss": 0.0361, "step": 2254 }, { "epoch": 2.3440748440748442, "grad_norm": 0.7799108028411865, "learning_rate": 5.904365904365905e-06, "loss": 0.0149, "step": 2255 }, { "epoch": 2.3451143451143452, "grad_norm": 1.8956689834594727, "learning_rate": 5.902055902055902e-06, "loss": 0.0897, "step": 2256 }, { "epoch": 2.3461538461538463, "grad_norm": 0.285441517829895, "learning_rate": 5.8997458997459e-06, "loss": 0.005, "step": 2257 }, { "epoch": 2.3471933471933473, "grad_norm": 0.18665236234664917, "learning_rate": 5.897435897435898e-06, "loss": 0.0059, "step": 2258 }, { "epoch": 2.3482328482328483, "grad_norm": 0.08902893960475922, "learning_rate": 5.895125895125896e-06, "loss": 0.0013, "step": 2259 }, { "epoch": 2.3492723492723493, "grad_norm": 12.768613815307617, "learning_rate": 5.892815892815893e-06, "loss": 1.074, "step": 2260 }, { "epoch": 2.3503118503118503, "grad_norm": 
4.8039774894714355, "learning_rate": 5.890505890505891e-06, "loss": 0.1545, "step": 2261 }, { "epoch": 2.3513513513513513, "grad_norm": 0.005349677987396717, "learning_rate": 5.888195888195889e-06, "loss": 0.0001, "step": 2262 }, { "epoch": 2.3523908523908523, "grad_norm": 8.962315559387207, "learning_rate": 5.885885885885886e-06, "loss": 0.4207, "step": 2263 }, { "epoch": 2.3534303534303533, "grad_norm": 12.141010284423828, "learning_rate": 5.883575883575884e-06, "loss": 0.9134, "step": 2264 }, { "epoch": 2.3544698544698544, "grad_norm": 1.6301746368408203, "learning_rate": 5.881265881265882e-06, "loss": 0.0752, "step": 2265 }, { "epoch": 2.3555093555093554, "grad_norm": 9.701839447021484, "learning_rate": 5.8789558789558794e-06, "loss": 0.329, "step": 2266 }, { "epoch": 2.3565488565488564, "grad_norm": 0.2083820402622223, "learning_rate": 5.876645876645877e-06, "loss": 0.0077, "step": 2267 }, { "epoch": 2.357588357588358, "grad_norm": 0.5354583263397217, "learning_rate": 5.874335874335875e-06, "loss": 0.011, "step": 2268 }, { "epoch": 2.358627858627859, "grad_norm": 0.549380898475647, "learning_rate": 5.872025872025872e-06, "loss": 0.0078, "step": 2269 }, { "epoch": 2.35966735966736, "grad_norm": 0.06869293004274368, "learning_rate": 5.8697158697158705e-06, "loss": 0.0012, "step": 2270 }, { "epoch": 2.360706860706861, "grad_norm": 4.068930149078369, "learning_rate": 5.867405867405868e-06, "loss": 0.1136, "step": 2271 }, { "epoch": 2.361746361746362, "grad_norm": 1.1226253509521484, "learning_rate": 5.865095865095865e-06, "loss": 0.0287, "step": 2272 }, { "epoch": 2.362785862785863, "grad_norm": 9.672595977783203, "learning_rate": 5.862785862785863e-06, "loss": 0.152, "step": 2273 }, { "epoch": 2.363825363825364, "grad_norm": 6.1760077476501465, "learning_rate": 5.860475860475861e-06, "loss": 0.3447, "step": 2274 }, { "epoch": 2.364864864864865, "grad_norm": 5.4050421714782715, "learning_rate": 5.858165858165858e-06, "loss": 0.0732, "step": 2275 }, { "epoch": 
2.365904365904366, "grad_norm": 1.3560022115707397, "learning_rate": 5.855855855855856e-06, "loss": 0.0473, "step": 2276 }, { "epoch": 2.366943866943867, "grad_norm": 5.8554887771606445, "learning_rate": 5.8535458535458544e-06, "loss": 0.1202, "step": 2277 }, { "epoch": 2.367983367983368, "grad_norm": 11.231568336486816, "learning_rate": 5.851235851235851e-06, "loss": 1.1266, "step": 2278 }, { "epoch": 2.369022869022869, "grad_norm": 0.5555443167686462, "learning_rate": 5.848925848925849e-06, "loss": 0.0119, "step": 2279 }, { "epoch": 2.37006237006237, "grad_norm": 2.8198113441467285, "learning_rate": 5.846615846615847e-06, "loss": 0.175, "step": 2280 }, { "epoch": 2.371101871101871, "grad_norm": 1.1452579498291016, "learning_rate": 5.844305844305845e-06, "loss": 0.0255, "step": 2281 }, { "epoch": 2.372141372141372, "grad_norm": 3.1059226989746094, "learning_rate": 5.841995841995842e-06, "loss": 0.0457, "step": 2282 }, { "epoch": 2.373180873180873, "grad_norm": 1.5410337448120117, "learning_rate": 5.83968583968584e-06, "loss": 0.0515, "step": 2283 }, { "epoch": 2.374220374220374, "grad_norm": 4.821375846862793, "learning_rate": 5.837375837375838e-06, "loss": 0.1617, "step": 2284 }, { "epoch": 2.375259875259875, "grad_norm": 6.064678192138672, "learning_rate": 5.835065835065835e-06, "loss": 0.512, "step": 2285 }, { "epoch": 2.3762993762993765, "grad_norm": 1.9220199584960938, "learning_rate": 5.832755832755833e-06, "loss": 0.0384, "step": 2286 }, { "epoch": 2.3773388773388775, "grad_norm": 4.801299095153809, "learning_rate": 5.830445830445831e-06, "loss": 0.2309, "step": 2287 }, { "epoch": 2.3783783783783785, "grad_norm": 3.1393089294433594, "learning_rate": 5.8281358281358295e-06, "loss": 0.0715, "step": 2288 }, { "epoch": 2.3794178794178795, "grad_norm": 3.297896146774292, "learning_rate": 5.825825825825826e-06, "loss": 0.2937, "step": 2289 }, { "epoch": 2.3804573804573805, "grad_norm": 0.050158146768808365, "learning_rate": 5.823515823515824e-06, "loss": 0.001, 
"step": 2290 }, { "epoch": 2.3814968814968815, "grad_norm": 11.856502532958984, "learning_rate": 5.821205821205822e-06, "loss": 0.5053, "step": 2291 }, { "epoch": 2.3825363825363826, "grad_norm": 7.975099086761475, "learning_rate": 5.818895818895819e-06, "loss": 0.2572, "step": 2292 }, { "epoch": 2.3835758835758836, "grad_norm": 11.382410049438477, "learning_rate": 5.816585816585817e-06, "loss": 0.6613, "step": 2293 }, { "epoch": 2.3846153846153846, "grad_norm": 2.8917577266693115, "learning_rate": 5.814275814275815e-06, "loss": 0.1031, "step": 2294 }, { "epoch": 2.3856548856548856, "grad_norm": 4.3138298988342285, "learning_rate": 5.8119658119658126e-06, "loss": 0.1378, "step": 2295 }, { "epoch": 2.3866943866943866, "grad_norm": 6.972907066345215, "learning_rate": 5.80965580965581e-06, "loss": 0.2819, "step": 2296 }, { "epoch": 2.3877338877338876, "grad_norm": 3.7977538108825684, "learning_rate": 5.807345807345808e-06, "loss": 0.2146, "step": 2297 }, { "epoch": 2.3887733887733886, "grad_norm": 0.20148056745529175, "learning_rate": 5.8050358050358054e-06, "loss": 0.003, "step": 2298 }, { "epoch": 2.3898128898128896, "grad_norm": 9.711847305297852, "learning_rate": 5.802725802725803e-06, "loss": 1.2043, "step": 2299 }, { "epoch": 2.390852390852391, "grad_norm": 0.4300665855407715, "learning_rate": 5.800415800415801e-06, "loss": 0.0093, "step": 2300 }, { "epoch": 2.391891891891892, "grad_norm": 4.982692718505859, "learning_rate": 5.798105798105798e-06, "loss": 0.1541, "step": 2301 }, { "epoch": 2.392931392931393, "grad_norm": 13.709635734558105, "learning_rate": 5.7957957957957965e-06, "loss": 0.8311, "step": 2302 }, { "epoch": 2.393970893970894, "grad_norm": 8.99238109588623, "learning_rate": 5.793485793485794e-06, "loss": 0.3719, "step": 2303 }, { "epoch": 2.395010395010395, "grad_norm": 0.019779542461037636, "learning_rate": 5.791175791175791e-06, "loss": 0.0005, "step": 2304 }, { "epoch": 2.396049896049896, "grad_norm": 1.7408281564712524, "learning_rate": 
5.788865788865789e-06, "loss": 0.0302, "step": 2305 }, { "epoch": 2.397089397089397, "grad_norm": 0.5736076831817627, "learning_rate": 5.7865557865557876e-06, "loss": 0.0094, "step": 2306 }, { "epoch": 2.398128898128898, "grad_norm": 0.351347953081131, "learning_rate": 5.784245784245784e-06, "loss": 0.0099, "step": 2307 }, { "epoch": 2.399168399168399, "grad_norm": 1.1329315900802612, "learning_rate": 5.781935781935782e-06, "loss": 0.0239, "step": 2308 }, { "epoch": 2.4002079002079, "grad_norm": 0.43927931785583496, "learning_rate": 5.7796257796257805e-06, "loss": 0.0071, "step": 2309 }, { "epoch": 2.401247401247401, "grad_norm": 9.422633171081543, "learning_rate": 5.777315777315777e-06, "loss": 0.2959, "step": 2310 }, { "epoch": 2.4022869022869022, "grad_norm": 22.3996639251709, "learning_rate": 5.775005775005775e-06, "loss": 1.3917, "step": 2311 }, { "epoch": 2.4033264033264032, "grad_norm": 7.138119697570801, "learning_rate": 5.772695772695773e-06, "loss": 0.2916, "step": 2312 }, { "epoch": 2.4043659043659042, "grad_norm": 0.04603596776723862, "learning_rate": 5.7703857703857715e-06, "loss": 0.0008, "step": 2313 }, { "epoch": 2.4054054054054053, "grad_norm": 0.22304362058639526, "learning_rate": 5.768075768075768e-06, "loss": 0.0032, "step": 2314 }, { "epoch": 2.4064449064449063, "grad_norm": 0.44586488604545593, "learning_rate": 5.765765765765766e-06, "loss": 0.0142, "step": 2315 }, { "epoch": 2.4074844074844073, "grad_norm": 1.6813427209854126, "learning_rate": 5.763455763455764e-06, "loss": 0.0507, "step": 2316 }, { "epoch": 2.4085239085239083, "grad_norm": 0.3971876800060272, "learning_rate": 5.761145761145761e-06, "loss": 0.0072, "step": 2317 }, { "epoch": 2.4095634095634098, "grad_norm": 0.08452802896499634, "learning_rate": 5.758835758835759e-06, "loss": 0.002, "step": 2318 }, { "epoch": 2.4106029106029108, "grad_norm": 11.850491523742676, "learning_rate": 5.756525756525757e-06, "loss": 0.4145, "step": 2319 }, { "epoch": 2.4116424116424118, "grad_norm": 
2.9328062534332275, "learning_rate": 5.7542157542157555e-06, "loss": 0.0477, "step": 2320 }, { "epoch": 2.412681912681913, "grad_norm": 3.1229493618011475, "learning_rate": 5.751905751905752e-06, "loss": 0.0631, "step": 2321 }, { "epoch": 2.413721413721414, "grad_norm": 0.15812994539737701, "learning_rate": 5.74959574959575e-06, "loss": 0.005, "step": 2322 }, { "epoch": 2.414760914760915, "grad_norm": 0.660758376121521, "learning_rate": 5.747285747285748e-06, "loss": 0.0081, "step": 2323 }, { "epoch": 2.415800415800416, "grad_norm": 16.216384887695312, "learning_rate": 5.744975744975746e-06, "loss": 0.7467, "step": 2324 }, { "epoch": 2.416839916839917, "grad_norm": 8.676569938659668, "learning_rate": 5.742665742665743e-06, "loss": 0.3671, "step": 2325 }, { "epoch": 2.417879417879418, "grad_norm": 2.7714102268218994, "learning_rate": 5.740355740355741e-06, "loss": 0.1326, "step": 2326 }, { "epoch": 2.418918918918919, "grad_norm": 4.570979118347168, "learning_rate": 5.7380457380457386e-06, "loss": 0.1253, "step": 2327 }, { "epoch": 2.41995841995842, "grad_norm": 9.01137924194336, "learning_rate": 5.735735735735736e-06, "loss": 0.7805, "step": 2328 }, { "epoch": 2.420997920997921, "grad_norm": 10.329658508300781, "learning_rate": 5.733425733425734e-06, "loss": 0.3824, "step": 2329 }, { "epoch": 2.422037422037422, "grad_norm": 0.7224492430686951, "learning_rate": 5.7311157311157314e-06, "loss": 0.0235, "step": 2330 }, { "epoch": 2.423076923076923, "grad_norm": 0.059857260435819626, "learning_rate": 5.72880572880573e-06, "loss": 0.0011, "step": 2331 }, { "epoch": 2.4241164241164244, "grad_norm": 0.3503856956958771, "learning_rate": 5.726495726495727e-06, "loss": 0.0066, "step": 2332 }, { "epoch": 2.4251559251559254, "grad_norm": 0.03683709353208542, "learning_rate": 5.724185724185724e-06, "loss": 0.0008, "step": 2333 }, { "epoch": 2.4261954261954264, "grad_norm": 0.03195773437619209, "learning_rate": 5.7218757218757225e-06, "loss": 0.0008, "step": 2334 }, { "epoch": 
2.4272349272349274, "grad_norm": 2.0734236240386963, "learning_rate": 5.71956571956572e-06, "loss": 0.0504, "step": 2335 }, { "epoch": 2.4282744282744284, "grad_norm": 1.467271089553833, "learning_rate": 5.717255717255717e-06, "loss": 0.0275, "step": 2336 }, { "epoch": 2.4293139293139294, "grad_norm": 10.107986450195312, "learning_rate": 5.714945714945715e-06, "loss": 0.3636, "step": 2337 }, { "epoch": 2.4303534303534304, "grad_norm": 4.867897033691406, "learning_rate": 5.712635712635714e-06, "loss": 0.1089, "step": 2338 }, { "epoch": 2.4313929313929314, "grad_norm": 1.6725810766220093, "learning_rate": 5.71032571032571e-06, "loss": 0.0591, "step": 2339 }, { "epoch": 2.4324324324324325, "grad_norm": 5.271655082702637, "learning_rate": 5.708015708015708e-06, "loss": 0.1665, "step": 2340 }, { "epoch": 2.4334719334719335, "grad_norm": 0.596553385257721, "learning_rate": 5.7057057057057065e-06, "loss": 0.0163, "step": 2341 }, { "epoch": 2.4345114345114345, "grad_norm": 8.481026649475098, "learning_rate": 5.703395703395705e-06, "loss": 0.351, "step": 2342 }, { "epoch": 2.4355509355509355, "grad_norm": 0.14047005772590637, "learning_rate": 5.701085701085701e-06, "loss": 0.003, "step": 2343 }, { "epoch": 2.4365904365904365, "grad_norm": 1.8173028230667114, "learning_rate": 5.698775698775699e-06, "loss": 0.036, "step": 2344 }, { "epoch": 2.4376299376299375, "grad_norm": 0.9283024668693542, "learning_rate": 5.6964656964656975e-06, "loss": 0.0173, "step": 2345 }, { "epoch": 2.4386694386694385, "grad_norm": 10.197988510131836, "learning_rate": 5.694155694155694e-06, "loss": 0.4031, "step": 2346 }, { "epoch": 2.4397089397089395, "grad_norm": 9.023080825805664, "learning_rate": 5.691845691845692e-06, "loss": 0.8144, "step": 2347 }, { "epoch": 2.4407484407484406, "grad_norm": 0.4014405906200409, "learning_rate": 5.68953568953569e-06, "loss": 0.008, "step": 2348 }, { "epoch": 2.4417879417879416, "grad_norm": 1.333750605583191, "learning_rate": 5.687225687225688e-06, "loss": 
0.046, "step": 2349 }, { "epoch": 2.442827442827443, "grad_norm": 4.866448879241943, "learning_rate": 5.684915684915685e-06, "loss": 0.0696, "step": 2350 }, { "epoch": 2.443866943866944, "grad_norm": 4.256906509399414, "learning_rate": 5.682605682605683e-06, "loss": 0.1184, "step": 2351 }, { "epoch": 2.444906444906445, "grad_norm": 6.979046821594238, "learning_rate": 5.680295680295681e-06, "loss": 0.1771, "step": 2352 }, { "epoch": 2.445945945945946, "grad_norm": 6.416190147399902, "learning_rate": 5.677985677985678e-06, "loss": 0.4255, "step": 2353 }, { "epoch": 2.446985446985447, "grad_norm": 0.5552463531494141, "learning_rate": 5.675675675675676e-06, "loss": 0.0138, "step": 2354 }, { "epoch": 2.448024948024948, "grad_norm": 3.588365077972412, "learning_rate": 5.6733656733656735e-06, "loss": 0.1254, "step": 2355 }, { "epoch": 2.449064449064449, "grad_norm": 0.11385034769773483, "learning_rate": 5.671055671055672e-06, "loss": 0.0024, "step": 2356 }, { "epoch": 2.45010395010395, "grad_norm": 3.5646817684173584, "learning_rate": 5.668745668745669e-06, "loss": 0.0715, "step": 2357 }, { "epoch": 2.451143451143451, "grad_norm": 2.0259387493133545, "learning_rate": 5.666435666435666e-06, "loss": 0.042, "step": 2358 }, { "epoch": 2.452182952182952, "grad_norm": 3.4722325801849365, "learning_rate": 5.6641256641256646e-06, "loss": 0.086, "step": 2359 }, { "epoch": 2.453222453222453, "grad_norm": 1.5582636594772339, "learning_rate": 5.661815661815663e-06, "loss": 0.0273, "step": 2360 }, { "epoch": 2.454261954261954, "grad_norm": 3.767815589904785, "learning_rate": 5.659505659505659e-06, "loss": 0.0896, "step": 2361 }, { "epoch": 2.455301455301455, "grad_norm": 4.259250640869141, "learning_rate": 5.6571956571956575e-06, "loss": 0.1208, "step": 2362 }, { "epoch": 2.456340956340956, "grad_norm": 0.0018083971226587892, "learning_rate": 5.654885654885656e-06, "loss": 0.0, "step": 2363 }, { "epoch": 2.4573804573804576, "grad_norm": 2.261906385421753, "learning_rate": 
5.652575652575652e-06, "loss": 0.0576, "step": 2364 }, { "epoch": 2.4584199584199586, "grad_norm": 0.015318424440920353, "learning_rate": 5.65026565026565e-06, "loss": 0.0005, "step": 2365 }, { "epoch": 2.4594594594594597, "grad_norm": 1.0688711404800415, "learning_rate": 5.6479556479556485e-06, "loss": 0.0209, "step": 2366 }, { "epoch": 2.4604989604989607, "grad_norm": 9.626909255981445, "learning_rate": 5.645645645645647e-06, "loss": 1.2241, "step": 2367 }, { "epoch": 2.4615384615384617, "grad_norm": 0.9243369102478027, "learning_rate": 5.643335643335643e-06, "loss": 0.0235, "step": 2368 }, { "epoch": 2.4625779625779627, "grad_norm": 4.78314208984375, "learning_rate": 5.641025641025641e-06, "loss": 0.2238, "step": 2369 }, { "epoch": 2.4636174636174637, "grad_norm": 0.6715689301490784, "learning_rate": 5.63871563871564e-06, "loss": 0.0188, "step": 2370 }, { "epoch": 2.4646569646569647, "grad_norm": 0.03074975498020649, "learning_rate": 5.636405636405636e-06, "loss": 0.0007, "step": 2371 }, { "epoch": 2.4656964656964657, "grad_norm": 11.54360294342041, "learning_rate": 5.634095634095634e-06, "loss": 0.8552, "step": 2372 }, { "epoch": 2.4667359667359667, "grad_norm": 0.5170840620994568, "learning_rate": 5.6317856317856325e-06, "loss": 0.0108, "step": 2373 }, { "epoch": 2.4677754677754677, "grad_norm": 5.690970420837402, "learning_rate": 5.629475629475631e-06, "loss": 0.18, "step": 2374 }, { "epoch": 2.4688149688149688, "grad_norm": 10.498955726623535, "learning_rate": 5.627165627165627e-06, "loss": 1.3798, "step": 2375 }, { "epoch": 2.4698544698544698, "grad_norm": 7.402065753936768, "learning_rate": 5.624855624855625e-06, "loss": 0.2798, "step": 2376 }, { "epoch": 2.470893970893971, "grad_norm": 2.759883403778076, "learning_rate": 5.6225456225456235e-06, "loss": 0.1178, "step": 2377 }, { "epoch": 2.471933471933472, "grad_norm": 0.09848170727491379, "learning_rate": 5.620235620235621e-06, "loss": 0.0019, "step": 2378 }, { "epoch": 2.472972972972973, "grad_norm": 
8.388592720031738, "learning_rate": 5.617925617925618e-06, "loss": 0.2469, "step": 2379 }, { "epoch": 2.474012474012474, "grad_norm": 5.854394912719727, "learning_rate": 5.615615615615616e-06, "loss": 0.3582, "step": 2380 }, { "epoch": 2.475051975051975, "grad_norm": 7.211421966552734, "learning_rate": 5.613305613305614e-06, "loss": 0.6539, "step": 2381 }, { "epoch": 2.4760914760914763, "grad_norm": 0.45193174481391907, "learning_rate": 5.610995610995611e-06, "loss": 0.0105, "step": 2382 }, { "epoch": 2.4771309771309773, "grad_norm": 0.1756490170955658, "learning_rate": 5.608685608685609e-06, "loss": 0.0051, "step": 2383 }, { "epoch": 2.4781704781704783, "grad_norm": 8.903878211975098, "learning_rate": 5.606375606375607e-06, "loss": 0.363, "step": 2384 }, { "epoch": 2.4792099792099793, "grad_norm": 4.330974102020264, "learning_rate": 5.604065604065605e-06, "loss": 0.1238, "step": 2385 }, { "epoch": 2.4802494802494803, "grad_norm": 8.367501258850098, "learning_rate": 5.601755601755602e-06, "loss": 0.3349, "step": 2386 }, { "epoch": 2.4812889812889813, "grad_norm": 0.2912248373031616, "learning_rate": 5.5994455994455995e-06, "loss": 0.0045, "step": 2387 }, { "epoch": 2.4823284823284824, "grad_norm": 3.1457712650299072, "learning_rate": 5.597135597135598e-06, "loss": 0.0796, "step": 2388 }, { "epoch": 2.4833679833679834, "grad_norm": 5.04201078414917, "learning_rate": 5.594825594825595e-06, "loss": 0.2289, "step": 2389 }, { "epoch": 2.4844074844074844, "grad_norm": 0.1276787668466568, "learning_rate": 5.592515592515592e-06, "loss": 0.0032, "step": 2390 }, { "epoch": 2.4854469854469854, "grad_norm": 12.047901153564453, "learning_rate": 5.590205590205591e-06, "loss": 0.4329, "step": 2391 }, { "epoch": 2.4864864864864864, "grad_norm": 9.832515716552734, "learning_rate": 5.587895587895589e-06, "loss": 0.9411, "step": 2392 }, { "epoch": 2.4875259875259874, "grad_norm": 10.79465103149414, "learning_rate": 5.585585585585585e-06, "loss": 0.2082, "step": 2393 }, { "epoch": 
2.4885654885654884, "grad_norm": 2.7653582096099854, "learning_rate": 5.5832755832755835e-06, "loss": 0.0389, "step": 2394 }, { "epoch": 2.4896049896049894, "grad_norm": 7.273566722869873, "learning_rate": 5.580965580965582e-06, "loss": 0.1308, "step": 2395 }, { "epoch": 2.490644490644491, "grad_norm": 0.6354008316993713, "learning_rate": 5.57865557865558e-06, "loss": 0.0125, "step": 2396 }, { "epoch": 2.491683991683992, "grad_norm": 0.014644395560026169, "learning_rate": 5.576345576345576e-06, "loss": 0.0004, "step": 2397 }, { "epoch": 2.492723492723493, "grad_norm": 8.341543197631836, "learning_rate": 5.5740355740355745e-06, "loss": 1.623, "step": 2398 }, { "epoch": 2.493762993762994, "grad_norm": 5.564713001251221, "learning_rate": 5.571725571725573e-06, "loss": 0.0785, "step": 2399 }, { "epoch": 2.494802494802495, "grad_norm": 0.19482921063899994, "learning_rate": 5.569415569415569e-06, "loss": 0.004, "step": 2400 }, { "epoch": 2.495841995841996, "grad_norm": 2.786550998687744, "learning_rate": 5.567105567105567e-06, "loss": 0.0618, "step": 2401 }, { "epoch": 2.496881496881497, "grad_norm": 10.698128700256348, "learning_rate": 5.564795564795566e-06, "loss": 0.4151, "step": 2402 }, { "epoch": 2.497920997920998, "grad_norm": 8.118881225585938, "learning_rate": 5.562485562485564e-06, "loss": 0.297, "step": 2403 }, { "epoch": 2.498960498960499, "grad_norm": 0.05869365483522415, "learning_rate": 5.56017556017556e-06, "loss": 0.0013, "step": 2404 }, { "epoch": 2.5, "grad_norm": 6.432159423828125, "learning_rate": 5.5578655578655585e-06, "loss": 0.1399, "step": 2405 }, { "epoch": 2.501039501039501, "grad_norm": 7.843135356903076, "learning_rate": 5.555555555555557e-06, "loss": 0.4493, "step": 2406 }, { "epoch": 2.502079002079002, "grad_norm": 13.558523178100586, "learning_rate": 5.553245553245554e-06, "loss": 0.1005, "step": 2407 }, { "epoch": 2.503118503118503, "grad_norm": 0.7968465685844421, "learning_rate": 5.550935550935551e-06, "loss": 0.0185, "step": 2408 }, { 
"epoch": 2.504158004158004, "grad_norm": 2.6324613094329834, "learning_rate": 5.5486255486255495e-06, "loss": 0.0922, "step": 2409 }, { "epoch": 2.505197505197505, "grad_norm": 0.7506412267684937, "learning_rate": 5.546315546315547e-06, "loss": 0.0148, "step": 2410 }, { "epoch": 2.506237006237006, "grad_norm": 0.028840284794569016, "learning_rate": 5.544005544005544e-06, "loss": 0.001, "step": 2411 }, { "epoch": 2.507276507276507, "grad_norm": 6.280975341796875, "learning_rate": 5.5416955416955424e-06, "loss": 0.2602, "step": 2412 }, { "epoch": 2.508316008316008, "grad_norm": 2.0573740005493164, "learning_rate": 5.53938553938554e-06, "loss": 0.0502, "step": 2413 }, { "epoch": 2.509355509355509, "grad_norm": 9.995739936828613, "learning_rate": 5.537075537075538e-06, "loss": 0.9421, "step": 2414 }, { "epoch": 2.51039501039501, "grad_norm": 0.29023870825767517, "learning_rate": 5.534765534765535e-06, "loss": 0.0064, "step": 2415 }, { "epoch": 2.5114345114345116, "grad_norm": 0.6130965352058411, "learning_rate": 5.532455532455533e-06, "loss": 0.016, "step": 2416 }, { "epoch": 2.5124740124740126, "grad_norm": 12.549509048461914, "learning_rate": 5.530145530145531e-06, "loss": 0.7434, "step": 2417 }, { "epoch": 2.5135135135135136, "grad_norm": 11.806109428405762, "learning_rate": 5.527835527835528e-06, "loss": 0.8109, "step": 2418 }, { "epoch": 2.5145530145530146, "grad_norm": 0.16034944355487823, "learning_rate": 5.5255255255255255e-06, "loss": 0.003, "step": 2419 }, { "epoch": 2.5155925155925156, "grad_norm": 1.2844282388687134, "learning_rate": 5.523215523215524e-06, "loss": 0.0154, "step": 2420 }, { "epoch": 2.5166320166320166, "grad_norm": 6.30752420425415, "learning_rate": 5.520905520905522e-06, "loss": 0.1711, "step": 2421 }, { "epoch": 2.5176715176715176, "grad_norm": 9.42320728302002, "learning_rate": 5.518595518595518e-06, "loss": 0.5665, "step": 2422 }, { "epoch": 2.5187110187110187, "grad_norm": 8.125027656555176, "learning_rate": 5.516285516285517e-06, 
"loss": 0.5858, "step": 2423 }, { "epoch": 2.5197505197505197, "grad_norm": 11.059926986694336, "learning_rate": 5.513975513975515e-06, "loss": 0.7028, "step": 2424 }, { "epoch": 2.5207900207900207, "grad_norm": 1.8393372297286987, "learning_rate": 5.511665511665513e-06, "loss": 0.0419, "step": 2425 }, { "epoch": 2.5218295218295217, "grad_norm": 1.991614818572998, "learning_rate": 5.5093555093555095e-06, "loss": 0.0236, "step": 2426 }, { "epoch": 2.5228690228690227, "grad_norm": 0.1596309095621109, "learning_rate": 5.507045507045508e-06, "loss": 0.0015, "step": 2427 }, { "epoch": 2.523908523908524, "grad_norm": 3.80548095703125, "learning_rate": 5.504735504735506e-06, "loss": 0.1103, "step": 2428 }, { "epoch": 2.524948024948025, "grad_norm": 0.8834503889083862, "learning_rate": 5.502425502425502e-06, "loss": 0.2529, "step": 2429 }, { "epoch": 2.525987525987526, "grad_norm": 1.7620978355407715, "learning_rate": 5.5001155001155005e-06, "loss": 0.0571, "step": 2430 }, { "epoch": 2.527027027027027, "grad_norm": 8.883015632629395, "learning_rate": 5.497805497805499e-06, "loss": 0.2079, "step": 2431 }, { "epoch": 2.528066528066528, "grad_norm": 0.02310093306005001, "learning_rate": 5.495495495495496e-06, "loss": 0.0006, "step": 2432 }, { "epoch": 2.529106029106029, "grad_norm": 0.1949092447757721, "learning_rate": 5.493185493185493e-06, "loss": 0.0058, "step": 2433 }, { "epoch": 2.5301455301455302, "grad_norm": 0.004292343743145466, "learning_rate": 5.490875490875492e-06, "loss": 0.0, "step": 2434 }, { "epoch": 2.5311850311850312, "grad_norm": 6.701157093048096, "learning_rate": 5.488565488565489e-06, "loss": 0.4612, "step": 2435 }, { "epoch": 2.5322245322245323, "grad_norm": 2.1965763568878174, "learning_rate": 5.486255486255486e-06, "loss": 0.0605, "step": 2436 }, { "epoch": 2.5332640332640333, "grad_norm": 5.151659965515137, "learning_rate": 5.4839454839454845e-06, "loss": 0.135, "step": 2437 }, { "epoch": 2.5343035343035343, "grad_norm": 0.12913161516189575, 
"learning_rate": 5.481635481635482e-06, "loss": 0.0034, "step": 2438 }, { "epoch": 2.5353430353430353, "grad_norm": 3.0704658031463623, "learning_rate": 5.47932547932548e-06, "loss": 0.0782, "step": 2439 }, { "epoch": 2.5363825363825363, "grad_norm": 1.6351587772369385, "learning_rate": 5.477015477015477e-06, "loss": 0.1708, "step": 2440 }, { "epoch": 2.5374220374220373, "grad_norm": 0.03910132497549057, "learning_rate": 5.474705474705475e-06, "loss": 0.0012, "step": 2441 }, { "epoch": 2.5384615384615383, "grad_norm": 3.443721294403076, "learning_rate": 5.472395472395473e-06, "loss": 0.0967, "step": 2442 }, { "epoch": 2.5395010395010393, "grad_norm": 3.9343323707580566, "learning_rate": 5.470085470085471e-06, "loss": 0.1029, "step": 2443 }, { "epoch": 2.5405405405405403, "grad_norm": 8.639856338500977, "learning_rate": 5.467775467775468e-06, "loss": 0.6861, "step": 2444 }, { "epoch": 2.5415800415800414, "grad_norm": 3.257338047027588, "learning_rate": 5.465465465465466e-06, "loss": 0.1017, "step": 2445 }, { "epoch": 2.5426195426195424, "grad_norm": 0.10939653217792511, "learning_rate": 5.463155463155464e-06, "loss": 0.0017, "step": 2446 }, { "epoch": 2.5436590436590434, "grad_norm": 0.6103467345237732, "learning_rate": 5.4608454608454605e-06, "loss": 0.0167, "step": 2447 }, { "epoch": 2.544698544698545, "grad_norm": 0.9619538187980652, "learning_rate": 5.458535458535459e-06, "loss": 0.015, "step": 2448 }, { "epoch": 2.545738045738046, "grad_norm": 12.676862716674805, "learning_rate": 5.456225456225457e-06, "loss": 1.0984, "step": 2449 }, { "epoch": 2.546777546777547, "grad_norm": 13.927125930786133, "learning_rate": 5.453915453915455e-06, "loss": 1.5171, "step": 2450 }, { "epoch": 2.547817047817048, "grad_norm": 8.507790565490723, "learning_rate": 5.4516054516054515e-06, "loss": 0.2561, "step": 2451 }, { "epoch": 2.548856548856549, "grad_norm": 10.253948211669922, "learning_rate": 5.44929544929545e-06, "loss": 0.6676, "step": 2452 }, { "epoch": 2.54989604989605, 
"grad_norm": 3.853451728820801, "learning_rate": 5.446985446985448e-06, "loss": 0.1021, "step": 2453 }, { "epoch": 2.550935550935551, "grad_norm": 7.904906749725342, "learning_rate": 5.444675444675444e-06, "loss": 1.0231, "step": 2454 }, { "epoch": 2.551975051975052, "grad_norm": 10.921005249023438, "learning_rate": 5.442365442365443e-06, "loss": 0.4457, "step": 2455 }, { "epoch": 2.553014553014553, "grad_norm": 7.966489315032959, "learning_rate": 5.440055440055441e-06, "loss": 0.8084, "step": 2456 }, { "epoch": 2.554054054054054, "grad_norm": 0.3931668698787689, "learning_rate": 5.437745437745439e-06, "loss": 0.0101, "step": 2457 }, { "epoch": 2.555093555093555, "grad_norm": 0.6197918653488159, "learning_rate": 5.4354354354354355e-06, "loss": 0.0187, "step": 2458 }, { "epoch": 2.556133056133056, "grad_norm": 5.144542217254639, "learning_rate": 5.433125433125434e-06, "loss": 0.1461, "step": 2459 }, { "epoch": 2.5571725571725574, "grad_norm": 0.13302533328533173, "learning_rate": 5.430815430815432e-06, "loss": 0.0023, "step": 2460 }, { "epoch": 2.5582120582120584, "grad_norm": 0.4320324659347534, "learning_rate": 5.428505428505429e-06, "loss": 0.0132, "step": 2461 }, { "epoch": 2.5592515592515594, "grad_norm": 11.432194709777832, "learning_rate": 5.4261954261954265e-06, "loss": 1.0018, "step": 2462 }, { "epoch": 2.5602910602910605, "grad_norm": 6.14457893371582, "learning_rate": 5.423885423885425e-06, "loss": 0.2512, "step": 2463 }, { "epoch": 2.5613305613305615, "grad_norm": 10.37625503540039, "learning_rate": 5.421575421575422e-06, "loss": 0.8207, "step": 2464 }, { "epoch": 2.5623700623700625, "grad_norm": 7.509644508361816, "learning_rate": 5.4192654192654194e-06, "loss": 1.0264, "step": 2465 }, { "epoch": 2.5634095634095635, "grad_norm": 14.384940147399902, "learning_rate": 5.416955416955418e-06, "loss": 0.8239, "step": 2466 }, { "epoch": 2.5644490644490645, "grad_norm": 6.205191612243652, "learning_rate": 5.414645414645415e-06, "loss": 0.1594, "step": 2467 }, { 
"epoch": 2.5654885654885655, "grad_norm": 9.01181411743164, "learning_rate": 5.412335412335413e-06, "loss": 0.297, "step": 2468 }, { "epoch": 2.5665280665280665, "grad_norm": 1.9883480072021484, "learning_rate": 5.4100254100254105e-06, "loss": 0.0604, "step": 2469 }, { "epoch": 2.5675675675675675, "grad_norm": 0.9856329560279846, "learning_rate": 5.407715407715408e-06, "loss": 0.0192, "step": 2470 }, { "epoch": 2.5686070686070686, "grad_norm": 0.8478329181671143, "learning_rate": 5.405405405405406e-06, "loss": 0.0229, "step": 2471 }, { "epoch": 2.5696465696465696, "grad_norm": 0.5473334789276123, "learning_rate": 5.403095403095403e-06, "loss": 0.0162, "step": 2472 }, { "epoch": 2.5706860706860706, "grad_norm": 11.216007232666016, "learning_rate": 5.400785400785401e-06, "loss": 0.6939, "step": 2473 }, { "epoch": 2.5717255717255716, "grad_norm": 0.467229962348938, "learning_rate": 5.398475398475399e-06, "loss": 0.0103, "step": 2474 }, { "epoch": 2.5727650727650726, "grad_norm": 1.6031438112258911, "learning_rate": 5.396165396165397e-06, "loss": 0.0448, "step": 2475 }, { "epoch": 2.5738045738045736, "grad_norm": 8.846580505371094, "learning_rate": 5.393855393855394e-06, "loss": 0.7402, "step": 2476 }, { "epoch": 2.5748440748440746, "grad_norm": 4.420310020446777, "learning_rate": 5.391545391545392e-06, "loss": 0.1099, "step": 2477 }, { "epoch": 2.5758835758835756, "grad_norm": 9.618853569030762, "learning_rate": 5.38923538923539e-06, "loss": 0.643, "step": 2478 }, { "epoch": 2.5769230769230766, "grad_norm": 8.873306274414062, "learning_rate": 5.386925386925388e-06, "loss": 0.7841, "step": 2479 }, { "epoch": 2.577962577962578, "grad_norm": 18.581398010253906, "learning_rate": 5.384615384615385e-06, "loss": 1.7637, "step": 2480 }, { "epoch": 2.579002079002079, "grad_norm": 10.233980178833008, "learning_rate": 5.382305382305383e-06, "loss": 0.764, "step": 2481 }, { "epoch": 2.58004158004158, "grad_norm": 1.2297041416168213, "learning_rate": 5.379995379995381e-06, "loss": 
0.0358, "step": 2482 }, { "epoch": 2.581081081081081, "grad_norm": 6.728565216064453, "learning_rate": 5.3776853776853775e-06, "loss": 0.5458, "step": 2483 }, { "epoch": 2.582120582120582, "grad_norm": 3.641315221786499, "learning_rate": 5.375375375375376e-06, "loss": 0.04, "step": 2484 }, { "epoch": 2.583160083160083, "grad_norm": 0.34038031101226807, "learning_rate": 5.373065373065374e-06, "loss": 0.0113, "step": 2485 }, { "epoch": 2.584199584199584, "grad_norm": 0.9422138333320618, "learning_rate": 5.370755370755371e-06, "loss": 0.0299, "step": 2486 }, { "epoch": 2.585239085239085, "grad_norm": 0.12205638736486435, "learning_rate": 5.368445368445369e-06, "loss": 0.0042, "step": 2487 }, { "epoch": 2.586278586278586, "grad_norm": 7.378511905670166, "learning_rate": 5.366135366135367e-06, "loss": 0.2885, "step": 2488 }, { "epoch": 2.587318087318087, "grad_norm": 0.26321983337402344, "learning_rate": 5.363825363825364e-06, "loss": 0.007, "step": 2489 }, { "epoch": 2.5883575883575882, "grad_norm": 6.9181342124938965, "learning_rate": 5.3615153615153615e-06, "loss": 0.2783, "step": 2490 }, { "epoch": 2.5893970893970892, "grad_norm": 0.12993164360523224, "learning_rate": 5.35920535920536e-06, "loss": 0.0037, "step": 2491 }, { "epoch": 2.5904365904365907, "grad_norm": 0.17028915882110596, "learning_rate": 5.356895356895357e-06, "loss": 0.0057, "step": 2492 }, { "epoch": 2.5914760914760917, "grad_norm": 5.357726573944092, "learning_rate": 5.354585354585355e-06, "loss": 0.553, "step": 2493 }, { "epoch": 2.5925155925155927, "grad_norm": 0.08897285163402557, "learning_rate": 5.3522753522753526e-06, "loss": 0.0022, "step": 2494 }, { "epoch": 2.5935550935550937, "grad_norm": 9.253763198852539, "learning_rate": 5.349965349965351e-06, "loss": 0.6422, "step": 2495 }, { "epoch": 2.5945945945945947, "grad_norm": 0.5697972774505615, "learning_rate": 5.347655347655348e-06, "loss": 0.0165, "step": 2496 }, { "epoch": 2.5956340956340958, "grad_norm": 4.335848331451416, "learning_rate": 
5.345345345345346e-06, "loss": 0.2975, "step": 2497 }, { "epoch": 2.5966735966735968, "grad_norm": 4.842863082885742, "learning_rate": 5.343035343035344e-06, "loss": 0.1894, "step": 2498 }, { "epoch": 2.5977130977130978, "grad_norm": 0.643163800239563, "learning_rate": 5.340725340725341e-06, "loss": 0.0129, "step": 2499 }, { "epoch": 2.598752598752599, "grad_norm": 6.766790390014648, "learning_rate": 5.338415338415339e-06, "loss": 0.4936, "step": 2500 }, { "epoch": 2.5997920997921, "grad_norm": 0.04399247094988823, "learning_rate": 5.3361053361053365e-06, "loss": 0.0013, "step": 2501 }, { "epoch": 2.600831600831601, "grad_norm": 9.183923721313477, "learning_rate": 5.333795333795334e-06, "loss": 0.4783, "step": 2502 }, { "epoch": 2.601871101871102, "grad_norm": 4.868300437927246, "learning_rate": 5.331485331485332e-06, "loss": 0.3321, "step": 2503 }, { "epoch": 2.602910602910603, "grad_norm": 7.282240867614746, "learning_rate": 5.32917532917533e-06, "loss": 0.5301, "step": 2504 }, { "epoch": 2.603950103950104, "grad_norm": 3.4226865768432617, "learning_rate": 5.326865326865327e-06, "loss": 0.1984, "step": 2505 }, { "epoch": 2.604989604989605, "grad_norm": 4.259052276611328, "learning_rate": 5.324555324555325e-06, "loss": 0.2619, "step": 2506 }, { "epoch": 2.606029106029106, "grad_norm": 6.480755805969238, "learning_rate": 5.322245322245323e-06, "loss": 0.3234, "step": 2507 }, { "epoch": 2.607068607068607, "grad_norm": 0.06338009238243103, "learning_rate": 5.31993531993532e-06, "loss": 0.0018, "step": 2508 }, { "epoch": 2.608108108108108, "grad_norm": 3.376009464263916, "learning_rate": 5.317625317625318e-06, "loss": 0.1565, "step": 2509 }, { "epoch": 2.609147609147609, "grad_norm": 2.3504629135131836, "learning_rate": 5.315315315315316e-06, "loss": 0.0513, "step": 2510 }, { "epoch": 2.61018711018711, "grad_norm": 3.9377503395080566, "learning_rate": 5.313005313005314e-06, "loss": 0.3037, "step": 2511 }, { "epoch": 2.6112266112266114, "grad_norm": 2.743152141571045, 
"learning_rate": 5.310695310695311e-06, "loss": 0.1154, "step": 2512 }, { "epoch": 2.6122661122661124, "grad_norm": 1.6104410886764526, "learning_rate": 5.308385308385309e-06, "loss": 0.0512, "step": 2513 }, { "epoch": 2.6133056133056134, "grad_norm": 0.4035050868988037, "learning_rate": 5.306075306075307e-06, "loss": 0.0109, "step": 2514 }, { "epoch": 2.6143451143451144, "grad_norm": 1.360506534576416, "learning_rate": 5.303765303765304e-06, "loss": 0.0475, "step": 2515 }, { "epoch": 2.6153846153846154, "grad_norm": 9.4105863571167, "learning_rate": 5.301455301455302e-06, "loss": 1.3063, "step": 2516 }, { "epoch": 2.6164241164241164, "grad_norm": 5.020283222198486, "learning_rate": 5.2991452991453e-06, "loss": 0.2818, "step": 2517 }, { "epoch": 2.6174636174636174, "grad_norm": 0.8245450258255005, "learning_rate": 5.296835296835297e-06, "loss": 0.0157, "step": 2518 }, { "epoch": 2.6185031185031185, "grad_norm": 5.981417655944824, "learning_rate": 5.294525294525295e-06, "loss": 0.1196, "step": 2519 }, { "epoch": 2.6195426195426195, "grad_norm": 1.7131268978118896, "learning_rate": 5.292215292215293e-06, "loss": 0.0433, "step": 2520 }, { "epoch": 2.6205821205821205, "grad_norm": 8.64378833770752, "learning_rate": 5.28990528990529e-06, "loss": 0.5102, "step": 2521 }, { "epoch": 2.6216216216216215, "grad_norm": 11.704745292663574, "learning_rate": 5.287595287595288e-06, "loss": 0.6297, "step": 2522 }, { "epoch": 2.6226611226611225, "grad_norm": 8.039158821105957, "learning_rate": 5.285285285285286e-06, "loss": 0.3796, "step": 2523 }, { "epoch": 2.623700623700624, "grad_norm": 13.407707214355469, "learning_rate": 5.282975282975283e-06, "loss": 0.5751, "step": 2524 }, { "epoch": 2.624740124740125, "grad_norm": 4.56635046005249, "learning_rate": 5.280665280665281e-06, "loss": 0.2088, "step": 2525 }, { "epoch": 2.625779625779626, "grad_norm": 5.892931938171387, "learning_rate": 5.2783552783552786e-06, "loss": 0.2088, "step": 2526 }, { "epoch": 2.626819126819127, 
"grad_norm": 5.208812713623047, "learning_rate": 5.276045276045276e-06, "loss": 0.3068, "step": 2527 }, { "epoch": 2.627858627858628, "grad_norm": 1.0516034364700317, "learning_rate": 5.273735273735274e-06, "loss": 0.0139, "step": 2528 }, { "epoch": 2.628898128898129, "grad_norm": 6.1416239738464355, "learning_rate": 5.271425271425272e-06, "loss": 0.3791, "step": 2529 }, { "epoch": 2.62993762993763, "grad_norm": 4.7939863204956055, "learning_rate": 5.269115269115269e-06, "loss": 0.3712, "step": 2530 }, { "epoch": 2.630977130977131, "grad_norm": 3.7652533054351807, "learning_rate": 5.266805266805267e-06, "loss": 0.1008, "step": 2531 }, { "epoch": 2.632016632016632, "grad_norm": 7.166833877563477, "learning_rate": 5.264495264495265e-06, "loss": 0.4992, "step": 2532 }, { "epoch": 2.633056133056133, "grad_norm": 8.62056827545166, "learning_rate": 5.262185262185263e-06, "loss": 0.3708, "step": 2533 }, { "epoch": 2.634095634095634, "grad_norm": 2.3554868698120117, "learning_rate": 5.25987525987526e-06, "loss": 0.0478, "step": 2534 }, { "epoch": 2.635135135135135, "grad_norm": 8.775339126586914, "learning_rate": 5.257565257565258e-06, "loss": 0.3151, "step": 2535 }, { "epoch": 2.636174636174636, "grad_norm": 2.1053552627563477, "learning_rate": 5.255255255255256e-06, "loss": 0.2953, "step": 2536 }, { "epoch": 2.637214137214137, "grad_norm": 4.951183795928955, "learning_rate": 5.252945252945253e-06, "loss": 0.1592, "step": 2537 }, { "epoch": 2.638253638253638, "grad_norm": 0.02089507319033146, "learning_rate": 5.250635250635251e-06, "loss": 0.0005, "step": 2538 }, { "epoch": 2.639293139293139, "grad_norm": 0.20281526446342468, "learning_rate": 5.248325248325249e-06, "loss": 0.0041, "step": 2539 }, { "epoch": 2.64033264033264, "grad_norm": 7.6722002029418945, "learning_rate": 5.246015246015247e-06, "loss": 0.4628, "step": 2540 }, { "epoch": 2.641372141372141, "grad_norm": 9.445036888122559, "learning_rate": 5.243705243705244e-06, "loss": 0.8533, "step": 2541 }, { "epoch": 
2.642411642411642, "grad_norm": 0.6576030254364014, "learning_rate": 5.241395241395242e-06, "loss": 0.0151, "step": 2542 }, { "epoch": 2.643451143451143, "grad_norm": 5.462561130523682, "learning_rate": 5.23908523908524e-06, "loss": 0.1482, "step": 2543 }, { "epoch": 2.6444906444906446, "grad_norm": 0.2012774497270584, "learning_rate": 5.236775236775237e-06, "loss": 0.0048, "step": 2544 }, { "epoch": 2.6455301455301456, "grad_norm": 6.6424174308776855, "learning_rate": 5.234465234465235e-06, "loss": 0.2679, "step": 2545 }, { "epoch": 2.6465696465696467, "grad_norm": 7.657466411590576, "learning_rate": 5.232155232155233e-06, "loss": 0.2041, "step": 2546 }, { "epoch": 2.6476091476091477, "grad_norm": 0.44189453125, "learning_rate": 5.22984522984523e-06, "loss": 0.013, "step": 2547 }, { "epoch": 2.6486486486486487, "grad_norm": 1.6354267597198486, "learning_rate": 5.227535227535228e-06, "loss": 0.0265, "step": 2548 }, { "epoch": 2.6496881496881497, "grad_norm": 4.967313766479492, "learning_rate": 5.225225225225226e-06, "loss": 0.1779, "step": 2549 }, { "epoch": 2.6507276507276507, "grad_norm": 0.6553028225898743, "learning_rate": 5.222915222915223e-06, "loss": 0.0178, "step": 2550 }, { "epoch": 2.6517671517671517, "grad_norm": 5.932888031005859, "learning_rate": 5.2206052206052215e-06, "loss": 0.2465, "step": 2551 }, { "epoch": 2.6528066528066527, "grad_norm": 1.0269923210144043, "learning_rate": 5.218295218295219e-06, "loss": 0.0224, "step": 2552 }, { "epoch": 2.6538461538461537, "grad_norm": 4.396491050720215, "learning_rate": 5.215985215985216e-06, "loss": 0.1572, "step": 2553 }, { "epoch": 2.6548856548856548, "grad_norm": 0.5629084706306458, "learning_rate": 5.213675213675214e-06, "loss": 0.0216, "step": 2554 }, { "epoch": 2.6559251559251558, "grad_norm": 0.1629319041967392, "learning_rate": 5.211365211365212e-06, "loss": 0.0029, "step": 2555 }, { "epoch": 2.6569646569646572, "grad_norm": 8.487727165222168, "learning_rate": 5.209055209055209e-06, "loss": 1.2105, 
"step": 2556 }, { "epoch": 2.6580041580041582, "grad_norm": 6.604908466339111, "learning_rate": 5.206745206745207e-06, "loss": 0.3326, "step": 2557 }, { "epoch": 2.6590436590436592, "grad_norm": 9.358423233032227, "learning_rate": 5.2044352044352054e-06, "loss": 0.6385, "step": 2558 }, { "epoch": 2.6600831600831603, "grad_norm": 0.8450285196304321, "learning_rate": 5.202125202125202e-06, "loss": 0.013, "step": 2559 }, { "epoch": 2.6611226611226613, "grad_norm": 8.273629188537598, "learning_rate": 5.1998151998152e-06, "loss": 0.2557, "step": 2560 }, { "epoch": 2.6621621621621623, "grad_norm": 0.030706413090229034, "learning_rate": 5.197505197505198e-06, "loss": 0.0007, "step": 2561 }, { "epoch": 2.6632016632016633, "grad_norm": 3.37729811668396, "learning_rate": 5.195195195195195e-06, "loss": 0.1005, "step": 2562 }, { "epoch": 2.6642411642411643, "grad_norm": 0.17317216098308563, "learning_rate": 5.192885192885193e-06, "loss": 0.0039, "step": 2563 }, { "epoch": 2.6652806652806653, "grad_norm": 3.6512608528137207, "learning_rate": 5.190575190575191e-06, "loss": 0.1058, "step": 2564 }, { "epoch": 2.6663201663201663, "grad_norm": 2.5752034187316895, "learning_rate": 5.188265188265189e-06, "loss": 0.1316, "step": 2565 }, { "epoch": 2.6673596673596673, "grad_norm": 1.0415226221084595, "learning_rate": 5.185955185955186e-06, "loss": 0.0336, "step": 2566 }, { "epoch": 2.6683991683991684, "grad_norm": 3.302722454071045, "learning_rate": 5.183645183645184e-06, "loss": 0.0712, "step": 2567 }, { "epoch": 2.6694386694386694, "grad_norm": 1.884563684463501, "learning_rate": 5.181335181335182e-06, "loss": 0.1313, "step": 2568 }, { "epoch": 2.6704781704781704, "grad_norm": 1.5982835292816162, "learning_rate": 5.17902517902518e-06, "loss": 0.0511, "step": 2569 }, { "epoch": 2.6715176715176714, "grad_norm": 5.131686687469482, "learning_rate": 5.176715176715177e-06, "loss": 0.2519, "step": 2570 }, { "epoch": 2.6725571725571724, "grad_norm": 7.3811564445495605, "learning_rate": 
5.174405174405175e-06, "loss": 0.4257, "step": 2571 }, { "epoch": 2.6735966735966734, "grad_norm": 0.13512037694454193, "learning_rate": 5.1720951720951725e-06, "loss": 0.0034, "step": 2572 }, { "epoch": 2.6746361746361744, "grad_norm": 3.146843194961548, "learning_rate": 5.16978516978517e-06, "loss": 0.123, "step": 2573 }, { "epoch": 2.6756756756756754, "grad_norm": 0.004672181326895952, "learning_rate": 5.167475167475168e-06, "loss": 0.0001, "step": 2574 }, { "epoch": 2.6767151767151764, "grad_norm": 1.4410141706466675, "learning_rate": 5.165165165165165e-06, "loss": 0.0345, "step": 2575 }, { "epoch": 2.677754677754678, "grad_norm": 0.6777349710464478, "learning_rate": 5.1628551628551635e-06, "loss": 0.0282, "step": 2576 }, { "epoch": 2.678794178794179, "grad_norm": 0.03477311134338379, "learning_rate": 5.160545160545161e-06, "loss": 0.0012, "step": 2577 }, { "epoch": 2.67983367983368, "grad_norm": 4.510110378265381, "learning_rate": 5.158235158235158e-06, "loss": 0.0668, "step": 2578 }, { "epoch": 2.680873180873181, "grad_norm": 13.6720609664917, "learning_rate": 5.155925155925156e-06, "loss": 2.3046, "step": 2579 }, { "epoch": 2.681912681912682, "grad_norm": 0.8042404055595398, "learning_rate": 5.153615153615154e-06, "loss": 0.0317, "step": 2580 }, { "epoch": 2.682952182952183, "grad_norm": 2.2485601902008057, "learning_rate": 5.151305151305151e-06, "loss": 0.0665, "step": 2581 }, { "epoch": 2.683991683991684, "grad_norm": 0.08799292892217636, "learning_rate": 5.148995148995149e-06, "loss": 0.0025, "step": 2582 }, { "epoch": 2.685031185031185, "grad_norm": 2.3404698371887207, "learning_rate": 5.1466851466851475e-06, "loss": 0.0433, "step": 2583 }, { "epoch": 2.686070686070686, "grad_norm": 0.5272285342216492, "learning_rate": 5.144375144375144e-06, "loss": 0.0118, "step": 2584 }, { "epoch": 2.687110187110187, "grad_norm": 0.020404517650604248, "learning_rate": 5.142065142065142e-06, "loss": 0.0006, "step": 2585 }, { "epoch": 2.688149688149688, "grad_norm": 
0.5791797041893005, "learning_rate": 5.13975513975514e-06, "loss": 0.0176, "step": 2586 }, { "epoch": 2.689189189189189, "grad_norm": 7.628298282623291, "learning_rate": 5.1374451374451386e-06, "loss": 0.3845, "step": 2587 }, { "epoch": 2.6902286902286905, "grad_norm": 3.025831460952759, "learning_rate": 5.135135135135135e-06, "loss": 0.2582, "step": 2588 }, { "epoch": 2.6912681912681915, "grad_norm": 6.124800682067871, "learning_rate": 5.132825132825133e-06, "loss": 0.1955, "step": 2589 }, { "epoch": 2.6923076923076925, "grad_norm": 2.600257158279419, "learning_rate": 5.1305151305151314e-06, "loss": 0.0648, "step": 2590 }, { "epoch": 2.6933471933471935, "grad_norm": 4.131895542144775, "learning_rate": 5.128205128205128e-06, "loss": 0.097, "step": 2591 }, { "epoch": 2.6943866943866945, "grad_norm": 0.0008171483059413731, "learning_rate": 5.125895125895126e-06, "loss": 0.0, "step": 2592 }, { "epoch": 2.6954261954261955, "grad_norm": 14.881963729858398, "learning_rate": 5.123585123585124e-06, "loss": 1.7013, "step": 2593 }, { "epoch": 2.6964656964656966, "grad_norm": 2.45265531539917, "learning_rate": 5.1212751212751225e-06, "loss": 0.0527, "step": 2594 }, { "epoch": 2.6975051975051976, "grad_norm": 11.552932739257812, "learning_rate": 5.118965118965119e-06, "loss": 0.6112, "step": 2595 }, { "epoch": 2.6985446985446986, "grad_norm": 0.15990565717220306, "learning_rate": 5.116655116655117e-06, "loss": 0.0037, "step": 2596 }, { "epoch": 2.6995841995841996, "grad_norm": 0.015272875316441059, "learning_rate": 5.114345114345115e-06, "loss": 0.0003, "step": 2597 }, { "epoch": 2.7006237006237006, "grad_norm": 10.404386520385742, "learning_rate": 5.112035112035112e-06, "loss": 1.4427, "step": 2598 }, { "epoch": 2.7016632016632016, "grad_norm": 3.653311252593994, "learning_rate": 5.10972510972511e-06, "loss": 0.1365, "step": 2599 }, { "epoch": 2.7027027027027026, "grad_norm": 6.013213634490967, "learning_rate": 5.107415107415108e-06, "loss": 0.1969, "step": 2600 }, { "epoch": 
2.7037422037422036, "grad_norm": 0.7424914240837097, "learning_rate": 5.105105105105106e-06, "loss": 0.0127, "step": 2601 }, { "epoch": 2.7047817047817047, "grad_norm": 0.16230179369449615, "learning_rate": 5.102795102795103e-06, "loss": 0.0046, "step": 2602 }, { "epoch": 2.7058212058212057, "grad_norm": 8.930994033813477, "learning_rate": 5.100485100485101e-06, "loss": 0.5425, "step": 2603 }, { "epoch": 2.7068607068607067, "grad_norm": 1.592777132987976, "learning_rate": 5.0981750981750985e-06, "loss": 0.029, "step": 2604 }, { "epoch": 2.7079002079002077, "grad_norm": 2.0030179023742676, "learning_rate": 5.095865095865097e-06, "loss": 0.042, "step": 2605 }, { "epoch": 2.7089397089397087, "grad_norm": 4.972201824188232, "learning_rate": 5.093555093555094e-06, "loss": 0.2896, "step": 2606 }, { "epoch": 2.7099792099792097, "grad_norm": 0.016869381070137024, "learning_rate": 5.091245091245091e-06, "loss": 0.0004, "step": 2607 }, { "epoch": 2.711018711018711, "grad_norm": 2.6783928871154785, "learning_rate": 5.0889350889350895e-06, "loss": 0.1094, "step": 2608 }, { "epoch": 2.712058212058212, "grad_norm": 11.518509864807129, "learning_rate": 5.086625086625087e-06, "loss": 0.5764, "step": 2609 }, { "epoch": 2.713097713097713, "grad_norm": 2.1756484508514404, "learning_rate": 5.084315084315084e-06, "loss": 0.0298, "step": 2610 }, { "epoch": 2.714137214137214, "grad_norm": 2.0496127605438232, "learning_rate": 5.0820050820050824e-06, "loss": 0.034, "step": 2611 }, { "epoch": 2.715176715176715, "grad_norm": 5.0031561851501465, "learning_rate": 5.079695079695081e-06, "loss": 0.1148, "step": 2612 }, { "epoch": 2.7162162162162162, "grad_norm": 3.1190743446350098, "learning_rate": 5.077385077385077e-06, "loss": 0.0171, "step": 2613 }, { "epoch": 2.7172557172557172, "grad_norm": 2.5663068294525146, "learning_rate": 5.075075075075075e-06, "loss": 0.0828, "step": 2614 }, { "epoch": 2.7182952182952183, "grad_norm": 10.824786186218262, "learning_rate": 5.0727650727650735e-06, 
"loss": 0.263, "step": 2615 }, { "epoch": 2.7193347193347193, "grad_norm": 6.609222412109375, "learning_rate": 5.07045507045507e-06, "loss": 0.4434, "step": 2616 }, { "epoch": 2.7203742203742203, "grad_norm": 4.525443077087402, "learning_rate": 5.068145068145068e-06, "loss": 0.2959, "step": 2617 }, { "epoch": 2.7214137214137213, "grad_norm": 0.9088320732116699, "learning_rate": 5.065835065835066e-06, "loss": 0.0218, "step": 2618 }, { "epoch": 2.7224532224532223, "grad_norm": 11.098006248474121, "learning_rate": 5.0635250635250646e-06, "loss": 0.5951, "step": 2619 }, { "epoch": 2.7234927234927238, "grad_norm": 7.352163314819336, "learning_rate": 5.061215061215061e-06, "loss": 0.4864, "step": 2620 }, { "epoch": 2.7245322245322248, "grad_norm": 0.17772333323955536, "learning_rate": 5.058905058905059e-06, "loss": 0.0057, "step": 2621 }, { "epoch": 2.725571725571726, "grad_norm": 0.08428037911653519, "learning_rate": 5.0565950565950574e-06, "loss": 0.0021, "step": 2622 }, { "epoch": 2.726611226611227, "grad_norm": 0.06792586296796799, "learning_rate": 5.054285054285056e-06, "loss": 0.0015, "step": 2623 }, { "epoch": 2.727650727650728, "grad_norm": 11.568456649780273, "learning_rate": 5.051975051975052e-06, "loss": 1.3688, "step": 2624 }, { "epoch": 2.728690228690229, "grad_norm": 0.07964438199996948, "learning_rate": 5.04966504966505e-06, "loss": 0.0021, "step": 2625 }, { "epoch": 2.72972972972973, "grad_norm": 0.07185760885477066, "learning_rate": 5.0473550473550485e-06, "loss": 0.0014, "step": 2626 }, { "epoch": 2.730769230769231, "grad_norm": 6.794638633728027, "learning_rate": 5.045045045045045e-06, "loss": 0.221, "step": 2627 }, { "epoch": 2.731808731808732, "grad_norm": 7.2844557762146, "learning_rate": 5.042735042735043e-06, "loss": 0.2326, "step": 2628 }, { "epoch": 2.732848232848233, "grad_norm": 2.7436108589172363, "learning_rate": 5.040425040425041e-06, "loss": 0.0487, "step": 2629 }, { "epoch": 2.733887733887734, "grad_norm": 1.4171820878982544, 
"learning_rate": 5.038115038115039e-06, "loss": 0.0206, "step": 2630 }, { "epoch": 2.734927234927235, "grad_norm": 11.093990325927734, "learning_rate": 5.035805035805036e-06, "loss": 1.1055, "step": 2631 }, { "epoch": 2.735966735966736, "grad_norm": 5.9497599601745605, "learning_rate": 5.033495033495034e-06, "loss": 0.3682, "step": 2632 }, { "epoch": 2.737006237006237, "grad_norm": 5.0965423583984375, "learning_rate": 5.031185031185032e-06, "loss": 0.1029, "step": 2633 }, { "epoch": 2.738045738045738, "grad_norm": 3.5077013969421387, "learning_rate": 5.028875028875029e-06, "loss": 0.1539, "step": 2634 }, { "epoch": 2.739085239085239, "grad_norm": 0.2913844585418701, "learning_rate": 5.026565026565027e-06, "loss": 0.0082, "step": 2635 }, { "epoch": 2.74012474012474, "grad_norm": 5.6061248779296875, "learning_rate": 5.0242550242550245e-06, "loss": 0.2412, "step": 2636 }, { "epoch": 2.741164241164241, "grad_norm": 3.2970173358917236, "learning_rate": 5.021945021945023e-06, "loss": 0.1059, "step": 2637 }, { "epoch": 2.742203742203742, "grad_norm": 4.574480056762695, "learning_rate": 5.01963501963502e-06, "loss": 0.109, "step": 2638 }, { "epoch": 2.743243243243243, "grad_norm": 7.852790355682373, "learning_rate": 5.017325017325017e-06, "loss": 0.4485, "step": 2639 }, { "epoch": 2.7442827442827444, "grad_norm": 1.053780198097229, "learning_rate": 5.0150150150150156e-06, "loss": 0.0278, "step": 2640 }, { "epoch": 2.7453222453222454, "grad_norm": 6.505682468414307, "learning_rate": 5.012705012705014e-06, "loss": 0.2793, "step": 2641 }, { "epoch": 2.7463617463617465, "grad_norm": 9.323391914367676, "learning_rate": 5.01039501039501e-06, "loss": 0.6098, "step": 2642 }, { "epoch": 2.7474012474012475, "grad_norm": 8.641895294189453, "learning_rate": 5.0080850080850084e-06, "loss": 0.7653, "step": 2643 }, { "epoch": 2.7484407484407485, "grad_norm": 9.316137313842773, "learning_rate": 5.005775005775007e-06, "loss": 0.671, "step": 2644 }, { "epoch": 2.7494802494802495, 
"grad_norm": 2.7795755863189697, "learning_rate": 5.003465003465003e-06, "loss": 0.0525, "step": 2645 }, { "epoch": 2.7505197505197505, "grad_norm": 0.08307638019323349, "learning_rate": 5.001155001155001e-06, "loss": 0.0021, "step": 2646 }, { "epoch": 2.7515592515592515, "grad_norm": 0.11343641579151154, "learning_rate": 4.9988449988449995e-06, "loss": 0.0022, "step": 2647 }, { "epoch": 2.7525987525987525, "grad_norm": 1.186592936515808, "learning_rate": 4.996534996534997e-06, "loss": 0.0236, "step": 2648 }, { "epoch": 2.7536382536382535, "grad_norm": 0.11585807055234909, "learning_rate": 4.994224994224995e-06, "loss": 0.0026, "step": 2649 }, { "epoch": 2.7546777546777546, "grad_norm": 3.3414664268493652, "learning_rate": 4.991914991914992e-06, "loss": 0.0571, "step": 2650 }, { "epoch": 2.7557172557172556, "grad_norm": 4.851938724517822, "learning_rate": 4.9896049896049906e-06, "loss": 0.3162, "step": 2651 }, { "epoch": 2.756756756756757, "grad_norm": 0.75993412733078, "learning_rate": 4.987294987294988e-06, "loss": 0.0164, "step": 2652 }, { "epoch": 2.757796257796258, "grad_norm": 9.393717765808105, "learning_rate": 4.984984984984985e-06, "loss": 0.3516, "step": 2653 }, { "epoch": 2.758835758835759, "grad_norm": 2.778585195541382, "learning_rate": 4.9826749826749835e-06, "loss": 0.1368, "step": 2654 }, { "epoch": 2.75987525987526, "grad_norm": 1.8893013000488281, "learning_rate": 4.980364980364981e-06, "loss": 0.0591, "step": 2655 }, { "epoch": 2.760914760914761, "grad_norm": 0.5797641277313232, "learning_rate": 4.978054978054979e-06, "loss": 0.0104, "step": 2656 }, { "epoch": 2.761954261954262, "grad_norm": 10.824493408203125, "learning_rate": 4.975744975744976e-06, "loss": 0.3103, "step": 2657 }, { "epoch": 2.762993762993763, "grad_norm": 1.135068416595459, "learning_rate": 4.973434973434974e-06, "loss": 0.029, "step": 2658 }, { "epoch": 2.764033264033264, "grad_norm": 7.167160987854004, "learning_rate": 4.971124971124972e-06, "loss": 0.3448, "step": 2659 }, { 
"epoch": 2.765072765072765, "grad_norm": 0.10508804768323898, "learning_rate": 4.968814968814969e-06, "loss": 0.0025, "step": 2660 }, { "epoch": 2.766112266112266, "grad_norm": 13.092395782470703, "learning_rate": 4.9665049665049666e-06, "loss": 1.2022, "step": 2661 }, { "epoch": 2.767151767151767, "grad_norm": 0.007611572276800871, "learning_rate": 4.964194964194965e-06, "loss": 0.0002, "step": 2662 }, { "epoch": 2.768191268191268, "grad_norm": 2.667224884033203, "learning_rate": 4.961884961884962e-06, "loss": 0.0571, "step": 2663 }, { "epoch": 2.769230769230769, "grad_norm": 2.348841428756714, "learning_rate": 4.9595749595749594e-06, "loss": 0.0717, "step": 2664 }, { "epoch": 2.77027027027027, "grad_norm": 1.9521201848983765, "learning_rate": 4.957264957264958e-06, "loss": 0.0525, "step": 2665 }, { "epoch": 2.771309771309771, "grad_norm": 9.152738571166992, "learning_rate": 4.954954954954955e-06, "loss": 0.5686, "step": 2666 }, { "epoch": 2.772349272349272, "grad_norm": 2.37243390083313, "learning_rate": 4.952644952644953e-06, "loss": 0.0804, "step": 2667 }, { "epoch": 2.773388773388773, "grad_norm": 7.31588888168335, "learning_rate": 4.9503349503349505e-06, "loss": 0.3661, "step": 2668 }, { "epoch": 2.774428274428274, "grad_norm": 5.044492721557617, "learning_rate": 4.948024948024949e-06, "loss": 0.2454, "step": 2669 }, { "epoch": 2.7754677754677752, "grad_norm": 0.23255555331707, "learning_rate": 4.945714945714946e-06, "loss": 0.0058, "step": 2670 }, { "epoch": 2.7765072765072762, "grad_norm": 0.09882773458957672, "learning_rate": 4.943404943404943e-06, "loss": 0.0019, "step": 2671 }, { "epoch": 2.7775467775467777, "grad_norm": 1.9447699785232544, "learning_rate": 4.9410949410949416e-06, "loss": 0.0621, "step": 2672 }, { "epoch": 2.7785862785862787, "grad_norm": 0.6730137467384338, "learning_rate": 4.938784938784939e-06, "loss": 0.0317, "step": 2673 }, { "epoch": 2.7796257796257797, "grad_norm": 9.202969551086426, "learning_rate": 4.936474936474937e-06, "loss": 
0.3289, "step": 2674 }, { "epoch": 2.7806652806652807, "grad_norm": 3.387312173843384, "learning_rate": 4.9341649341649344e-06, "loss": 0.1333, "step": 2675 }, { "epoch": 2.7817047817047817, "grad_norm": 0.5006962418556213, "learning_rate": 4.931854931854933e-06, "loss": 0.0106, "step": 2676 }, { "epoch": 2.7827442827442828, "grad_norm": 11.020606994628906, "learning_rate": 4.92954492954493e-06, "loss": 1.8051, "step": 2677 }, { "epoch": 2.7837837837837838, "grad_norm": 11.423202514648438, "learning_rate": 4.927234927234928e-06, "loss": 0.6795, "step": 2678 }, { "epoch": 2.784823284823285, "grad_norm": 4.255502700805664, "learning_rate": 4.9249249249249255e-06, "loss": 0.334, "step": 2679 }, { "epoch": 2.785862785862786, "grad_norm": 3.1610569953918457, "learning_rate": 4.922614922614923e-06, "loss": 0.1322, "step": 2680 }, { "epoch": 2.786902286902287, "grad_norm": 7.8450469970703125, "learning_rate": 4.920304920304921e-06, "loss": 0.2823, "step": 2681 }, { "epoch": 2.787941787941788, "grad_norm": 4.556313991546631, "learning_rate": 4.917994917994918e-06, "loss": 0.3017, "step": 2682 }, { "epoch": 2.788981288981289, "grad_norm": 10.682342529296875, "learning_rate": 4.915684915684917e-06, "loss": 0.5142, "step": 2683 }, { "epoch": 2.7900207900207903, "grad_norm": 1.8365347385406494, "learning_rate": 4.913374913374914e-06, "loss": 0.0443, "step": 2684 }, { "epoch": 2.7910602910602913, "grad_norm": 0.04636334255337715, "learning_rate": 4.911064911064911e-06, "loss": 0.0008, "step": 2685 }, { "epoch": 2.7920997920997923, "grad_norm": 5.025690078735352, "learning_rate": 4.9087549087549095e-06, "loss": 0.3426, "step": 2686 }, { "epoch": 2.7931392931392933, "grad_norm": 8.523011207580566, "learning_rate": 4.906444906444907e-06, "loss": 0.5284, "step": 2687 }, { "epoch": 2.7941787941787943, "grad_norm": 9.12713623046875, "learning_rate": 4.904134904134904e-06, "loss": 0.393, "step": 2688 }, { "epoch": 2.7952182952182953, "grad_norm": 5.500669479370117, "learning_rate": 
4.901824901824902e-06, "loss": 0.1616, "step": 2689 }, { "epoch": 2.7962577962577964, "grad_norm": 1.8790087699890137, "learning_rate": 4.8995148995149e-06, "loss": 0.0492, "step": 2690 }, { "epoch": 2.7972972972972974, "grad_norm": 0.5691194534301758, "learning_rate": 4.897204897204897e-06, "loss": 0.0147, "step": 2691 }, { "epoch": 2.7983367983367984, "grad_norm": 12.712238311767578, "learning_rate": 4.894894894894895e-06, "loss": 1.2213, "step": 2692 }, { "epoch": 2.7993762993762994, "grad_norm": 0.6193615794181824, "learning_rate": 4.8925848925848926e-06, "loss": 0.0165, "step": 2693 }, { "epoch": 2.8004158004158004, "grad_norm": 7.124426364898682, "learning_rate": 4.890274890274891e-06, "loss": 0.4729, "step": 2694 }, { "epoch": 2.8014553014553014, "grad_norm": 3.7972962856292725, "learning_rate": 4.887964887964888e-06, "loss": 0.1297, "step": 2695 }, { "epoch": 2.8024948024948024, "grad_norm": 8.690361022949219, "learning_rate": 4.885654885654886e-06, "loss": 1.0041, "step": 2696 }, { "epoch": 2.8035343035343034, "grad_norm": 0.5796958804130554, "learning_rate": 4.883344883344884e-06, "loss": 0.012, "step": 2697 }, { "epoch": 2.8045738045738045, "grad_norm": 5.679058074951172, "learning_rate": 4.881034881034881e-06, "loss": 0.2276, "step": 2698 }, { "epoch": 2.8056133056133055, "grad_norm": 0.24074715375900269, "learning_rate": 4.878724878724879e-06, "loss": 0.0077, "step": 2699 }, { "epoch": 2.8066528066528065, "grad_norm": 0.01518248114734888, "learning_rate": 4.8764148764148765e-06, "loss": 0.0003, "step": 2700 }, { "epoch": 2.8076923076923075, "grad_norm": 10.412557601928711, "learning_rate": 4.874104874104875e-06, "loss": 0.38, "step": 2701 }, { "epoch": 2.8087318087318085, "grad_norm": 0.05676966533064842, "learning_rate": 4.871794871794872e-06, "loss": 0.0009, "step": 2702 }, { "epoch": 2.8097713097713095, "grad_norm": 9.158880233764648, "learning_rate": 4.86948486948487e-06, "loss": 0.5184, "step": 2703 }, { "epoch": 2.810810810810811, "grad_norm": 
3.9636144638061523, "learning_rate": 4.8671748671748676e-06, "loss": 0.1219, "step": 2704 }, { "epoch": 2.811850311850312, "grad_norm": 2.21620774269104, "learning_rate": 4.864864864864866e-06, "loss": 0.0783, "step": 2705 }, { "epoch": 2.812889812889813, "grad_norm": 0.4864761233329773, "learning_rate": 4.862554862554863e-06, "loss": 0.0161, "step": 2706 }, { "epoch": 2.813929313929314, "grad_norm": 5.6542158126831055, "learning_rate": 4.8602448602448605e-06, "loss": 0.4343, "step": 2707 }, { "epoch": 2.814968814968815, "grad_norm": 1.9403719902038574, "learning_rate": 4.857934857934859e-06, "loss": 0.1049, "step": 2708 }, { "epoch": 2.816008316008316, "grad_norm": 0.18522211909294128, "learning_rate": 4.855624855624856e-06, "loss": 0.0049, "step": 2709 }, { "epoch": 2.817047817047817, "grad_norm": 0.2546488344669342, "learning_rate": 4.853314853314854e-06, "loss": 0.0072, "step": 2710 }, { "epoch": 2.818087318087318, "grad_norm": 0.18962424993515015, "learning_rate": 4.8510048510048515e-06, "loss": 0.003, "step": 2711 }, { "epoch": 2.819126819126819, "grad_norm": 2.3680453300476074, "learning_rate": 4.84869484869485e-06, "loss": 0.0469, "step": 2712 }, { "epoch": 2.82016632016632, "grad_norm": 0.13189195096492767, "learning_rate": 4.846384846384847e-06, "loss": 0.0038, "step": 2713 }, { "epoch": 2.821205821205821, "grad_norm": 7.136310577392578, "learning_rate": 4.844074844074844e-06, "loss": 0.2235, "step": 2714 }, { "epoch": 2.822245322245322, "grad_norm": 9.05678653717041, "learning_rate": 4.841764841764843e-06, "loss": 0.7505, "step": 2715 }, { "epoch": 2.8232848232848236, "grad_norm": 7.28842306137085, "learning_rate": 4.83945483945484e-06, "loss": 0.5211, "step": 2716 }, { "epoch": 2.8243243243243246, "grad_norm": 5.649138927459717, "learning_rate": 4.837144837144837e-06, "loss": 0.1875, "step": 2717 }, { "epoch": 2.8253638253638256, "grad_norm": 12.433136940002441, "learning_rate": 4.8348348348348355e-06, "loss": 0.8207, "step": 2718 }, { "epoch": 
2.8264033264033266, "grad_norm": 8.163975715637207, "learning_rate": 4.832524832524833e-06, "loss": 0.8473, "step": 2719 }, { "epoch": 2.8274428274428276, "grad_norm": 5.121532917022705, "learning_rate": 4.83021483021483e-06, "loss": 0.1324, "step": 2720 }, { "epoch": 2.8284823284823286, "grad_norm": 0.06569239497184753, "learning_rate": 4.827904827904828e-06, "loss": 0.0021, "step": 2721 }, { "epoch": 2.8295218295218296, "grad_norm": 7.412308216094971, "learning_rate": 4.825594825594826e-06, "loss": 0.7194, "step": 2722 }, { "epoch": 2.8305613305613306, "grad_norm": 2.4917805194854736, "learning_rate": 4.823284823284824e-06, "loss": 0.0472, "step": 2723 }, { "epoch": 2.8316008316008316, "grad_norm": 7.543792724609375, "learning_rate": 4.820974820974821e-06, "loss": 0.5375, "step": 2724 }, { "epoch": 2.8326403326403327, "grad_norm": 0.24890974164009094, "learning_rate": 4.8186648186648186e-06, "loss": 0.0066, "step": 2725 }, { "epoch": 2.8336798336798337, "grad_norm": 4.648128032684326, "learning_rate": 4.816354816354817e-06, "loss": 0.1709, "step": 2726 }, { "epoch": 2.8347193347193347, "grad_norm": 5.207144260406494, "learning_rate": 4.814044814044814e-06, "loss": 0.1652, "step": 2727 }, { "epoch": 2.8357588357588357, "grad_norm": 3.2448599338531494, "learning_rate": 4.811734811734812e-06, "loss": 0.0716, "step": 2728 }, { "epoch": 2.8367983367983367, "grad_norm": 4.957944393157959, "learning_rate": 4.80942480942481e-06, "loss": 0.332, "step": 2729 }, { "epoch": 2.8378378378378377, "grad_norm": 0.13377763330936432, "learning_rate": 4.807114807114808e-06, "loss": 0.0031, "step": 2730 }, { "epoch": 2.8388773388773387, "grad_norm": 9.341425895690918, "learning_rate": 4.804804804804805e-06, "loss": 0.7092, "step": 2731 }, { "epoch": 2.8399168399168397, "grad_norm": 4.316988945007324, "learning_rate": 4.802494802494803e-06, "loss": 0.176, "step": 2732 }, { "epoch": 2.8409563409563408, "grad_norm": 0.034980062395334244, "learning_rate": 4.800184800184801e-06, "loss": 
0.0009, "step": 2733 }, { "epoch": 2.8419958419958418, "grad_norm": 0.17463620007038116, "learning_rate": 4.797874797874798e-06, "loss": 0.0038, "step": 2734 }, { "epoch": 2.8430353430353428, "grad_norm": 4.154630661010742, "learning_rate": 4.795564795564796e-06, "loss": 0.1333, "step": 2735 }, { "epoch": 2.8440748440748442, "grad_norm": 6.52298641204834, "learning_rate": 4.793254793254794e-06, "loss": 0.5034, "step": 2736 }, { "epoch": 2.8451143451143452, "grad_norm": 4.983566761016846, "learning_rate": 4.790944790944792e-06, "loss": 0.2822, "step": 2737 }, { "epoch": 2.8461538461538463, "grad_norm": 4.204535484313965, "learning_rate": 4.788634788634789e-06, "loss": 0.2029, "step": 2738 }, { "epoch": 2.8471933471933473, "grad_norm": 6.820010662078857, "learning_rate": 4.786324786324787e-06, "loss": 0.2191, "step": 2739 }, { "epoch": 2.8482328482328483, "grad_norm": 3.214674949645996, "learning_rate": 4.784014784014785e-06, "loss": 0.152, "step": 2740 }, { "epoch": 2.8492723492723493, "grad_norm": 7.441554546356201, "learning_rate": 4.781704781704782e-06, "loss": 0.3018, "step": 2741 }, { "epoch": 2.8503118503118503, "grad_norm": 6.902109146118164, "learning_rate": 4.77939477939478e-06, "loss": 0.4387, "step": 2742 }, { "epoch": 2.8513513513513513, "grad_norm": 1.1773970127105713, "learning_rate": 4.7770847770847775e-06, "loss": 0.0262, "step": 2743 }, { "epoch": 2.8523908523908523, "grad_norm": 13.740028381347656, "learning_rate": 4.774774774774775e-06, "loss": 0.9091, "step": 2744 }, { "epoch": 2.8534303534303533, "grad_norm": 2.1474075317382812, "learning_rate": 4.772464772464773e-06, "loss": 0.0869, "step": 2745 }, { "epoch": 2.8544698544698544, "grad_norm": 5.147608757019043, "learning_rate": 4.77015477015477e-06, "loss": 0.4671, "step": 2746 }, { "epoch": 2.8555093555093554, "grad_norm": 0.08837344497442245, "learning_rate": 4.767844767844768e-06, "loss": 0.0027, "step": 2747 }, { "epoch": 2.856548856548857, "grad_norm": 9.307243347167969, "learning_rate": 
4.765534765534766e-06, "loss": 0.2708, "step": 2748 }, { "epoch": 2.857588357588358, "grad_norm": 0.17253902554512024, "learning_rate": 4.763224763224763e-06, "loss": 0.003, "step": 2749 }, { "epoch": 2.858627858627859, "grad_norm": 0.9690131545066833, "learning_rate": 4.7609147609147615e-06, "loss": 0.0471, "step": 2750 }, { "epoch": 2.85966735966736, "grad_norm": 10.780189514160156, "learning_rate": 4.758604758604759e-06, "loss": 0.7136, "step": 2751 }, { "epoch": 2.860706860706861, "grad_norm": 0.4083833396434784, "learning_rate": 4.756294756294756e-06, "loss": 0.0103, "step": 2752 }, { "epoch": 2.861746361746362, "grad_norm": 7.011819362640381, "learning_rate": 4.753984753984754e-06, "loss": 0.1977, "step": 2753 }, { "epoch": 2.862785862785863, "grad_norm": 0.6745598316192627, "learning_rate": 4.751674751674752e-06, "loss": 0.0166, "step": 2754 }, { "epoch": 2.863825363825364, "grad_norm": 2.218971014022827, "learning_rate": 4.74936474936475e-06, "loss": 0.0656, "step": 2755 }, { "epoch": 2.864864864864865, "grad_norm": 0.19676561653614044, "learning_rate": 4.747054747054747e-06, "loss": 0.0069, "step": 2756 }, { "epoch": 2.865904365904366, "grad_norm": 8.756501197814941, "learning_rate": 4.7447447447447454e-06, "loss": 0.972, "step": 2757 }, { "epoch": 2.866943866943867, "grad_norm": 0.0635393038392067, "learning_rate": 4.742434742434743e-06, "loss": 0.0023, "step": 2758 }, { "epoch": 2.867983367983368, "grad_norm": 6.238755226135254, "learning_rate": 4.740124740124741e-06, "loss": 0.2034, "step": 2759 }, { "epoch": 2.869022869022869, "grad_norm": 8.922481536865234, "learning_rate": 4.737814737814738e-06, "loss": 0.6653, "step": 2760 }, { "epoch": 2.87006237006237, "grad_norm": 0.8628990650177002, "learning_rate": 4.735504735504736e-06, "loss": 0.0221, "step": 2761 }, { "epoch": 2.871101871101871, "grad_norm": 0.3078502118587494, "learning_rate": 4.733194733194734e-06, "loss": 0.0061, "step": 2762 }, { "epoch": 2.872141372141372, "grad_norm": 
13.62101936340332, "learning_rate": 4.730884730884731e-06, "loss": 0.8565, "step": 2763 }, { "epoch": 2.873180873180873, "grad_norm": 3.0611491203308105, "learning_rate": 4.728574728574729e-06, "loss": 0.0806, "step": 2764 }, { "epoch": 2.874220374220374, "grad_norm": 0.7029672861099243, "learning_rate": 4.726264726264727e-06, "loss": 0.0181, "step": 2765 }, { "epoch": 2.875259875259875, "grad_norm": 0.2899930775165558, "learning_rate": 4.723954723954725e-06, "loss": 0.0055, "step": 2766 }, { "epoch": 2.876299376299376, "grad_norm": 1.9155933856964111, "learning_rate": 4.721644721644722e-06, "loss": 0.0693, "step": 2767 }, { "epoch": 2.8773388773388775, "grad_norm": 8.559905052185059, "learning_rate": 4.71933471933472e-06, "loss": 0.3092, "step": 2768 }, { "epoch": 2.8783783783783785, "grad_norm": 0.37766215205192566, "learning_rate": 4.717024717024718e-06, "loss": 0.0097, "step": 2769 }, { "epoch": 2.8794178794178795, "grad_norm": 0.4770283102989197, "learning_rate": 4.714714714714715e-06, "loss": 0.012, "step": 2770 }, { "epoch": 2.8804573804573805, "grad_norm": 0.04901864379644394, "learning_rate": 4.7124047124047125e-06, "loss": 0.0016, "step": 2771 }, { "epoch": 2.8814968814968815, "grad_norm": 10.537938117980957, "learning_rate": 4.710094710094711e-06, "loss": 0.7172, "step": 2772 }, { "epoch": 2.8825363825363826, "grad_norm": 1.2608181238174438, "learning_rate": 4.707784707784708e-06, "loss": 0.0336, "step": 2773 }, { "epoch": 2.8835758835758836, "grad_norm": 0.531566321849823, "learning_rate": 4.705474705474705e-06, "loss": 0.0206, "step": 2774 }, { "epoch": 2.8846153846153846, "grad_norm": 5.012516498565674, "learning_rate": 4.7031647031647035e-06, "loss": 0.2701, "step": 2775 }, { "epoch": 2.8856548856548856, "grad_norm": 2.223706007003784, "learning_rate": 4.700854700854701e-06, "loss": 0.0858, "step": 2776 }, { "epoch": 2.8866943866943866, "grad_norm": 2.592135190963745, "learning_rate": 4.698544698544699e-06, "loss": 0.073, "step": 2777 }, { "epoch": 
2.8877338877338876, "grad_norm": 11.445849418640137, "learning_rate": 4.696234696234696e-06, "loss": 1.3401, "step": 2778 }, { "epoch": 2.8887733887733886, "grad_norm": 1.9957225322723389, "learning_rate": 4.693924693924694e-06, "loss": 0.0379, "step": 2779 }, { "epoch": 2.88981288981289, "grad_norm": 0.9727466702461243, "learning_rate": 4.691614691614692e-06, "loss": 0.0179, "step": 2780 }, { "epoch": 2.890852390852391, "grad_norm": 2.8076136112213135, "learning_rate": 4.689304689304689e-06, "loss": 0.0701, "step": 2781 }, { "epoch": 2.891891891891892, "grad_norm": 0.028624217957258224, "learning_rate": 4.6869946869946875e-06, "loss": 0.0006, "step": 2782 }, { "epoch": 2.892931392931393, "grad_norm": 2.858949899673462, "learning_rate": 4.684684684684685e-06, "loss": 0.1371, "step": 2783 }, { "epoch": 2.893970893970894, "grad_norm": 1.272584319114685, "learning_rate": 4.682374682374683e-06, "loss": 0.0191, "step": 2784 }, { "epoch": 2.895010395010395, "grad_norm": 2.8619091510772705, "learning_rate": 4.68006468006468e-06, "loss": 0.1527, "step": 2785 }, { "epoch": 2.896049896049896, "grad_norm": 3.4362850189208984, "learning_rate": 4.6777546777546786e-06, "loss": 0.3499, "step": 2786 }, { "epoch": 2.897089397089397, "grad_norm": 5.900140762329102, "learning_rate": 4.675444675444676e-06, "loss": 0.2825, "step": 2787 }, { "epoch": 2.898128898128898, "grad_norm": 12.06615161895752, "learning_rate": 4.673134673134673e-06, "loss": 1.3305, "step": 2788 }, { "epoch": 2.899168399168399, "grad_norm": 8.633148193359375, "learning_rate": 4.6708246708246714e-06, "loss": 0.3222, "step": 2789 }, { "epoch": 2.9002079002079, "grad_norm": 7.318634986877441, "learning_rate": 4.668514668514669e-06, "loss": 0.2946, "step": 2790 }, { "epoch": 2.901247401247401, "grad_norm": 0.35540270805358887, "learning_rate": 4.666204666204667e-06, "loss": 0.0078, "step": 2791 }, { "epoch": 2.9022869022869022, "grad_norm": 0.6462912559509277, "learning_rate": 4.663894663894664e-06, "loss": 0.0091, 
"step": 2792 }, { "epoch": 2.9033264033264032, "grad_norm": 0.5021451711654663, "learning_rate": 4.6615846615846625e-06, "loss": 0.0135, "step": 2793 }, { "epoch": 2.9043659043659042, "grad_norm": 0.36885151267051697, "learning_rate": 4.65927465927466e-06, "loss": 0.0108, "step": 2794 }, { "epoch": 2.9054054054054053, "grad_norm": 5.507650375366211, "learning_rate": 4.656964656964657e-06, "loss": 0.329, "step": 2795 }, { "epoch": 2.9064449064449063, "grad_norm": 0.02889976091682911, "learning_rate": 4.654654654654655e-06, "loss": 0.0008, "step": 2796 }, { "epoch": 2.9074844074844073, "grad_norm": 4.460318565368652, "learning_rate": 4.652344652344653e-06, "loss": 0.2253, "step": 2797 }, { "epoch": 2.9085239085239083, "grad_norm": 0.10087776929140091, "learning_rate": 4.65003465003465e-06, "loss": 0.0025, "step": 2798 }, { "epoch": 2.9095634095634093, "grad_norm": 0.39357542991638184, "learning_rate": 4.647724647724648e-06, "loss": 0.0094, "step": 2799 }, { "epoch": 2.9106029106029108, "grad_norm": 0.20674152672290802, "learning_rate": 4.645414645414646e-06, "loss": 0.0045, "step": 2800 }, { "epoch": 2.9116424116424118, "grad_norm": 1.4034574031829834, "learning_rate": 4.643104643104644e-06, "loss": 0.0323, "step": 2801 }, { "epoch": 2.912681912681913, "grad_norm": 7.4032416343688965, "learning_rate": 4.640794640794641e-06, "loss": 0.5195, "step": 2802 }, { "epoch": 2.913721413721414, "grad_norm": 3.127525806427002, "learning_rate": 4.6384846384846385e-06, "loss": 0.0481, "step": 2803 }, { "epoch": 2.914760914760915, "grad_norm": 1.7245556116104126, "learning_rate": 4.636174636174637e-06, "loss": 0.0282, "step": 2804 }, { "epoch": 2.915800415800416, "grad_norm": 9.243249893188477, "learning_rate": 4.633864633864634e-06, "loss": 0.3518, "step": 2805 }, { "epoch": 2.916839916839917, "grad_norm": 2.03916597366333, "learning_rate": 4.631554631554631e-06, "loss": 0.0527, "step": 2806 }, { "epoch": 2.917879417879418, "grad_norm": 7.111269474029541, "learning_rate": 
4.6292446292446295e-06, "loss": 0.2616, "step": 2807 }, { "epoch": 2.918918918918919, "grad_norm": 3.627591848373413, "learning_rate": 4.626934626934627e-06, "loss": 0.088, "step": 2808 }, { "epoch": 2.91995841995842, "grad_norm": 2.4315571784973145, "learning_rate": 4.624624624624625e-06, "loss": 0.0711, "step": 2809 }, { "epoch": 2.920997920997921, "grad_norm": 7.548877716064453, "learning_rate": 4.6223146223146224e-06, "loss": 0.6083, "step": 2810 }, { "epoch": 2.922037422037422, "grad_norm": 13.10161304473877, "learning_rate": 4.620004620004621e-06, "loss": 1.686, "step": 2811 }, { "epoch": 2.9230769230769234, "grad_norm": 1.7761461734771729, "learning_rate": 4.617694617694618e-06, "loss": 0.036, "step": 2812 }, { "epoch": 2.9241164241164244, "grad_norm": 0.5098281502723694, "learning_rate": 4.615384615384616e-06, "loss": 0.0071, "step": 2813 }, { "epoch": 2.9251559251559254, "grad_norm": 337.7049255371094, "learning_rate": 4.6130746130746135e-06, "loss": 0.4585, "step": 2814 }, { "epoch": 2.9261954261954264, "grad_norm": 4.641257286071777, "learning_rate": 4.610764610764611e-06, "loss": 0.1252, "step": 2815 }, { "epoch": 2.9272349272349274, "grad_norm": 2.1013717651367188, "learning_rate": 4.608454608454609e-06, "loss": 0.0614, "step": 2816 }, { "epoch": 2.9282744282744284, "grad_norm": 2.736609697341919, "learning_rate": 4.606144606144606e-06, "loss": 0.0907, "step": 2817 }, { "epoch": 2.9293139293139294, "grad_norm": 8.260156631469727, "learning_rate": 4.6038346038346046e-06, "loss": 0.4214, "step": 2818 }, { "epoch": 2.9303534303534304, "grad_norm": 0.2285541146993637, "learning_rate": 4.601524601524602e-06, "loss": 0.0052, "step": 2819 }, { "epoch": 2.9313929313929314, "grad_norm": 2.672311782836914, "learning_rate": 4.5992145992146e-06, "loss": 0.0733, "step": 2820 }, { "epoch": 2.9324324324324325, "grad_norm": 3.377425193786621, "learning_rate": 4.5969045969045974e-06, "loss": 0.0867, "step": 2821 }, { "epoch": 2.9334719334719335, "grad_norm": 
9.351137161254883, "learning_rate": 4.594594594594596e-06, "loss": 0.7431, "step": 2822 }, { "epoch": 2.9345114345114345, "grad_norm": 0.11065870523452759, "learning_rate": 4.592284592284593e-06, "loss": 0.002, "step": 2823 }, { "epoch": 2.9355509355509355, "grad_norm": 0.9388895630836487, "learning_rate": 4.58997458997459e-06, "loss": 0.0377, "step": 2824 }, { "epoch": 2.9365904365904365, "grad_norm": 1.9601761102676392, "learning_rate": 4.5876645876645885e-06, "loss": 0.0892, "step": 2825 }, { "epoch": 2.9376299376299375, "grad_norm": 5.889634132385254, "learning_rate": 4.585354585354586e-06, "loss": 0.3781, "step": 2826 }, { "epoch": 2.9386694386694385, "grad_norm": 5.7516770362854, "learning_rate": 4.583044583044583e-06, "loss": 0.4462, "step": 2827 }, { "epoch": 2.9397089397089395, "grad_norm": 2.089272975921631, "learning_rate": 4.580734580734581e-06, "loss": 0.0607, "step": 2828 }, { "epoch": 2.9407484407484406, "grad_norm": 10.779491424560547, "learning_rate": 4.578424578424579e-06, "loss": 0.3636, "step": 2829 }, { "epoch": 2.9417879417879416, "grad_norm": 2.810117721557617, "learning_rate": 4.576114576114576e-06, "loss": 0.0683, "step": 2830 }, { "epoch": 2.9428274428274426, "grad_norm": 0.02683236077427864, "learning_rate": 4.573804573804574e-06, "loss": 0.0007, "step": 2831 }, { "epoch": 2.943866943866944, "grad_norm": 0.10576693713665009, "learning_rate": 4.571494571494572e-06, "loss": 0.0034, "step": 2832 }, { "epoch": 2.944906444906445, "grad_norm": 1.2334619760513306, "learning_rate": 4.569184569184569e-06, "loss": 0.0306, "step": 2833 }, { "epoch": 2.945945945945946, "grad_norm": 8.295530319213867, "learning_rate": 4.566874566874567e-06, "loss": 0.359, "step": 2834 }, { "epoch": 2.946985446985447, "grad_norm": 3.9480338096618652, "learning_rate": 4.5645645645645645e-06, "loss": 0.0801, "step": 2835 }, { "epoch": 2.948024948024948, "grad_norm": 2.964906692504883, "learning_rate": 4.562254562254563e-06, "loss": 0.0966, "step": 2836 }, { "epoch": 
2.949064449064449, "grad_norm": 9.53171443939209, "learning_rate": 4.55994455994456e-06, "loss": 0.694, "step": 2837 }, { "epoch": 2.95010395010395, "grad_norm": 0.06429767608642578, "learning_rate": 4.557634557634558e-06, "loss": 0.0014, "step": 2838 }, { "epoch": 2.951143451143451, "grad_norm": 1.9759715795516968, "learning_rate": 4.5553245553245556e-06, "loss": 0.0314, "step": 2839 }, { "epoch": 2.952182952182952, "grad_norm": 5.269041538238525, "learning_rate": 4.553014553014554e-06, "loss": 0.1935, "step": 2840 }, { "epoch": 2.953222453222453, "grad_norm": 5.443349361419678, "learning_rate": 4.550704550704551e-06, "loss": 0.1412, "step": 2841 }, { "epoch": 2.954261954261954, "grad_norm": 11.945755004882812, "learning_rate": 4.5483945483945484e-06, "loss": 1.0154, "step": 2842 }, { "epoch": 2.955301455301455, "grad_norm": 2.846902847290039, "learning_rate": 4.546084546084547e-06, "loss": 0.0632, "step": 2843 }, { "epoch": 2.9563409563409566, "grad_norm": 10.10546588897705, "learning_rate": 4.543774543774544e-06, "loss": 0.4177, "step": 2844 }, { "epoch": 2.9573804573804576, "grad_norm": 0.19464100897312164, "learning_rate": 4.541464541464542e-06, "loss": 0.0054, "step": 2845 }, { "epoch": 2.9584199584199586, "grad_norm": 7.272435665130615, "learning_rate": 4.5391545391545395e-06, "loss": 0.3266, "step": 2846 }, { "epoch": 2.9594594594594597, "grad_norm": 0.25439026951789856, "learning_rate": 4.536844536844538e-06, "loss": 0.0033, "step": 2847 }, { "epoch": 2.9604989604989607, "grad_norm": 0.2873782217502594, "learning_rate": 4.534534534534535e-06, "loss": 0.0045, "step": 2848 }, { "epoch": 2.9615384615384617, "grad_norm": 0.18630096316337585, "learning_rate": 4.532224532224533e-06, "loss": 0.0037, "step": 2849 }, { "epoch": 2.9625779625779627, "grad_norm": 1.1513333320617676, "learning_rate": 4.5299145299145306e-06, "loss": 0.0216, "step": 2850 }, { "epoch": 2.9636174636174637, "grad_norm": 6.185680866241455, "learning_rate": 4.527604527604528e-06, "loss": 
0.2914, "step": 2851 }, { "epoch": 2.9646569646569647, "grad_norm": 2.607320785522461, "learning_rate": 4.525294525294526e-06, "loss": 0.2749, "step": 2852 }, { "epoch": 2.9656964656964657, "grad_norm": 15.155496597290039, "learning_rate": 4.5229845229845235e-06, "loss": 1.5991, "step": 2853 }, { "epoch": 2.9667359667359667, "grad_norm": 1.463409185409546, "learning_rate": 4.520674520674521e-06, "loss": 0.0406, "step": 2854 }, { "epoch": 2.9677754677754677, "grad_norm": 1.1975452899932861, "learning_rate": 4.518364518364519e-06, "loss": 0.0287, "step": 2855 }, { "epoch": 2.9688149688149688, "grad_norm": 0.2848760187625885, "learning_rate": 4.516054516054516e-06, "loss": 0.0038, "step": 2856 }, { "epoch": 2.9698544698544698, "grad_norm": 14.927924156188965, "learning_rate": 4.513744513744514e-06, "loss": 1.2295, "step": 2857 }, { "epoch": 2.970893970893971, "grad_norm": 5.528604507446289, "learning_rate": 4.511434511434512e-06, "loss": 0.2973, "step": 2858 }, { "epoch": 2.971933471933472, "grad_norm": 5.287175178527832, "learning_rate": 4.509124509124509e-06, "loss": 0.1407, "step": 2859 }, { "epoch": 2.972972972972973, "grad_norm": 2.9800124168395996, "learning_rate": 4.5068145068145066e-06, "loss": 0.0748, "step": 2860 }, { "epoch": 2.974012474012474, "grad_norm": 3.8337507247924805, "learning_rate": 4.504504504504505e-06, "loss": 0.1819, "step": 2861 }, { "epoch": 2.975051975051975, "grad_norm": 8.198847770690918, "learning_rate": 4.502194502194502e-06, "loss": 0.2563, "step": 2862 }, { "epoch": 2.976091476091476, "grad_norm": 4.668789863586426, "learning_rate": 4.4998844998845e-06, "loss": 0.1607, "step": 2863 }, { "epoch": 2.9771309771309773, "grad_norm": 3.7583305835723877, "learning_rate": 4.497574497574498e-06, "loss": 0.1952, "step": 2864 }, { "epoch": 2.9781704781704783, "grad_norm": 1.488911747932434, "learning_rate": 4.495264495264496e-06, "loss": 0.0382, "step": 2865 }, { "epoch": 2.9792099792099793, "grad_norm": 0.9805212020874023, "learning_rate": 
4.492954492954493e-06, "loss": 0.2726, "step": 2866 }, { "epoch": 2.9802494802494803, "grad_norm": 0.03673991560935974, "learning_rate": 4.490644490644491e-06, "loss": 0.0005, "step": 2867 }, { "epoch": 2.9812889812889813, "grad_norm": 1.645326018333435, "learning_rate": 4.488334488334489e-06, "loss": 0.0279, "step": 2868 }, { "epoch": 2.9823284823284824, "grad_norm": 2.383275032043457, "learning_rate": 4.486024486024486e-06, "loss": 0.117, "step": 2869 }, { "epoch": 2.9833679833679834, "grad_norm": 1.2870901823043823, "learning_rate": 4.483714483714484e-06, "loss": 0.0229, "step": 2870 }, { "epoch": 2.9844074844074844, "grad_norm": 0.06789594888687134, "learning_rate": 4.4814044814044816e-06, "loss": 0.0017, "step": 2871 }, { "epoch": 2.9854469854469854, "grad_norm": 0.3903137445449829, "learning_rate": 4.47909447909448e-06, "loss": 0.0081, "step": 2872 }, { "epoch": 2.9864864864864864, "grad_norm": 9.606902122497559, "learning_rate": 4.476784476784477e-06, "loss": 1.0147, "step": 2873 }, { "epoch": 2.9875259875259874, "grad_norm": 4.721467971801758, "learning_rate": 4.474474474474475e-06, "loss": 0.0904, "step": 2874 }, { "epoch": 2.9885654885654884, "grad_norm": 0.037113163620233536, "learning_rate": 4.472164472164473e-06, "loss": 0.0009, "step": 2875 }, { "epoch": 2.98960498960499, "grad_norm": 3.5482118129730225, "learning_rate": 4.469854469854471e-06, "loss": 0.1736, "step": 2876 }, { "epoch": 2.990644490644491, "grad_norm": 1.804797887802124, "learning_rate": 4.467544467544468e-06, "loss": 0.0349, "step": 2877 }, { "epoch": 2.991683991683992, "grad_norm": 0.5807712078094482, "learning_rate": 4.4652344652344655e-06, "loss": 0.0149, "step": 2878 }, { "epoch": 2.992723492723493, "grad_norm": 8.496133804321289, "learning_rate": 4.462924462924464e-06, "loss": 0.2932, "step": 2879 }, { "epoch": 2.993762993762994, "grad_norm": 4.594316482543945, "learning_rate": 4.460614460614461e-06, "loss": 0.1771, "step": 2880 }, { "epoch": 2.994802494802495, "grad_norm": 
1.7351125478744507, "learning_rate": 4.458304458304458e-06, "loss": 0.0422, "step": 2881 }, { "epoch": 2.995841995841996, "grad_norm": 14.885557174682617, "learning_rate": 4.455994455994457e-06, "loss": 1.0164, "step": 2882 }, { "epoch": 2.996881496881497, "grad_norm": 0.8425683975219727, "learning_rate": 4.453684453684454e-06, "loss": 0.0199, "step": 2883 }, { "epoch": 2.997920997920998, "grad_norm": 0.0038593353237956762, "learning_rate": 4.451374451374451e-06, "loss": 0.0001, "step": 2884 }, { "epoch": 2.998960498960499, "grad_norm": 4.419792175292969, "learning_rate": 4.4490644490644495e-06, "loss": 0.1397, "step": 2885 }, { "epoch": 3.0, "grad_norm": 1.9082928895950317, "learning_rate": 4.446754446754447e-06, "loss": 0.15, "step": 2886 }, { "epoch": 3.001039501039501, "grad_norm": 0.09658217430114746, "learning_rate": 4.444444444444444e-06, "loss": 0.0024, "step": 2887 }, { "epoch": 3.002079002079002, "grad_norm": 0.6714063882827759, "learning_rate": 4.442134442134442e-06, "loss": 0.0136, "step": 2888 }, { "epoch": 3.003118503118503, "grad_norm": 0.06490445137023926, "learning_rate": 4.43982443982444e-06, "loss": 0.0019, "step": 2889 }, { "epoch": 3.004158004158004, "grad_norm": 3.7229204177856445, "learning_rate": 4.437514437514438e-06, "loss": 0.256, "step": 2890 }, { "epoch": 3.005197505197505, "grad_norm": 1.2547167539596558, "learning_rate": 4.435204435204435e-06, "loss": 0.0268, "step": 2891 }, { "epoch": 3.006237006237006, "grad_norm": 0.8360212445259094, "learning_rate": 4.432894432894433e-06, "loss": 0.0228, "step": 2892 }, { "epoch": 3.007276507276507, "grad_norm": 9.071552276611328, "learning_rate": 4.430584430584431e-06, "loss": 0.2006, "step": 2893 }, { "epoch": 3.008316008316008, "grad_norm": 3.2643280029296875, "learning_rate": 4.428274428274429e-06, "loss": 0.1102, "step": 2894 }, { "epoch": 3.0093555093555096, "grad_norm": 8.89312744140625, "learning_rate": 4.425964425964426e-06, "loss": 0.9518, "step": 2895 }, { "epoch": 3.0103950103950106, 
"grad_norm": 1.1194871664047241, "learning_rate": 4.423654423654424e-06, "loss": 0.0156, "step": 2896 }, { "epoch": 3.0114345114345116, "grad_norm": 0.031017301604151726, "learning_rate": 4.421344421344422e-06, "loss": 0.0007, "step": 2897 }, { "epoch": 3.0124740124740126, "grad_norm": 9.319890022277832, "learning_rate": 4.419034419034419e-06, "loss": 0.2443, "step": 2898 }, { "epoch": 3.0135135135135136, "grad_norm": 0.7562645077705383, "learning_rate": 4.416724416724417e-06, "loss": 0.0138, "step": 2899 }, { "epoch": 3.0145530145530146, "grad_norm": 5.006690502166748, "learning_rate": 4.414414414414415e-06, "loss": 0.0781, "step": 2900 }, { "epoch": 3.0155925155925156, "grad_norm": 1.8787916898727417, "learning_rate": 4.412104412104413e-06, "loss": 0.0643, "step": 2901 }, { "epoch": 3.0166320166320166, "grad_norm": 7.709594249725342, "learning_rate": 4.40979440979441e-06, "loss": 0.5319, "step": 2902 }, { "epoch": 3.0176715176715176, "grad_norm": 8.625652313232422, "learning_rate": 4.4074844074844084e-06, "loss": 0.214, "step": 2903 }, { "epoch": 3.0187110187110187, "grad_norm": 8.986515045166016, "learning_rate": 4.405174405174406e-06, "loss": 0.4628, "step": 2904 }, { "epoch": 3.0197505197505197, "grad_norm": 6.721210956573486, "learning_rate": 4.402864402864403e-06, "loss": 0.269, "step": 2905 }, { "epoch": 3.0207900207900207, "grad_norm": 9.072442054748535, "learning_rate": 4.400554400554401e-06, "loss": 0.3887, "step": 2906 }, { "epoch": 3.0218295218295217, "grad_norm": 0.03076879307627678, "learning_rate": 4.398244398244399e-06, "loss": 0.0007, "step": 2907 }, { "epoch": 3.0228690228690227, "grad_norm": 1.009333848953247, "learning_rate": 4.395934395934397e-06, "loss": 0.2635, "step": 2908 }, { "epoch": 3.0239085239085237, "grad_norm": 2.1226351261138916, "learning_rate": 4.393624393624394e-06, "loss": 0.0672, "step": 2909 }, { "epoch": 3.024948024948025, "grad_norm": 8.85048770904541, "learning_rate": 4.3913143913143915e-06, "loss": 0.7063, "step": 2910 }, 
{ "epoch": 3.025987525987526, "grad_norm": 2.2179903984069824, "learning_rate": 4.38900438900439e-06, "loss": 0.1219, "step": 2911 }, { "epoch": 3.027027027027027, "grad_norm": 2.5947070121765137, "learning_rate": 4.386694386694387e-06, "loss": 0.0804, "step": 2912 }, { "epoch": 3.028066528066528, "grad_norm": 6.031675815582275, "learning_rate": 4.384384384384384e-06, "loss": 0.1997, "step": 2913 }, { "epoch": 3.029106029106029, "grad_norm": 1.8736892938613892, "learning_rate": 4.382074382074383e-06, "loss": 0.3105, "step": 2914 }, { "epoch": 3.0301455301455302, "grad_norm": 0.2389967292547226, "learning_rate": 4.37976437976438e-06, "loss": 0.0042, "step": 2915 }, { "epoch": 3.0311850311850312, "grad_norm": 0.41082292795181274, "learning_rate": 4.377454377454377e-06, "loss": 0.008, "step": 2916 }, { "epoch": 3.0322245322245323, "grad_norm": 5.320422172546387, "learning_rate": 4.3751443751443755e-06, "loss": 0.1933, "step": 2917 }, { "epoch": 3.0332640332640333, "grad_norm": 3.031292676925659, "learning_rate": 4.372834372834373e-06, "loss": 0.0862, "step": 2918 }, { "epoch": 3.0343035343035343, "grad_norm": 4.29350471496582, "learning_rate": 4.370524370524371e-06, "loss": 0.3082, "step": 2919 }, { "epoch": 3.0353430353430353, "grad_norm": 9.461339950561523, "learning_rate": 4.368214368214368e-06, "loss": 0.4198, "step": 2920 }, { "epoch": 3.0363825363825363, "grad_norm": 2.768932580947876, "learning_rate": 4.3659043659043665e-06, "loss": 0.0551, "step": 2921 }, { "epoch": 3.0374220374220373, "grad_norm": 18.20948600769043, "learning_rate": 4.363594363594364e-06, "loss": 0.7916, "step": 2922 }, { "epoch": 3.0384615384615383, "grad_norm": 11.076471328735352, "learning_rate": 4.361284361284361e-06, "loss": 0.8374, "step": 2923 }, { "epoch": 3.0395010395010393, "grad_norm": 9.023805618286133, "learning_rate": 4.358974358974359e-06, "loss": 1.1295, "step": 2924 }, { "epoch": 3.0405405405405403, "grad_norm": 8.089066505432129, "learning_rate": 4.356664356664357e-06, 
"loss": 0.4043, "step": 2925 }, { "epoch": 3.0415800415800414, "grad_norm": 7.827662944793701, "learning_rate": 4.354354354354355e-06, "loss": 0.1605, "step": 2926 }, { "epoch": 3.042619542619543, "grad_norm": 0.02499527856707573, "learning_rate": 4.352044352044352e-06, "loss": 0.0007, "step": 2927 }, { "epoch": 3.043659043659044, "grad_norm": 8.047962188720703, "learning_rate": 4.3497343497343505e-06, "loss": 0.6512, "step": 2928 }, { "epoch": 3.044698544698545, "grad_norm": 0.04987876117229462, "learning_rate": 4.347424347424348e-06, "loss": 0.0012, "step": 2929 }, { "epoch": 3.045738045738046, "grad_norm": 6.60944938659668, "learning_rate": 4.345114345114346e-06, "loss": 0.0833, "step": 2930 }, { "epoch": 3.046777546777547, "grad_norm": 0.04554561525583267, "learning_rate": 4.342804342804343e-06, "loss": 0.0008, "step": 2931 }, { "epoch": 3.047817047817048, "grad_norm": 5.888713836669922, "learning_rate": 4.340494340494341e-06, "loss": 0.2992, "step": 2932 }, { "epoch": 3.048856548856549, "grad_norm": 2.1652138233184814, "learning_rate": 4.338184338184339e-06, "loss": 0.0709, "step": 2933 }, { "epoch": 3.04989604989605, "grad_norm": 4.219902515411377, "learning_rate": 4.335874335874336e-06, "loss": 0.1107, "step": 2934 }, { "epoch": 3.050935550935551, "grad_norm": 0.6310142874717712, "learning_rate": 4.3335643335643344e-06, "loss": 0.0095, "step": 2935 }, { "epoch": 3.051975051975052, "grad_norm": 0.9771106839179993, "learning_rate": 4.331254331254332e-06, "loss": 0.0239, "step": 2936 }, { "epoch": 3.053014553014553, "grad_norm": 8.032059669494629, "learning_rate": 4.328944328944329e-06, "loss": 0.3691, "step": 2937 }, { "epoch": 3.054054054054054, "grad_norm": 16.941720962524414, "learning_rate": 4.326634326634327e-06, "loss": 2.0285, "step": 2938 }, { "epoch": 3.055093555093555, "grad_norm": 1.6030668020248413, "learning_rate": 4.324324324324325e-06, "loss": 0.0434, "step": 2939 }, { "epoch": 3.056133056133056, "grad_norm": 0.09329555928707123, 
"learning_rate": 4.322014322014322e-06, "loss": 0.0016, "step": 2940 }, { "epoch": 3.057172557172557, "grad_norm": 0.026322314515709877, "learning_rate": 4.31970431970432e-06, "loss": 0.0007, "step": 2941 }, { "epoch": 3.0582120582120584, "grad_norm": 1.3071532249450684, "learning_rate": 4.3173943173943175e-06, "loss": 0.0296, "step": 2942 }, { "epoch": 3.0592515592515594, "grad_norm": 0.10186092555522919, "learning_rate": 4.315084315084315e-06, "loss": 0.0022, "step": 2943 }, { "epoch": 3.0602910602910605, "grad_norm": 3.7780628204345703, "learning_rate": 4.312774312774313e-06, "loss": 0.1064, "step": 2944 }, { "epoch": 3.0613305613305615, "grad_norm": 2.3046798706054688, "learning_rate": 4.31046431046431e-06, "loss": 0.2597, "step": 2945 }, { "epoch": 3.0623700623700625, "grad_norm": 5.836421966552734, "learning_rate": 4.308154308154309e-06, "loss": 0.3987, "step": 2946 }, { "epoch": 3.0634095634095635, "grad_norm": 0.18187105655670166, "learning_rate": 4.305844305844306e-06, "loss": 0.0068, "step": 2947 }, { "epoch": 3.0644490644490645, "grad_norm": 0.03847048431634903, "learning_rate": 4.303534303534304e-06, "loss": 0.0011, "step": 2948 }, { "epoch": 3.0654885654885655, "grad_norm": 0.19393624365329742, "learning_rate": 4.3012243012243015e-06, "loss": 0.0052, "step": 2949 }, { "epoch": 3.0665280665280665, "grad_norm": 11.248908996582031, "learning_rate": 4.298914298914299e-06, "loss": 0.4773, "step": 2950 }, { "epoch": 3.0675675675675675, "grad_norm": 0.041529685258865356, "learning_rate": 4.296604296604297e-06, "loss": 0.0007, "step": 2951 }, { "epoch": 3.0686070686070686, "grad_norm": 17.413583755493164, "learning_rate": 4.294294294294294e-06, "loss": 0.7445, "step": 2952 }, { "epoch": 3.0696465696465696, "grad_norm": 0.8456288576126099, "learning_rate": 4.2919842919842925e-06, "loss": 0.0247, "step": 2953 }, { "epoch": 3.0706860706860706, "grad_norm": 7.395755290985107, "learning_rate": 4.28967428967429e-06, "loss": 0.3355, "step": 2954 }, { "epoch": 
3.0717255717255716, "grad_norm": 6.521220684051514, "learning_rate": 4.287364287364288e-06, "loss": 0.2584, "step": 2955 }, { "epoch": 3.0727650727650726, "grad_norm": 0.22646839916706085, "learning_rate": 4.2850542850542854e-06, "loss": 0.002, "step": 2956 }, { "epoch": 3.0738045738045736, "grad_norm": 17.819766998291016, "learning_rate": 4.282744282744284e-06, "loss": 0.9187, "step": 2957 }, { "epoch": 3.0748440748440746, "grad_norm": 4.8948564529418945, "learning_rate": 4.280434280434281e-06, "loss": 0.1083, "step": 2958 }, { "epoch": 3.075883575883576, "grad_norm": 5.55280065536499, "learning_rate": 4.278124278124278e-06, "loss": 0.1584, "step": 2959 }, { "epoch": 3.076923076923077, "grad_norm": 9.615129470825195, "learning_rate": 4.2758142758142765e-06, "loss": 1.0435, "step": 2960 }, { "epoch": 3.077962577962578, "grad_norm": 5.408070087432861, "learning_rate": 4.273504273504274e-06, "loss": 0.1181, "step": 2961 }, { "epoch": 3.079002079002079, "grad_norm": 16.333675384521484, "learning_rate": 4.271194271194272e-06, "loss": 0.9409, "step": 2962 }, { "epoch": 3.08004158004158, "grad_norm": 0.8769080638885498, "learning_rate": 4.268884268884269e-06, "loss": 0.0207, "step": 2963 }, { "epoch": 3.081081081081081, "grad_norm": 4.57643985748291, "learning_rate": 4.266574266574267e-06, "loss": 0.1731, "step": 2964 }, { "epoch": 3.082120582120582, "grad_norm": 5.595919132232666, "learning_rate": 4.264264264264265e-06, "loss": 0.5219, "step": 2965 }, { "epoch": 3.083160083160083, "grad_norm": 0.01822960004210472, "learning_rate": 4.261954261954262e-06, "loss": 0.0006, "step": 2966 }, { "epoch": 3.084199584199584, "grad_norm": 4.2607102394104, "learning_rate": 4.25964425964426e-06, "loss": 0.2796, "step": 2967 }, { "epoch": 3.085239085239085, "grad_norm": 0.007498827762901783, "learning_rate": 4.257334257334258e-06, "loss": 0.0002, "step": 2968 }, { "epoch": 3.086278586278586, "grad_norm": 1.3731168508529663, "learning_rate": 4.255024255024255e-06, "loss": 0.0231, 
"step": 2969 }, { "epoch": 3.087318087318087, "grad_norm": 1.0861774682998657, "learning_rate": 4.2527142527142525e-06, "loss": 0.0377, "step": 2970 }, { "epoch": 3.0883575883575882, "grad_norm": 0.18928325176239014, "learning_rate": 4.250404250404251e-06, "loss": 0.005, "step": 2971 }, { "epoch": 3.0893970893970892, "grad_norm": 11.383058547973633, "learning_rate": 4.248094248094248e-06, "loss": 0.6004, "step": 2972 }, { "epoch": 3.0904365904365902, "grad_norm": 1.9454607963562012, "learning_rate": 4.245784245784246e-06, "loss": 0.0708, "step": 2973 }, { "epoch": 3.0914760914760917, "grad_norm": 7.079811096191406, "learning_rate": 4.2434742434742435e-06, "loss": 0.1478, "step": 2974 }, { "epoch": 3.0925155925155927, "grad_norm": 0.37563154101371765, "learning_rate": 4.241164241164242e-06, "loss": 0.0132, "step": 2975 }, { "epoch": 3.0935550935550937, "grad_norm": 10.523004531860352, "learning_rate": 4.238854238854239e-06, "loss": 0.8742, "step": 2976 }, { "epoch": 3.0945945945945947, "grad_norm": 2.2287039756774902, "learning_rate": 4.236544236544236e-06, "loss": 0.0512, "step": 2977 }, { "epoch": 3.0956340956340958, "grad_norm": 4.583277225494385, "learning_rate": 4.234234234234235e-06, "loss": 0.2034, "step": 2978 }, { "epoch": 3.0966735966735968, "grad_norm": 0.5153438448905945, "learning_rate": 4.231924231924232e-06, "loss": 0.0113, "step": 2979 }, { "epoch": 3.0977130977130978, "grad_norm": 12.180269241333008, "learning_rate": 4.22961422961423e-06, "loss": 0.6624, "step": 2980 }, { "epoch": 3.098752598752599, "grad_norm": 0.4038905203342438, "learning_rate": 4.2273042273042275e-06, "loss": 0.0079, "step": 2981 }, { "epoch": 3.0997920997921, "grad_norm": 0.38581734895706177, "learning_rate": 4.224994224994226e-06, "loss": 0.0049, "step": 2982 }, { "epoch": 3.100831600831601, "grad_norm": 6.3126420974731445, "learning_rate": 4.222684222684223e-06, "loss": 0.3044, "step": 2983 }, { "epoch": 3.101871101871102, "grad_norm": 9.254770278930664, "learning_rate": 
4.220374220374221e-06, "loss": 0.6881, "step": 2984 }, { "epoch": 3.102910602910603, "grad_norm": 0.7424444556236267, "learning_rate": 4.2180642180642186e-06, "loss": 0.0077, "step": 2985 }, { "epoch": 3.103950103950104, "grad_norm": 5.989633083343506, "learning_rate": 4.215754215754216e-06, "loss": 0.1914, "step": 2986 }, { "epoch": 3.104989604989605, "grad_norm": 4.006546974182129, "learning_rate": 4.213444213444214e-06, "loss": 0.0936, "step": 2987 }, { "epoch": 3.106029106029106, "grad_norm": 0.39120224118232727, "learning_rate": 4.2111342111342114e-06, "loss": 0.0091, "step": 2988 }, { "epoch": 3.107068607068607, "grad_norm": 7.164828300476074, "learning_rate": 4.20882420882421e-06, "loss": 0.3958, "step": 2989 }, { "epoch": 3.108108108108108, "grad_norm": 1.572396159172058, "learning_rate": 4.206514206514207e-06, "loss": 0.0273, "step": 2990 }, { "epoch": 3.1091476091476093, "grad_norm": 0.8376824259757996, "learning_rate": 4.204204204204204e-06, "loss": 0.2671, "step": 2991 }, { "epoch": 3.1101871101871104, "grad_norm": 8.45645523071289, "learning_rate": 4.2018942018942025e-06, "loss": 0.5827, "step": 2992 }, { "epoch": 3.1112266112266114, "grad_norm": 6.415794372558594, "learning_rate": 4.1995841995842e-06, "loss": 0.2154, "step": 2993 }, { "epoch": 3.1122661122661124, "grad_norm": 2.558643341064453, "learning_rate": 4.197274197274197e-06, "loss": 0.0716, "step": 2994 }, { "epoch": 3.1133056133056134, "grad_norm": 0.21552839875221252, "learning_rate": 4.194964194964195e-06, "loss": 0.0052, "step": 2995 }, { "epoch": 3.1143451143451144, "grad_norm": 8.271615028381348, "learning_rate": 4.192654192654193e-06, "loss": 0.2212, "step": 2996 }, { "epoch": 3.1153846153846154, "grad_norm": 5.949480056762695, "learning_rate": 4.19034419034419e-06, "loss": 0.3214, "step": 2997 }, { "epoch": 3.1164241164241164, "grad_norm": 4.731964588165283, "learning_rate": 4.188034188034188e-06, "loss": 0.1788, "step": 2998 }, { "epoch": 3.1174636174636174, "grad_norm": 
11.179065704345703, "learning_rate": 4.185724185724186e-06, "loss": 0.4589, "step": 2999 }, { "epoch": 3.1185031185031185, "grad_norm": 6.513021469116211, "learning_rate": 4.183414183414184e-06, "loss": 0.2555, "step": 3000 }, { "epoch": 3.1195426195426195, "grad_norm": 4.68575382232666, "learning_rate": 4.181104181104181e-06, "loss": 0.1547, "step": 3001 }, { "epoch": 3.1205821205821205, "grad_norm": 0.18110813200473785, "learning_rate": 4.178794178794179e-06, "loss": 0.0055, "step": 3002 }, { "epoch": 3.1216216216216215, "grad_norm": 8.774043083190918, "learning_rate": 4.176484176484177e-06, "loss": 0.3781, "step": 3003 }, { "epoch": 3.1226611226611225, "grad_norm": 0.7165976166725159, "learning_rate": 4.174174174174174e-06, "loss": 0.0105, "step": 3004 }, { "epoch": 3.1237006237006235, "grad_norm": 9.499658584594727, "learning_rate": 4.171864171864172e-06, "loss": 0.7248, "step": 3005 }, { "epoch": 3.124740124740125, "grad_norm": 12.022478103637695, "learning_rate": 4.1695541695541696e-06, "loss": 0.8418, "step": 3006 }, { "epoch": 3.125779625779626, "grad_norm": 0.8170308470726013, "learning_rate": 4.167244167244168e-06, "loss": 0.0184, "step": 3007 }, { "epoch": 3.126819126819127, "grad_norm": 10.470986366271973, "learning_rate": 4.164934164934165e-06, "loss": 0.7221, "step": 3008 }, { "epoch": 3.127858627858628, "grad_norm": 0.19149138033390045, "learning_rate": 4.162624162624163e-06, "loss": 0.0038, "step": 3009 }, { "epoch": 3.128898128898129, "grad_norm": 7.027655124664307, "learning_rate": 4.160314160314161e-06, "loss": 0.1312, "step": 3010 }, { "epoch": 3.12993762993763, "grad_norm": 1.6696555614471436, "learning_rate": 4.158004158004159e-06, "loss": 0.027, "step": 3011 }, { "epoch": 3.130977130977131, "grad_norm": 0.03533538803458214, "learning_rate": 4.155694155694156e-06, "loss": 0.0005, "step": 3012 }, { "epoch": 3.132016632016632, "grad_norm": 1.7083377838134766, "learning_rate": 4.1533841533841535e-06, "loss": 0.045, "step": 3013 }, { "epoch": 
3.133056133056133, "grad_norm": 0.1288733333349228, "learning_rate": 4.151074151074152e-06, "loss": 0.0049, "step": 3014 }, { "epoch": 3.134095634095634, "grad_norm": 6.5716233253479, "learning_rate": 4.148764148764149e-06, "loss": 0.3688, "step": 3015 }, { "epoch": 3.135135135135135, "grad_norm": 10.444952011108398, "learning_rate": 4.146454146454147e-06, "loss": 1.0296, "step": 3016 }, { "epoch": 3.136174636174636, "grad_norm": 0.9561419486999512, "learning_rate": 4.1441441441441446e-06, "loss": 0.0408, "step": 3017 }, { "epoch": 3.137214137214137, "grad_norm": 5.194184303283691, "learning_rate": 4.141834141834143e-06, "loss": 0.1648, "step": 3018 }, { "epoch": 3.138253638253638, "grad_norm": 3.6494336128234863, "learning_rate": 4.13952413952414e-06, "loss": 0.1334, "step": 3019 }, { "epoch": 3.139293139293139, "grad_norm": 3.497851848602295, "learning_rate": 4.1372141372141374e-06, "loss": 0.0748, "step": 3020 }, { "epoch": 3.14033264033264, "grad_norm": 9.248802185058594, "learning_rate": 4.134904134904136e-06, "loss": 0.342, "step": 3021 }, { "epoch": 3.141372141372141, "grad_norm": 6.574548721313477, "learning_rate": 4.132594132594133e-06, "loss": 0.24, "step": 3022 }, { "epoch": 3.1424116424116426, "grad_norm": 6.959147930145264, "learning_rate": 4.13028413028413e-06, "loss": 0.2583, "step": 3023 }, { "epoch": 3.1434511434511436, "grad_norm": 9.682708740234375, "learning_rate": 4.1279741279741285e-06, "loss": 0.4094, "step": 3024 }, { "epoch": 3.1444906444906446, "grad_norm": 0.3175424635410309, "learning_rate": 4.125664125664126e-06, "loss": 0.011, "step": 3025 }, { "epoch": 3.1455301455301456, "grad_norm": 3.7239432334899902, "learning_rate": 4.123354123354123e-06, "loss": 0.0823, "step": 3026 }, { "epoch": 3.1465696465696467, "grad_norm": 0.09776545315980911, "learning_rate": 4.121044121044121e-06, "loss": 0.0023, "step": 3027 }, { "epoch": 3.1476091476091477, "grad_norm": 8.206098556518555, "learning_rate": 4.118734118734119e-06, "loss": 0.2836, "step": 
3028 }, { "epoch": 3.1486486486486487, "grad_norm": 5.508115291595459, "learning_rate": 4.116424116424117e-06, "loss": 0.1995, "step": 3029 }, { "epoch": 3.1496881496881497, "grad_norm": 6.341372966766357, "learning_rate": 4.114114114114114e-06, "loss": 0.1132, "step": 3030 }, { "epoch": 3.1507276507276507, "grad_norm": 0.16759119927883148, "learning_rate": 4.111804111804112e-06, "loss": 0.0026, "step": 3031 }, { "epoch": 3.1517671517671517, "grad_norm": 0.1679067462682724, "learning_rate": 4.10949410949411e-06, "loss": 0.0039, "step": 3032 }, { "epoch": 3.1528066528066527, "grad_norm": 0.1461794376373291, "learning_rate": 4.107184107184107e-06, "loss": 0.0026, "step": 3033 }, { "epoch": 3.1538461538461537, "grad_norm": 8.840606689453125, "learning_rate": 4.104874104874105e-06, "loss": 0.3927, "step": 3034 }, { "epoch": 3.1548856548856548, "grad_norm": 9.067317008972168, "learning_rate": 4.102564102564103e-06, "loss": 0.6051, "step": 3035 }, { "epoch": 3.1559251559251558, "grad_norm": 0.08562859892845154, "learning_rate": 4.100254100254101e-06, "loss": 0.002, "step": 3036 }, { "epoch": 3.156964656964657, "grad_norm": 0.3136712908744812, "learning_rate": 4.097944097944098e-06, "loss": 0.0064, "step": 3037 }, { "epoch": 3.1580041580041582, "grad_norm": 5.612872123718262, "learning_rate": 4.095634095634096e-06, "loss": 0.2334, "step": 3038 }, { "epoch": 3.1590436590436592, "grad_norm": 1.436871886253357, "learning_rate": 4.093324093324094e-06, "loss": 0.0241, "step": 3039 }, { "epoch": 3.1600831600831603, "grad_norm": 5.758492469787598, "learning_rate": 4.091014091014091e-06, "loss": 0.1174, "step": 3040 }, { "epoch": 3.1611226611226613, "grad_norm": 1.3731693029403687, "learning_rate": 4.088704088704089e-06, "loss": 0.0247, "step": 3041 }, { "epoch": 3.1621621621621623, "grad_norm": 4.757385730743408, "learning_rate": 4.086394086394087e-06, "loss": 0.1087, "step": 3042 }, { "epoch": 3.1632016632016633, "grad_norm": 0.881007730960846, "learning_rate": 
4.084084084084085e-06, "loss": 0.02, "step": 3043 }, { "epoch": 3.1642411642411643, "grad_norm": 2.4619946479797363, "learning_rate": 4.081774081774082e-06, "loss": 0.1007, "step": 3044 }, { "epoch": 3.1652806652806653, "grad_norm": 0.38008901476860046, "learning_rate": 4.07946407946408e-06, "loss": 0.0046, "step": 3045 }, { "epoch": 3.1663201663201663, "grad_norm": 0.6488372683525085, "learning_rate": 4.077154077154078e-06, "loss": 0.0187, "step": 3046 }, { "epoch": 3.1673596673596673, "grad_norm": 3.645965576171875, "learning_rate": 4.074844074844075e-06, "loss": 0.1018, "step": 3047 }, { "epoch": 3.1683991683991684, "grad_norm": 0.9687808752059937, "learning_rate": 4.072534072534073e-06, "loss": 0.0168, "step": 3048 }, { "epoch": 3.1694386694386694, "grad_norm": 0.5096043348312378, "learning_rate": 4.0702240702240706e-06, "loss": 0.0081, "step": 3049 }, { "epoch": 3.1704781704781704, "grad_norm": 0.23202700912952423, "learning_rate": 4.067914067914068e-06, "loss": 0.0056, "step": 3050 }, { "epoch": 3.1715176715176714, "grad_norm": 3.7122135162353516, "learning_rate": 4.065604065604066e-06, "loss": 0.0737, "step": 3051 }, { "epoch": 3.1725571725571724, "grad_norm": 0.19520847499370575, "learning_rate": 4.0632940632940635e-06, "loss": 0.006, "step": 3052 }, { "epoch": 3.1735966735966734, "grad_norm": 2.914714813232422, "learning_rate": 4.060984060984061e-06, "loss": 0.0777, "step": 3053 }, { "epoch": 3.1746361746361744, "grad_norm": 2.344712018966675, "learning_rate": 4.058674058674059e-06, "loss": 0.0649, "step": 3054 }, { "epoch": 3.175675675675676, "grad_norm": 7.863626003265381, "learning_rate": 4.056364056364056e-06, "loss": 0.1842, "step": 3055 }, { "epoch": 3.176715176715177, "grad_norm": 8.394662857055664, "learning_rate": 4.0540540540540545e-06, "loss": 0.5036, "step": 3056 }, { "epoch": 3.177754677754678, "grad_norm": 4.316769599914551, "learning_rate": 4.051744051744052e-06, "loss": 0.0995, "step": 3057 }, { "epoch": 3.178794178794179, "grad_norm": 
4.495481967926025, "learning_rate": 4.049434049434049e-06, "loss": 0.1215, "step": 3058 }, { "epoch": 3.17983367983368, "grad_norm": 9.788843154907227, "learning_rate": 4.047124047124047e-06, "loss": 1.0258, "step": 3059 }, { "epoch": 3.180873180873181, "grad_norm": 9.788480758666992, "learning_rate": 4.044814044814045e-06, "loss": 0.7014, "step": 3060 }, { "epoch": 3.181912681912682, "grad_norm": 14.928915977478027, "learning_rate": 4.042504042504043e-06, "loss": 1.2652, "step": 3061 }, { "epoch": 3.182952182952183, "grad_norm": 13.048415184020996, "learning_rate": 4.04019404019404e-06, "loss": 0.3149, "step": 3062 }, { "epoch": 3.183991683991684, "grad_norm": 3.889265537261963, "learning_rate": 4.0378840378840385e-06, "loss": 0.0809, "step": 3063 }, { "epoch": 3.185031185031185, "grad_norm": 0.15134768187999725, "learning_rate": 4.035574035574036e-06, "loss": 0.0026, "step": 3064 }, { "epoch": 3.186070686070686, "grad_norm": 5.497805118560791, "learning_rate": 4.033264033264034e-06, "loss": 0.2563, "step": 3065 }, { "epoch": 3.187110187110187, "grad_norm": 6.498611927032471, "learning_rate": 4.030954030954031e-06, "loss": 0.3335, "step": 3066 }, { "epoch": 3.188149688149688, "grad_norm": 12.062654495239258, "learning_rate": 4.028644028644029e-06, "loss": 0.6318, "step": 3067 }, { "epoch": 3.189189189189189, "grad_norm": 3.560054302215576, "learning_rate": 4.026334026334027e-06, "loss": 0.0463, "step": 3068 }, { "epoch": 3.19022869022869, "grad_norm": 8.133339881896973, "learning_rate": 4.024024024024024e-06, "loss": 0.5378, "step": 3069 }, { "epoch": 3.1912681912681915, "grad_norm": 6.538513660430908, "learning_rate": 4.021714021714022e-06, "loss": 0.4576, "step": 3070 }, { "epoch": 3.1923076923076925, "grad_norm": 1.1551824808120728, "learning_rate": 4.01940401940402e-06, "loss": 0.0318, "step": 3071 }, { "epoch": 3.1933471933471935, "grad_norm": 1.8024814128875732, "learning_rate": 4.017094017094018e-06, "loss": 0.056, "step": 3072 }, { "epoch": 
3.1943866943866945, "grad_norm": 12.158453941345215, "learning_rate": 4.014784014784015e-06, "loss": 0.1438, "step": 3073 }, { "epoch": 3.1954261954261955, "grad_norm": 0.8606366515159607, "learning_rate": 4.012474012474013e-06, "loss": 0.0071, "step": 3074 }, { "epoch": 3.1964656964656966, "grad_norm": 5.564821720123291, "learning_rate": 4.010164010164011e-06, "loss": 0.1433, "step": 3075 }, { "epoch": 3.1975051975051976, "grad_norm": 4.401222229003906, "learning_rate": 4.007854007854008e-06, "loss": 0.1796, "step": 3076 }, { "epoch": 3.1985446985446986, "grad_norm": 0.0049988687969744205, "learning_rate": 4.0055440055440055e-06, "loss": 0.0001, "step": 3077 }, { "epoch": 3.1995841995841996, "grad_norm": 3.3277342319488525, "learning_rate": 4.003234003234004e-06, "loss": 0.0776, "step": 3078 }, { "epoch": 3.2006237006237006, "grad_norm": 5.566887855529785, "learning_rate": 4.000924000924001e-06, "loss": 0.1188, "step": 3079 }, { "epoch": 3.2016632016632016, "grad_norm": 7.9430928230285645, "learning_rate": 3.998613998613998e-06, "loss": 0.2335, "step": 3080 }, { "epoch": 3.2027027027027026, "grad_norm": 7.287680625915527, "learning_rate": 3.996303996303997e-06, "loss": 0.2504, "step": 3081 }, { "epoch": 3.2037422037422036, "grad_norm": 2.037623405456543, "learning_rate": 3.993993993993994e-06, "loss": 0.0641, "step": 3082 }, { "epoch": 3.2047817047817047, "grad_norm": 8.476522445678711, "learning_rate": 3.991683991683992e-06, "loss": 0.454, "step": 3083 }, { "epoch": 3.2058212058212057, "grad_norm": 11.712259292602539, "learning_rate": 3.9893739893739895e-06, "loss": 0.6441, "step": 3084 }, { "epoch": 3.2068607068607067, "grad_norm": 0.12107273191213608, "learning_rate": 3.987063987063987e-06, "loss": 0.0024, "step": 3085 }, { "epoch": 3.2079002079002077, "grad_norm": 0.0172722265124321, "learning_rate": 3.984753984753985e-06, "loss": 0.0003, "step": 3086 }, { "epoch": 3.208939708939709, "grad_norm": 7.512850284576416, "learning_rate": 3.982443982443982e-06, 
"loss": 0.2886, "step": 3087 }, { "epoch": 3.20997920997921, "grad_norm": 10.699891090393066, "learning_rate": 3.9801339801339805e-06, "loss": 1.306, "step": 3088 }, { "epoch": 3.211018711018711, "grad_norm": 3.024954080581665, "learning_rate": 3.977823977823978e-06, "loss": 0.2697, "step": 3089 }, { "epoch": 3.212058212058212, "grad_norm": 8.047675132751465, "learning_rate": 3.975513975513976e-06, "loss": 0.2475, "step": 3090 }, { "epoch": 3.213097713097713, "grad_norm": 0.10544872283935547, "learning_rate": 3.973203973203973e-06, "loss": 0.0018, "step": 3091 }, { "epoch": 3.214137214137214, "grad_norm": 0.11675427854061127, "learning_rate": 3.970893970893972e-06, "loss": 0.0015, "step": 3092 }, { "epoch": 3.215176715176715, "grad_norm": 13.126848220825195, "learning_rate": 3.968583968583969e-06, "loss": 0.4408, "step": 3093 }, { "epoch": 3.2162162162162162, "grad_norm": 12.223016738891602, "learning_rate": 3.966273966273966e-06, "loss": 0.0774, "step": 3094 }, { "epoch": 3.2172557172557172, "grad_norm": 0.5851736664772034, "learning_rate": 3.9639639639639645e-06, "loss": 0.0133, "step": 3095 }, { "epoch": 3.2182952182952183, "grad_norm": 4.598794937133789, "learning_rate": 3.961653961653962e-06, "loss": 0.0648, "step": 3096 }, { "epoch": 3.2193347193347193, "grad_norm": 4.362109184265137, "learning_rate": 3.95934395934396e-06, "loss": 0.2351, "step": 3097 }, { "epoch": 3.2203742203742203, "grad_norm": 15.790438652038574, "learning_rate": 3.957033957033957e-06, "loss": 0.2569, "step": 3098 }, { "epoch": 3.2214137214137213, "grad_norm": 9.33132266998291, "learning_rate": 3.9547239547239555e-06, "loss": 0.4077, "step": 3099 }, { "epoch": 3.2224532224532223, "grad_norm": 1.6285666227340698, "learning_rate": 3.952413952413953e-06, "loss": 0.0337, "step": 3100 }, { "epoch": 3.2234927234927233, "grad_norm": 1.7507997751235962, "learning_rate": 3.95010395010395e-06, "loss": 0.0333, "step": 3101 }, { "epoch": 3.2245322245322248, "grad_norm": 1.823251724243164, 
"learning_rate": 3.9477939477939484e-06, "loss": 0.0389, "step": 3102 }, { "epoch": 3.225571725571726, "grad_norm": 4.533589839935303, "learning_rate": 3.945483945483946e-06, "loss": 0.2476, "step": 3103 }, { "epoch": 3.226611226611227, "grad_norm": 0.8647873401641846, "learning_rate": 3.943173943173943e-06, "loss": 0.0198, "step": 3104 }, { "epoch": 3.227650727650728, "grad_norm": 9.659738540649414, "learning_rate": 3.940863940863941e-06, "loss": 0.5419, "step": 3105 }, { "epoch": 3.228690228690229, "grad_norm": 0.054894059896469116, "learning_rate": 3.938553938553939e-06, "loss": 0.0015, "step": 3106 }, { "epoch": 3.22972972972973, "grad_norm": 1.3090507984161377, "learning_rate": 3.936243936243936e-06, "loss": 0.0199, "step": 3107 }, { "epoch": 3.230769230769231, "grad_norm": 5.187557697296143, "learning_rate": 3.933933933933934e-06, "loss": 0.1225, "step": 3108 }, { "epoch": 3.231808731808732, "grad_norm": 0.2481028288602829, "learning_rate": 3.9316239316239315e-06, "loss": 0.0065, "step": 3109 }, { "epoch": 3.232848232848233, "grad_norm": 10.085977554321289, "learning_rate": 3.92931392931393e-06, "loss": 0.4734, "step": 3110 }, { "epoch": 3.233887733887734, "grad_norm": 0.2195979505777359, "learning_rate": 3.927003927003927e-06, "loss": 0.0066, "step": 3111 }, { "epoch": 3.234927234927235, "grad_norm": 0.9394448399543762, "learning_rate": 3.924693924693924e-06, "loss": 0.032, "step": 3112 }, { "epoch": 3.235966735966736, "grad_norm": 0.003855444025248289, "learning_rate": 3.922383922383923e-06, "loss": 0.0001, "step": 3113 }, { "epoch": 3.237006237006237, "grad_norm": 2.7753994464874268, "learning_rate": 3.92007392007392e-06, "loss": 0.0534, "step": 3114 }, { "epoch": 3.238045738045738, "grad_norm": 2.2066776752471924, "learning_rate": 3.917763917763918e-06, "loss": 0.06, "step": 3115 }, { "epoch": 3.239085239085239, "grad_norm": 3.437779426574707, "learning_rate": 3.9154539154539155e-06, "loss": 0.0524, "step": 3116 }, { "epoch": 3.24012474012474, 
"grad_norm": 0.572167694568634, "learning_rate": 3.913143913143914e-06, "loss": 0.013, "step": 3117 }, { "epoch": 3.241164241164241, "grad_norm": 8.824238777160645, "learning_rate": 3.910833910833911e-06, "loss": 0.4768, "step": 3118 }, { "epoch": 3.2422037422037424, "grad_norm": 7.6617584228515625, "learning_rate": 3.908523908523909e-06, "loss": 0.4057, "step": 3119 }, { "epoch": 3.2432432432432434, "grad_norm": 0.5249945521354675, "learning_rate": 3.9062139062139065e-06, "loss": 0.009, "step": 3120 }, { "epoch": 3.2442827442827444, "grad_norm": 1.2607581615447998, "learning_rate": 3.903903903903904e-06, "loss": 0.0261, "step": 3121 }, { "epoch": 3.2453222453222454, "grad_norm": 2.4366354942321777, "learning_rate": 3.901593901593902e-06, "loss": 0.0777, "step": 3122 }, { "epoch": 3.2463617463617465, "grad_norm": 10.035798072814941, "learning_rate": 3.899283899283899e-06, "loss": 0.4241, "step": 3123 }, { "epoch": 3.2474012474012475, "grad_norm": 6.8840413093566895, "learning_rate": 3.896973896973898e-06, "loss": 0.2189, "step": 3124 }, { "epoch": 3.2484407484407485, "grad_norm": 2.9253406524658203, "learning_rate": 3.894663894663895e-06, "loss": 0.0927, "step": 3125 }, { "epoch": 3.2494802494802495, "grad_norm": 3.3498616218566895, "learning_rate": 3.892353892353893e-06, "loss": 0.0548, "step": 3126 }, { "epoch": 3.2505197505197505, "grad_norm": 0.8399617671966553, "learning_rate": 3.8900438900438905e-06, "loss": 0.0177, "step": 3127 }, { "epoch": 3.2515592515592515, "grad_norm": 1.4739288091659546, "learning_rate": 3.887733887733889e-06, "loss": 0.0303, "step": 3128 }, { "epoch": 3.2525987525987525, "grad_norm": 7.085636615753174, "learning_rate": 3.885423885423886e-06, "loss": 0.4284, "step": 3129 }, { "epoch": 3.2536382536382535, "grad_norm": 4.9388933181762695, "learning_rate": 3.883113883113883e-06, "loss": 0.1027, "step": 3130 }, { "epoch": 3.2546777546777546, "grad_norm": 8.358454704284668, "learning_rate": 3.8808038808038816e-06, "loss": 0.8982, "step": 
3131 }, { "epoch": 3.2557172557172556, "grad_norm": 2.0921363830566406, "learning_rate": 3.878493878493879e-06, "loss": 0.0371, "step": 3132 }, { "epoch": 3.2567567567567566, "grad_norm": 1.1191800832748413, "learning_rate": 3.876183876183876e-06, "loss": 0.2502, "step": 3133 }, { "epoch": 3.257796257796258, "grad_norm": 1.5055001974105835, "learning_rate": 3.8738738738738744e-06, "loss": 0.0246, "step": 3134 }, { "epoch": 3.258835758835759, "grad_norm": 20.0693302154541, "learning_rate": 3.871563871563872e-06, "loss": 1.0522, "step": 3135 }, { "epoch": 3.25987525987526, "grad_norm": 0.03470122814178467, "learning_rate": 3.869253869253869e-06, "loss": 0.0005, "step": 3136 }, { "epoch": 3.260914760914761, "grad_norm": 7.206092834472656, "learning_rate": 3.866943866943867e-06, "loss": 1.1063, "step": 3137 }, { "epoch": 3.261954261954262, "grad_norm": 1.0938373804092407, "learning_rate": 3.864633864633865e-06, "loss": 0.024, "step": 3138 }, { "epoch": 3.262993762993763, "grad_norm": 4.277055263519287, "learning_rate": 3.862323862323863e-06, "loss": 0.1911, "step": 3139 }, { "epoch": 3.264033264033264, "grad_norm": 0.10324853658676147, "learning_rate": 3.86001386001386e-06, "loss": 0.0024, "step": 3140 }, { "epoch": 3.265072765072765, "grad_norm": 3.6633353233337402, "learning_rate": 3.8577038577038575e-06, "loss": 0.0671, "step": 3141 }, { "epoch": 3.266112266112266, "grad_norm": 2.2221670150756836, "learning_rate": 3.855393855393856e-06, "loss": 0.0271, "step": 3142 }, { "epoch": 3.267151767151767, "grad_norm": 0.14709371328353882, "learning_rate": 3.853083853083853e-06, "loss": 0.0056, "step": 3143 }, { "epoch": 3.268191268191268, "grad_norm": 4.218796730041504, "learning_rate": 3.850773850773851e-06, "loss": 0.3053, "step": 3144 }, { "epoch": 3.269230769230769, "grad_norm": 3.633521318435669, "learning_rate": 3.848463848463849e-06, "loss": 0.0535, "step": 3145 }, { "epoch": 3.27027027027027, "grad_norm": 2.324286937713623, "learning_rate": 3.846153846153847e-06, 
"loss": 0.0562, "step": 3146 }, { "epoch": 3.271309771309771, "grad_norm": 4.2747111320495605, "learning_rate": 3.843843843843844e-06, "loss": 0.1062, "step": 3147 }, { "epoch": 3.272349272349272, "grad_norm": 0.26570233702659607, "learning_rate": 3.841533841533842e-06, "loss": 0.0053, "step": 3148 }, { "epoch": 3.273388773388773, "grad_norm": 0.3976437449455261, "learning_rate": 3.83922383922384e-06, "loss": 0.0061, "step": 3149 }, { "epoch": 3.274428274428274, "grad_norm": 0.36365818977355957, "learning_rate": 3.836913836913837e-06, "loss": 0.0093, "step": 3150 }, { "epoch": 3.2754677754677752, "grad_norm": 4.533632755279541, "learning_rate": 3.834603834603835e-06, "loss": 0.3268, "step": 3151 }, { "epoch": 3.2765072765072767, "grad_norm": 1.8680239915847778, "learning_rate": 3.8322938322938326e-06, "loss": 0.0712, "step": 3152 }, { "epoch": 3.2775467775467777, "grad_norm": 5.484252452850342, "learning_rate": 3.829983829983831e-06, "loss": 0.1009, "step": 3153 }, { "epoch": 3.2785862785862787, "grad_norm": 9.467362403869629, "learning_rate": 3.827673827673828e-06, "loss": 0.4734, "step": 3154 }, { "epoch": 3.2796257796257797, "grad_norm": 0.005934694781899452, "learning_rate": 3.825363825363826e-06, "loss": 0.0001, "step": 3155 }, { "epoch": 3.2806652806652807, "grad_norm": 9.528066635131836, "learning_rate": 3.823053823053824e-06, "loss": 1.0359, "step": 3156 }, { "epoch": 3.2817047817047817, "grad_norm": 0.7023597359657288, "learning_rate": 3.820743820743821e-06, "loss": 0.018, "step": 3157 }, { "epoch": 3.2827442827442828, "grad_norm": 0.018386509269475937, "learning_rate": 3.818433818433819e-06, "loss": 0.0004, "step": 3158 }, { "epoch": 3.2837837837837838, "grad_norm": 1.8641653060913086, "learning_rate": 3.8161238161238165e-06, "loss": 0.038, "step": 3159 }, { "epoch": 3.284823284823285, "grad_norm": 7.680948257446289, "learning_rate": 3.8138138138138143e-06, "loss": 0.1728, "step": 3160 }, { "epoch": 3.285862785862786, "grad_norm": 5.503201007843018, 
"learning_rate": 3.8115038115038116e-06, "loss": 0.0966, "step": 3161 }, { "epoch": 3.286902286902287, "grad_norm": 17.328290939331055, "learning_rate": 3.80919380919381e-06, "loss": 2.5301, "step": 3162 }, { "epoch": 3.287941787941788, "grad_norm": 0.25888022780418396, "learning_rate": 3.806883806883807e-06, "loss": 0.0081, "step": 3163 }, { "epoch": 3.288981288981289, "grad_norm": 5.120408535003662, "learning_rate": 3.804573804573805e-06, "loss": 0.2536, "step": 3164 }, { "epoch": 3.29002079002079, "grad_norm": 9.060827255249023, "learning_rate": 3.8022638022638027e-06, "loss": 0.8146, "step": 3165 }, { "epoch": 3.2910602910602913, "grad_norm": 1.1167017221450806, "learning_rate": 3.7999537999538004e-06, "loss": 0.0379, "step": 3166 }, { "epoch": 3.2920997920997923, "grad_norm": 2.823840379714966, "learning_rate": 3.7976437976437978e-06, "loss": 0.0544, "step": 3167 }, { "epoch": 3.2931392931392933, "grad_norm": 4.674472808837891, "learning_rate": 3.7953337953337956e-06, "loss": 0.2274, "step": 3168 }, { "epoch": 3.2941787941787943, "grad_norm": 8.276628494262695, "learning_rate": 3.7930237930237933e-06, "loss": 0.4559, "step": 3169 }, { "epoch": 3.2952182952182953, "grad_norm": 3.9904773235321045, "learning_rate": 3.7907137907137907e-06, "loss": 0.1314, "step": 3170 }, { "epoch": 3.2962577962577964, "grad_norm": 2.9665725231170654, "learning_rate": 3.788403788403789e-06, "loss": 0.0674, "step": 3171 }, { "epoch": 3.2972972972972974, "grad_norm": 0.3282411992549896, "learning_rate": 3.786093786093786e-06, "loss": 0.0073, "step": 3172 }, { "epoch": 3.2983367983367984, "grad_norm": 3.566387414932251, "learning_rate": 3.7837837837837844e-06, "loss": 0.0305, "step": 3173 }, { "epoch": 3.2993762993762994, "grad_norm": 0.08132052421569824, "learning_rate": 3.7814737814737817e-06, "loss": 0.0018, "step": 3174 }, { "epoch": 3.3004158004158004, "grad_norm": 0.3602537214756012, "learning_rate": 3.77916377916378e-06, "loss": 0.0076, "step": 3175 }, { "epoch": 
3.3014553014553014, "grad_norm": 4.880962371826172, "learning_rate": 3.7768537768537773e-06, "loss": 0.1439, "step": 3176 }, { "epoch": 3.3024948024948024, "grad_norm": 10.522567749023438, "learning_rate": 3.7745437745437746e-06, "loss": 0.6237, "step": 3177 }, { "epoch": 3.3035343035343034, "grad_norm": 3.7333545684814453, "learning_rate": 3.772233772233773e-06, "loss": 0.0962, "step": 3178 }, { "epoch": 3.3045738045738045, "grad_norm": 0.20931552350521088, "learning_rate": 3.76992376992377e-06, "loss": 0.0057, "step": 3179 }, { "epoch": 3.3056133056133055, "grad_norm": 0.03920707851648331, "learning_rate": 3.767613767613768e-06, "loss": 0.0009, "step": 3180 }, { "epoch": 3.3066528066528065, "grad_norm": 0.022222192957997322, "learning_rate": 3.7653037653037657e-06, "loss": 0.0007, "step": 3181 }, { "epoch": 3.3076923076923075, "grad_norm": 10.037214279174805, "learning_rate": 3.7629937629937634e-06, "loss": 0.6276, "step": 3182 }, { "epoch": 3.3087318087318085, "grad_norm": 0.10803408175706863, "learning_rate": 3.760683760683761e-06, "loss": 0.0022, "step": 3183 }, { "epoch": 3.30977130977131, "grad_norm": 3.0191409587860107, "learning_rate": 3.758373758373759e-06, "loss": 0.05, "step": 3184 }, { "epoch": 3.310810810810811, "grad_norm": 0.004974550101906061, "learning_rate": 3.7560637560637563e-06, "loss": 0.0001, "step": 3185 }, { "epoch": 3.311850311850312, "grad_norm": 2.488440990447998, "learning_rate": 3.7537537537537537e-06, "loss": 0.0761, "step": 3186 }, { "epoch": 3.312889812889813, "grad_norm": 0.1313776671886444, "learning_rate": 3.751443751443752e-06, "loss": 0.0036, "step": 3187 }, { "epoch": 3.313929313929314, "grad_norm": 0.7895636558532715, "learning_rate": 3.749133749133749e-06, "loss": 0.0161, "step": 3188 }, { "epoch": 3.314968814968815, "grad_norm": 0.1644628494977951, "learning_rate": 3.7468237468237474e-06, "loss": 0.0032, "step": 3189 }, { "epoch": 3.316008316008316, "grad_norm": 7.237555980682373, "learning_rate": 3.7445137445137447e-06, 
"loss": 0.2825, "step": 3190 }, { "epoch": 3.317047817047817, "grad_norm": 0.4008646607398987, "learning_rate": 3.7422037422037425e-06, "loss": 0.0082, "step": 3191 }, { "epoch": 3.318087318087318, "grad_norm": 0.002861259737983346, "learning_rate": 3.7398937398937403e-06, "loss": 0.0001, "step": 3192 }, { "epoch": 3.319126819126819, "grad_norm": 0.028044261038303375, "learning_rate": 3.737583737583738e-06, "loss": 0.0005, "step": 3193 }, { "epoch": 3.32016632016632, "grad_norm": 0.0059648011811077595, "learning_rate": 3.735273735273736e-06, "loss": 0.0002, "step": 3194 }, { "epoch": 3.321205821205821, "grad_norm": 0.5465347766876221, "learning_rate": 3.732963732963733e-06, "loss": 0.0163, "step": 3195 }, { "epoch": 3.322245322245322, "grad_norm": 1.3572473526000977, "learning_rate": 3.730653730653731e-06, "loss": 0.0245, "step": 3196 }, { "epoch": 3.323284823284823, "grad_norm": 10.709805488586426, "learning_rate": 3.7283437283437287e-06, "loss": 1.354, "step": 3197 }, { "epoch": 3.3243243243243246, "grad_norm": 8.527897834777832, "learning_rate": 3.7260337260337265e-06, "loss": 0.2518, "step": 3198 }, { "epoch": 3.3253638253638256, "grad_norm": 0.12114723026752472, "learning_rate": 3.723723723723724e-06, "loss": 0.0044, "step": 3199 }, { "epoch": 3.3264033264033266, "grad_norm": 1.078540563583374, "learning_rate": 3.721413721413722e-06, "loss": 0.0328, "step": 3200 }, { "epoch": 3.3274428274428276, "grad_norm": 0.03818194568157196, "learning_rate": 3.7191037191037193e-06, "loss": 0.001, "step": 3201 }, { "epoch": 3.3284823284823286, "grad_norm": 9.92434024810791, "learning_rate": 3.7167937167937175e-06, "loss": 0.4603, "step": 3202 }, { "epoch": 3.3295218295218296, "grad_norm": 7.579704761505127, "learning_rate": 3.714483714483715e-06, "loss": 0.3058, "step": 3203 }, { "epoch": 3.3305613305613306, "grad_norm": 10.627359390258789, "learning_rate": 3.712173712173712e-06, "loss": 1.1327, "step": 3204 }, { "epoch": 3.3316008316008316, "grad_norm": 11.167915344238281, 
"learning_rate": 3.7098637098637104e-06, "loss": 0.4044, "step": 3205 }, { "epoch": 3.3326403326403327, "grad_norm": 9.741639137268066, "learning_rate": 3.7075537075537077e-06, "loss": 0.5733, "step": 3206 }, { "epoch": 3.3336798336798337, "grad_norm": 21.80474090576172, "learning_rate": 3.7052437052437055e-06, "loss": 1.0578, "step": 3207 }, { "epoch": 3.3347193347193347, "grad_norm": 11.758549690246582, "learning_rate": 3.7029337029337033e-06, "loss": 0.8109, "step": 3208 }, { "epoch": 3.3357588357588357, "grad_norm": 2.411346197128296, "learning_rate": 3.700623700623701e-06, "loss": 0.0536, "step": 3209 }, { "epoch": 3.3367983367983367, "grad_norm": 0.29183125495910645, "learning_rate": 3.6983136983136984e-06, "loss": 0.0056, "step": 3210 }, { "epoch": 3.3378378378378377, "grad_norm": 8.743294715881348, "learning_rate": 3.6960036960036966e-06, "loss": 0.9973, "step": 3211 }, { "epoch": 3.3388773388773387, "grad_norm": 4.9171953201293945, "learning_rate": 3.693693693693694e-06, "loss": 0.1239, "step": 3212 }, { "epoch": 3.3399168399168397, "grad_norm": 9.574235916137695, "learning_rate": 3.6913836913836913e-06, "loss": 0.245, "step": 3213 }, { "epoch": 3.3409563409563408, "grad_norm": 10.361645698547363, "learning_rate": 3.6890736890736895e-06, "loss": 0.5266, "step": 3214 }, { "epoch": 3.3419958419958418, "grad_norm": 1.520453691482544, "learning_rate": 3.686763686763687e-06, "loss": 0.2928, "step": 3215 }, { "epoch": 3.343035343035343, "grad_norm": 1.8686317205429077, "learning_rate": 3.684453684453685e-06, "loss": 0.0391, "step": 3216 }, { "epoch": 3.3440748440748442, "grad_norm": 8.648658752441406, "learning_rate": 3.6821436821436823e-06, "loss": 1.0406, "step": 3217 }, { "epoch": 3.3451143451143452, "grad_norm": 2.9640727043151855, "learning_rate": 3.6798336798336805e-06, "loss": 0.0632, "step": 3218 }, { "epoch": 3.3461538461538463, "grad_norm": 2.495372772216797, "learning_rate": 3.677523677523678e-06, "loss": 0.0467, "step": 3219 }, { "epoch": 
3.3471933471933473, "grad_norm": 10.242700576782227, "learning_rate": 3.6752136752136756e-06, "loss": 0.7899, "step": 3220 }, { "epoch": 3.3482328482328483, "grad_norm": 5.957021713256836, "learning_rate": 3.6729036729036734e-06, "loss": 0.7897, "step": 3221 }, { "epoch": 3.3492723492723493, "grad_norm": 0.18006573617458344, "learning_rate": 3.6705936705936707e-06, "loss": 0.0059, "step": 3222 }, { "epoch": 3.3503118503118503, "grad_norm": 0.0461384654045105, "learning_rate": 3.6682836682836685e-06, "loss": 0.0013, "step": 3223 }, { "epoch": 3.3513513513513513, "grad_norm": 6.071020603179932, "learning_rate": 3.6659736659736663e-06, "loss": 0.2303, "step": 3224 }, { "epoch": 3.3523908523908523, "grad_norm": 1.346615195274353, "learning_rate": 3.663663663663664e-06, "loss": 0.0333, "step": 3225 }, { "epoch": 3.3534303534303533, "grad_norm": 0.11079871654510498, "learning_rate": 3.6613536613536614e-06, "loss": 0.0038, "step": 3226 }, { "epoch": 3.3544698544698544, "grad_norm": 3.9040043354034424, "learning_rate": 3.6590436590436596e-06, "loss": 0.1225, "step": 3227 }, { "epoch": 3.3555093555093554, "grad_norm": 6.001784324645996, "learning_rate": 3.656733656733657e-06, "loss": 0.3533, "step": 3228 }, { "epoch": 3.3565488565488564, "grad_norm": 6.267105579376221, "learning_rate": 3.654423654423655e-06, "loss": 0.189, "step": 3229 }, { "epoch": 3.357588357588358, "grad_norm": 9.05850601196289, "learning_rate": 3.6521136521136525e-06, "loss": 1.8204, "step": 3230 }, { "epoch": 3.358627858627859, "grad_norm": 5.095354080200195, "learning_rate": 3.64980364980365e-06, "loss": 0.1764, "step": 3231 }, { "epoch": 3.35966735966736, "grad_norm": 0.06016012281179428, "learning_rate": 3.647493647493648e-06, "loss": 0.0014, "step": 3232 }, { "epoch": 3.360706860706861, "grad_norm": 4.24834680557251, "learning_rate": 3.6451836451836453e-06, "loss": 0.2245, "step": 3233 }, { "epoch": 3.361746361746362, "grad_norm": 0.030917443335056305, "learning_rate": 3.642873642873643e-06, 
"loss": 0.0007, "step": 3234 }, { "epoch": 3.362785862785863, "grad_norm": 0.8087771534919739, "learning_rate": 3.640563640563641e-06, "loss": 0.0179, "step": 3235 }, { "epoch": 3.363825363825364, "grad_norm": 0.3363551199436188, "learning_rate": 3.6382536382536386e-06, "loss": 0.0066, "step": 3236 }, { "epoch": 3.364864864864865, "grad_norm": 8.794129371643066, "learning_rate": 3.635943635943636e-06, "loss": 0.6676, "step": 3237 }, { "epoch": 3.365904365904366, "grad_norm": 5.283533096313477, "learning_rate": 3.633633633633634e-06, "loss": 0.1615, "step": 3238 }, { "epoch": 3.366943866943867, "grad_norm": 0.8963358402252197, "learning_rate": 3.6313236313236315e-06, "loss": 0.0214, "step": 3239 }, { "epoch": 3.367983367983368, "grad_norm": 1.7845673561096191, "learning_rate": 3.6290136290136293e-06, "loss": 0.0336, "step": 3240 }, { "epoch": 3.369022869022869, "grad_norm": 3.5747463703155518, "learning_rate": 3.626703626703627e-06, "loss": 0.0929, "step": 3241 }, { "epoch": 3.37006237006237, "grad_norm": 1.0860050916671753, "learning_rate": 3.6243936243936244e-06, "loss": 0.0163, "step": 3242 }, { "epoch": 3.371101871101871, "grad_norm": 2.0682332515716553, "learning_rate": 3.6220836220836226e-06, "loss": 0.0467, "step": 3243 }, { "epoch": 3.372141372141372, "grad_norm": 0.3948007822036743, "learning_rate": 3.61977361977362e-06, "loss": 0.01, "step": 3244 }, { "epoch": 3.373180873180873, "grad_norm": 0.7122275233268738, "learning_rate": 3.617463617463618e-06, "loss": 0.0148, "step": 3245 }, { "epoch": 3.374220374220374, "grad_norm": 3.147752046585083, "learning_rate": 3.6151536151536155e-06, "loss": 0.1012, "step": 3246 }, { "epoch": 3.375259875259875, "grad_norm": 8.694098472595215, "learning_rate": 3.6128436128436132e-06, "loss": 0.3972, "step": 3247 }, { "epoch": 3.3762993762993765, "grad_norm": 6.065917491912842, "learning_rate": 3.610533610533611e-06, "loss": 0.1205, "step": 3248 }, { "epoch": 3.3773388773388775, "grad_norm": 0.15893030166625977, 
"learning_rate": 3.6082236082236083e-06, "loss": 0.0049, "step": 3249 }, { "epoch": 3.3783783783783785, "grad_norm": 1.9620164632797241, "learning_rate": 3.605913605913606e-06, "loss": 0.0327, "step": 3250 }, { "epoch": 3.3794178794178795, "grad_norm": 6.791231155395508, "learning_rate": 3.603603603603604e-06, "loss": 0.724, "step": 3251 }, { "epoch": 3.3804573804573805, "grad_norm": 1.8762073516845703, "learning_rate": 3.6012936012936016e-06, "loss": 0.0796, "step": 3252 }, { "epoch": 3.3814968814968815, "grad_norm": 2.9928243160247803, "learning_rate": 3.598983598983599e-06, "loss": 0.2971, "step": 3253 }, { "epoch": 3.3825363825363826, "grad_norm": 1.9753413200378418, "learning_rate": 3.596673596673597e-06, "loss": 0.0759, "step": 3254 }, { "epoch": 3.3835758835758836, "grad_norm": 5.722926616668701, "learning_rate": 3.5943635943635945e-06, "loss": 0.1421, "step": 3255 }, { "epoch": 3.3846153846153846, "grad_norm": 6.9614481925964355, "learning_rate": 3.5920535920535927e-06, "loss": 0.2756, "step": 3256 }, { "epoch": 3.3856548856548856, "grad_norm": 1.7708218097686768, "learning_rate": 3.58974358974359e-06, "loss": 0.0287, "step": 3257 }, { "epoch": 3.3866943866943866, "grad_norm": 4.310425758361816, "learning_rate": 3.5874335874335874e-06, "loss": 0.3408, "step": 3258 }, { "epoch": 3.3877338877338876, "grad_norm": 0.33174580335617065, "learning_rate": 3.5851235851235856e-06, "loss": 0.0075, "step": 3259 }, { "epoch": 3.3887733887733886, "grad_norm": 0.6925557851791382, "learning_rate": 3.582813582813583e-06, "loss": 0.018, "step": 3260 }, { "epoch": 3.3898128898128896, "grad_norm": 5.32797908782959, "learning_rate": 3.580503580503581e-06, "loss": 0.6188, "step": 3261 }, { "epoch": 3.390852390852391, "grad_norm": 13.79472541809082, "learning_rate": 3.5781935781935785e-06, "loss": 1.1053, "step": 3262 }, { "epoch": 3.391891891891892, "grad_norm": 4.517867088317871, "learning_rate": 3.5758835758835762e-06, "loss": 0.159, "step": 3263 }, { "epoch": 
3.392931392931393, "grad_norm": 10.487228393554688, "learning_rate": 3.573573573573574e-06, "loss": 0.6029, "step": 3264 }, { "epoch": 3.393970893970894, "grad_norm": 0.06305764615535736, "learning_rate": 3.5712635712635718e-06, "loss": 0.0014, "step": 3265 }, { "epoch": 3.395010395010395, "grad_norm": 0.005865427199751139, "learning_rate": 3.568953568953569e-06, "loss": 0.0001, "step": 3266 }, { "epoch": 3.396049896049896, "grad_norm": 0.030989723280072212, "learning_rate": 3.566643566643567e-06, "loss": 0.0006, "step": 3267 }, { "epoch": 3.397089397089397, "grad_norm": 5.602653980255127, "learning_rate": 3.5643335643335647e-06, "loss": 0.0412, "step": 3268 }, { "epoch": 3.398128898128898, "grad_norm": 17.178709030151367, "learning_rate": 3.562023562023562e-06, "loss": 1.5057, "step": 3269 }, { "epoch": 3.399168399168399, "grad_norm": 0.24581624567508698, "learning_rate": 3.55971355971356e-06, "loss": 0.0068, "step": 3270 }, { "epoch": 3.4002079002079, "grad_norm": 3.708796501159668, "learning_rate": 3.5574035574035575e-06, "loss": 0.0858, "step": 3271 }, { "epoch": 3.401247401247401, "grad_norm": 5.2083868980407715, "learning_rate": 3.5550935550935557e-06, "loss": 0.124, "step": 3272 }, { "epoch": 3.4022869022869022, "grad_norm": 0.6852853894233704, "learning_rate": 3.552783552783553e-06, "loss": 0.0206, "step": 3273 }, { "epoch": 3.4033264033264032, "grad_norm": 7.372675895690918, "learning_rate": 3.550473550473551e-06, "loss": 0.1647, "step": 3274 }, { "epoch": 3.4043659043659042, "grad_norm": 7.724948406219482, "learning_rate": 3.5481635481635486e-06, "loss": 0.2526, "step": 3275 }, { "epoch": 3.4054054054054053, "grad_norm": 8.5491304397583, "learning_rate": 3.545853545853546e-06, "loss": 0.7543, "step": 3276 }, { "epoch": 3.4064449064449063, "grad_norm": 1.5162405967712402, "learning_rate": 3.5435435435435437e-06, "loss": 0.0232, "step": 3277 }, { "epoch": 3.4074844074844073, "grad_norm": 6.716646194458008, "learning_rate": 3.5412335412335415e-06, "loss": 
0.1469, "step": 3278 }, { "epoch": 3.4085239085239083, "grad_norm": 1.3024156093597412, "learning_rate": 3.5389235389235392e-06, "loss": 0.0274, "step": 3279 }, { "epoch": 3.4095634095634098, "grad_norm": 2.4426019191741943, "learning_rate": 3.5366135366135366e-06, "loss": 0.0742, "step": 3280 }, { "epoch": 3.4106029106029108, "grad_norm": 1.7469676733016968, "learning_rate": 3.5343035343035348e-06, "loss": 0.0559, "step": 3281 }, { "epoch": 3.4116424116424118, "grad_norm": 0.047953590750694275, "learning_rate": 3.531993531993532e-06, "loss": 0.001, "step": 3282 }, { "epoch": 3.412681912681913, "grad_norm": 9.703154563903809, "learning_rate": 3.5296835296835303e-06, "loss": 0.39, "step": 3283 }, { "epoch": 3.413721413721414, "grad_norm": 2.85369873046875, "learning_rate": 3.5273735273735277e-06, "loss": 0.0528, "step": 3284 }, { "epoch": 3.414760914760915, "grad_norm": 10.792146682739258, "learning_rate": 3.525063525063525e-06, "loss": 0.5995, "step": 3285 }, { "epoch": 3.415800415800416, "grad_norm": 7.268505096435547, "learning_rate": 3.522753522753523e-06, "loss": 0.2875, "step": 3286 }, { "epoch": 3.416839916839917, "grad_norm": 0.2587016224861145, "learning_rate": 3.5204435204435205e-06, "loss": 0.0075, "step": 3287 }, { "epoch": 3.417879417879418, "grad_norm": 4.772622108459473, "learning_rate": 3.5181335181335187e-06, "loss": 0.4761, "step": 3288 }, { "epoch": 3.418918918918919, "grad_norm": 8.32681941986084, "learning_rate": 3.515823515823516e-06, "loss": 0.2824, "step": 3289 }, { "epoch": 3.41995841995842, "grad_norm": 8.72518253326416, "learning_rate": 3.513513513513514e-06, "loss": 0.5085, "step": 3290 }, { "epoch": 3.420997920997921, "grad_norm": 0.007968052290380001, "learning_rate": 3.5112035112035116e-06, "loss": 0.0002, "step": 3291 }, { "epoch": 3.422037422037422, "grad_norm": 14.061298370361328, "learning_rate": 3.5088935088935094e-06, "loss": 1.0375, "step": 3292 }, { "epoch": 3.423076923076923, "grad_norm": 6.244088649749756, "learning_rate": 
3.5065835065835067e-06, "loss": 0.1486, "step": 3293 }, { "epoch": 3.4241164241164244, "grad_norm": 9.241902351379395, "learning_rate": 3.5042735042735045e-06, "loss": 0.7761, "step": 3294 }, { "epoch": 3.4251559251559254, "grad_norm": 7.88060188293457, "learning_rate": 3.5019635019635022e-06, "loss": 0.701, "step": 3295 }, { "epoch": 3.4261954261954264, "grad_norm": 10.947381973266602, "learning_rate": 3.4996534996534996e-06, "loss": 0.8914, "step": 3296 }, { "epoch": 3.4272349272349274, "grad_norm": 8.519932746887207, "learning_rate": 3.4973434973434978e-06, "loss": 0.6309, "step": 3297 }, { "epoch": 3.4282744282744284, "grad_norm": 0.453152596950531, "learning_rate": 3.495033495033495e-06, "loss": 0.0069, "step": 3298 }, { "epoch": 3.4293139293139294, "grad_norm": 5.153072834014893, "learning_rate": 3.4927234927234933e-06, "loss": 0.3492, "step": 3299 }, { "epoch": 3.4303534303534304, "grad_norm": 0.113200344145298, "learning_rate": 3.4904134904134907e-06, "loss": 0.0024, "step": 3300 }, { "epoch": 3.4313929313929314, "grad_norm": 0.620791494846344, "learning_rate": 3.488103488103489e-06, "loss": 0.009, "step": 3301 }, { "epoch": 3.4324324324324325, "grad_norm": 9.207609176635742, "learning_rate": 3.485793485793486e-06, "loss": 0.6816, "step": 3302 }, { "epoch": 3.4334719334719335, "grad_norm": 7.150522232055664, "learning_rate": 3.4834834834834835e-06, "loss": 0.3006, "step": 3303 }, { "epoch": 3.4345114345114345, "grad_norm": 14.311772346496582, "learning_rate": 3.4811734811734817e-06, "loss": 0.6698, "step": 3304 }, { "epoch": 3.4355509355509355, "grad_norm": 5.834444522857666, "learning_rate": 3.478863478863479e-06, "loss": 0.2432, "step": 3305 }, { "epoch": 3.4365904365904365, "grad_norm": 5.6178388595581055, "learning_rate": 3.476553476553477e-06, "loss": 0.2244, "step": 3306 }, { "epoch": 3.4376299376299375, "grad_norm": 8.573112487792969, "learning_rate": 3.4742434742434746e-06, "loss": 0.2221, "step": 3307 }, { "epoch": 3.4386694386694385, "grad_norm": 
1.456484317779541, "learning_rate": 3.4719334719334724e-06, "loss": 0.027, "step": 3308 }, { "epoch": 3.4397089397089395, "grad_norm": 8.12042236328125, "learning_rate": 3.4696234696234697e-06, "loss": 0.1341, "step": 3309 }, { "epoch": 3.4407484407484406, "grad_norm": 2.8493335247039795, "learning_rate": 3.467313467313468e-06, "loss": 0.0507, "step": 3310 }, { "epoch": 3.4417879417879416, "grad_norm": 0.18166130781173706, "learning_rate": 3.4650034650034653e-06, "loss": 0.0043, "step": 3311 }, { "epoch": 3.442827442827443, "grad_norm": 0.8375445008277893, "learning_rate": 3.4626934626934626e-06, "loss": 0.0124, "step": 3312 }, { "epoch": 3.443866943866944, "grad_norm": 3.7751896381378174, "learning_rate": 3.4603834603834608e-06, "loss": 0.2906, "step": 3313 }, { "epoch": 3.444906444906445, "grad_norm": 6.509339809417725, "learning_rate": 3.458073458073458e-06, "loss": 0.3631, "step": 3314 }, { "epoch": 3.445945945945946, "grad_norm": 8.254549026489258, "learning_rate": 3.4557634557634563e-06, "loss": 0.2388, "step": 3315 }, { "epoch": 3.446985446985447, "grad_norm": 7.653320789337158, "learning_rate": 3.4534534534534537e-06, "loss": 0.2905, "step": 3316 }, { "epoch": 3.448024948024948, "grad_norm": 13.026524543762207, "learning_rate": 3.4511434511434514e-06, "loss": 0.4787, "step": 3317 }, { "epoch": 3.449064449064449, "grad_norm": 3.805297374725342, "learning_rate": 3.448833448833449e-06, "loss": 0.1155, "step": 3318 }, { "epoch": 3.45010395010395, "grad_norm": 0.15272344648838043, "learning_rate": 3.446523446523447e-06, "loss": 0.0039, "step": 3319 }, { "epoch": 3.451143451143451, "grad_norm": 6.021086692810059, "learning_rate": 3.4442134442134443e-06, "loss": 0.3758, "step": 3320 }, { "epoch": 3.452182952182952, "grad_norm": 0.07570965588092804, "learning_rate": 3.441903441903442e-06, "loss": 0.0027, "step": 3321 }, { "epoch": 3.453222453222453, "grad_norm": 0.24904006719589233, "learning_rate": 3.43959343959344e-06, "loss": 0.0048, "step": 3322 }, { "epoch": 
3.454261954261954, "grad_norm": 21.89639663696289, "learning_rate": 3.437283437283437e-06, "loss": 0.4499, "step": 3323 }, { "epoch": 3.455301455301455, "grad_norm": 0.08154599368572235, "learning_rate": 3.4349734349734354e-06, "loss": 0.0018, "step": 3324 }, { "epoch": 3.456340956340956, "grad_norm": 0.4847028851509094, "learning_rate": 3.4326634326634327e-06, "loss": 0.0132, "step": 3325 }, { "epoch": 3.4573804573804576, "grad_norm": 13.547603607177734, "learning_rate": 3.430353430353431e-06, "loss": 1.0378, "step": 3326 }, { "epoch": 3.4584199584199586, "grad_norm": 10.268739700317383, "learning_rate": 3.4280434280434283e-06, "loss": 0.6069, "step": 3327 }, { "epoch": 3.4594594594594597, "grad_norm": 0.10986010730266571, "learning_rate": 3.4257334257334264e-06, "loss": 0.0018, "step": 3328 }, { "epoch": 3.4604989604989607, "grad_norm": 1.7400387525558472, "learning_rate": 3.423423423423424e-06, "loss": 0.0485, "step": 3329 }, { "epoch": 3.4615384615384617, "grad_norm": 3.423602819442749, "learning_rate": 3.421113421113421e-06, "loss": 0.1067, "step": 3330 }, { "epoch": 3.4625779625779627, "grad_norm": 0.472627729177475, "learning_rate": 3.4188034188034193e-06, "loss": 0.0154, "step": 3331 }, { "epoch": 3.4636174636174637, "grad_norm": 0.05725175887346268, "learning_rate": 3.4164934164934167e-06, "loss": 0.0017, "step": 3332 }, { "epoch": 3.4646569646569647, "grad_norm": 0.6359158754348755, "learning_rate": 3.4141834141834144e-06, "loss": 0.0094, "step": 3333 }, { "epoch": 3.4656964656964657, "grad_norm": 7.547875881195068, "learning_rate": 3.411873411873412e-06, "loss": 0.5435, "step": 3334 }, { "epoch": 3.4667359667359667, "grad_norm": 0.7937979698181152, "learning_rate": 3.40956340956341e-06, "loss": 0.0177, "step": 3335 }, { "epoch": 3.4677754677754677, "grad_norm": 10.875249862670898, "learning_rate": 3.4072534072534073e-06, "loss": 0.9046, "step": 3336 }, { "epoch": 3.4688149688149688, "grad_norm": 0.3446725606918335, "learning_rate": 
3.4049434049434055e-06, "loss": 0.0061, "step": 3337 }, { "epoch": 3.4698544698544698, "grad_norm": 4.590447425842285, "learning_rate": 3.402633402633403e-06, "loss": 0.1371, "step": 3338 }, { "epoch": 3.470893970893971, "grad_norm": 6.492908954620361, "learning_rate": 3.4003234003234e-06, "loss": 0.3914, "step": 3339 }, { "epoch": 3.471933471933472, "grad_norm": 0.5407949686050415, "learning_rate": 3.3980133980133984e-06, "loss": 0.0078, "step": 3340 }, { "epoch": 3.472972972972973, "grad_norm": 14.164336204528809, "learning_rate": 3.3957033957033957e-06, "loss": 1.6147, "step": 3341 }, { "epoch": 3.474012474012474, "grad_norm": 3.5835986137390137, "learning_rate": 3.393393393393394e-06, "loss": 0.0709, "step": 3342 }, { "epoch": 3.475051975051975, "grad_norm": 0.34511393308639526, "learning_rate": 3.3910833910833913e-06, "loss": 0.008, "step": 3343 }, { "epoch": 3.4760914760914763, "grad_norm": 2.1724395751953125, "learning_rate": 3.388773388773389e-06, "loss": 0.0773, "step": 3344 }, { "epoch": 3.4771309771309773, "grad_norm": 0.025691157206892967, "learning_rate": 3.386463386463387e-06, "loss": 0.0007, "step": 3345 }, { "epoch": 3.4781704781704783, "grad_norm": 0.0027891502249985933, "learning_rate": 3.3841533841533846e-06, "loss": 0.0001, "step": 3346 }, { "epoch": 3.4792099792099793, "grad_norm": 7.376180171966553, "learning_rate": 3.3818433818433823e-06, "loss": 0.7187, "step": 3347 }, { "epoch": 3.4802494802494803, "grad_norm": 0.02257869578897953, "learning_rate": 3.3795333795333797e-06, "loss": 0.0006, "step": 3348 }, { "epoch": 3.4812889812889813, "grad_norm": 10.220701217651367, "learning_rate": 3.3772233772233774e-06, "loss": 0.5355, "step": 3349 }, { "epoch": 3.4823284823284824, "grad_norm": 3.1495954990386963, "learning_rate": 3.374913374913375e-06, "loss": 0.3358, "step": 3350 }, { "epoch": 3.4833679833679834, "grad_norm": 5.3782830238342285, "learning_rate": 3.372603372603373e-06, "loss": 0.1578, "step": 3351 }, { "epoch": 3.4844074844074844, 
"grad_norm": 0.06863047927618027, "learning_rate": 3.3702933702933703e-06, "loss": 0.0012, "step": 3352 }, { "epoch": 3.4854469854469854, "grad_norm": 1.133750319480896, "learning_rate": 3.3679833679833685e-06, "loss": 0.0422, "step": 3353 }, { "epoch": 3.4864864864864864, "grad_norm": 11.342142105102539, "learning_rate": 3.365673365673366e-06, "loss": 1.5505, "step": 3354 }, { "epoch": 3.4875259875259874, "grad_norm": 1.8219274282455444, "learning_rate": 3.363363363363364e-06, "loss": 0.0329, "step": 3355 }, { "epoch": 3.4885654885654884, "grad_norm": 5.730031967163086, "learning_rate": 3.3610533610533614e-06, "loss": 0.2092, "step": 3356 }, { "epoch": 3.4896049896049894, "grad_norm": 0.18049009144306183, "learning_rate": 3.3587433587433587e-06, "loss": 0.0041, "step": 3357 }, { "epoch": 3.490644490644491, "grad_norm": 2.5031416416168213, "learning_rate": 3.356433356433357e-06, "loss": 0.0972, "step": 3358 }, { "epoch": 3.491683991683992, "grad_norm": 0.024312352761626244, "learning_rate": 3.3541233541233543e-06, "loss": 0.0007, "step": 3359 }, { "epoch": 3.492723492723493, "grad_norm": 1.044015884399414, "learning_rate": 3.351813351813352e-06, "loss": 0.0218, "step": 3360 }, { "epoch": 3.493762993762994, "grad_norm": 9.544400215148926, "learning_rate": 3.34950334950335e-06, "loss": 0.8989, "step": 3361 }, { "epoch": 3.494802494802495, "grad_norm": 3.1391563415527344, "learning_rate": 3.3471933471933476e-06, "loss": 0.0732, "step": 3362 }, { "epoch": 3.495841995841996, "grad_norm": 0.5424788594245911, "learning_rate": 3.344883344883345e-06, "loss": 0.0115, "step": 3363 }, { "epoch": 3.496881496881497, "grad_norm": 11.996427536010742, "learning_rate": 3.342573342573343e-06, "loss": 0.8597, "step": 3364 }, { "epoch": 3.497920997920998, "grad_norm": 0.0049547902308404446, "learning_rate": 3.3402633402633404e-06, "loss": 0.0001, "step": 3365 }, { "epoch": 3.498960498960499, "grad_norm": 0.07860179245471954, "learning_rate": 3.337953337953338e-06, "loss": 0.0013, 
"step": 3366 }, { "epoch": 3.5, "grad_norm": 10.51418685913086, "learning_rate": 3.335643335643336e-06, "loss": 0.2187, "step": 3367 }, { "epoch": 3.501039501039501, "grad_norm": 0.04997267574071884, "learning_rate": 3.3333333333333333e-06, "loss": 0.0013, "step": 3368 }, { "epoch": 3.502079002079002, "grad_norm": 11.178793907165527, "learning_rate": 3.3310233310233315e-06, "loss": 0.9946, "step": 3369 }, { "epoch": 3.503118503118503, "grad_norm": 4.278314113616943, "learning_rate": 3.328713328713329e-06, "loss": 0.1087, "step": 3370 }, { "epoch": 3.504158004158004, "grad_norm": 1.7856881618499756, "learning_rate": 3.326403326403327e-06, "loss": 0.0257, "step": 3371 }, { "epoch": 3.505197505197505, "grad_norm": 0.048529352992773056, "learning_rate": 3.3240933240933244e-06, "loss": 0.0016, "step": 3372 }, { "epoch": 3.506237006237006, "grad_norm": 3.9260010719299316, "learning_rate": 3.321783321783322e-06, "loss": 0.0494, "step": 3373 }, { "epoch": 3.507276507276507, "grad_norm": 3.7316524982452393, "learning_rate": 3.31947331947332e-06, "loss": 0.0984, "step": 3374 }, { "epoch": 3.508316008316008, "grad_norm": 7.0579400062561035, "learning_rate": 3.3171633171633173e-06, "loss": 0.2721, "step": 3375 }, { "epoch": 3.509355509355509, "grad_norm": 0.03118578903377056, "learning_rate": 3.314853314853315e-06, "loss": 0.0008, "step": 3376 }, { "epoch": 3.51039501039501, "grad_norm": 7.600438117980957, "learning_rate": 3.312543312543313e-06, "loss": 0.3741, "step": 3377 }, { "epoch": 3.5114345114345116, "grad_norm": 0.0009457019041292369, "learning_rate": 3.3102333102333106e-06, "loss": 0.0, "step": 3378 }, { "epoch": 3.5124740124740126, "grad_norm": 3.3967413902282715, "learning_rate": 3.307923307923308e-06, "loss": 0.08, "step": 3379 }, { "epoch": 3.5135135135135136, "grad_norm": 0.9884988069534302, "learning_rate": 3.305613305613306e-06, "loss": 0.0188, "step": 3380 }, { "epoch": 3.5145530145530146, "grad_norm": 0.1111251637339592, "learning_rate": 
3.3033033033033035e-06, "loss": 0.0028, "step": 3381 }, { "epoch": 3.5155925155925156, "grad_norm": 9.056451797485352, "learning_rate": 3.3009933009933016e-06, "loss": 0.366, "step": 3382 }, { "epoch": 3.5166320166320166, "grad_norm": 5.030453681945801, "learning_rate": 3.298683298683299e-06, "loss": 0.1245, "step": 3383 }, { "epoch": 3.5176715176715176, "grad_norm": 9.545477867126465, "learning_rate": 3.2963732963732963e-06, "loss": 2.5062, "step": 3384 }, { "epoch": 3.5187110187110187, "grad_norm": 17.07159423828125, "learning_rate": 3.2940632940632945e-06, "loss": 0.78, "step": 3385 }, { "epoch": 3.5197505197505197, "grad_norm": 0.9238329529762268, "learning_rate": 3.291753291753292e-06, "loss": 0.0238, "step": 3386 }, { "epoch": 3.5207900207900207, "grad_norm": 3.1281144618988037, "learning_rate": 3.2894432894432896e-06, "loss": 0.2682, "step": 3387 }, { "epoch": 3.5218295218295217, "grad_norm": 7.184891700744629, "learning_rate": 3.2871332871332874e-06, "loss": 0.3436, "step": 3388 }, { "epoch": 3.5228690228690227, "grad_norm": 0.00028464957722462714, "learning_rate": 3.284823284823285e-06, "loss": 0.0, "step": 3389 }, { "epoch": 3.523908523908524, "grad_norm": 1.0748562812805176, "learning_rate": 3.2825132825132825e-06, "loss": 0.0262, "step": 3390 }, { "epoch": 3.524948024948025, "grad_norm": 0.48287853598594666, "learning_rate": 3.2802032802032807e-06, "loss": 0.0074, "step": 3391 }, { "epoch": 3.525987525987526, "grad_norm": 2.27337908744812, "learning_rate": 3.277893277893278e-06, "loss": 0.0436, "step": 3392 }, { "epoch": 3.527027027027027, "grad_norm": 13.80897045135498, "learning_rate": 3.275583275583276e-06, "loss": 0.3124, "step": 3393 }, { "epoch": 3.528066528066528, "grad_norm": 4.119503498077393, "learning_rate": 3.2732732732732736e-06, "loss": 0.1013, "step": 3394 }, { "epoch": 3.529106029106029, "grad_norm": 1.8827383518218994, "learning_rate": 3.270963270963271e-06, "loss": 0.0556, "step": 3395 }, { "epoch": 3.5301455301455302, "grad_norm": 
1.0069574117660522, "learning_rate": 3.268653268653269e-06, "loss": 0.016, "step": 3396 }, { "epoch": 3.5311850311850312, "grad_norm": 2.0999436378479004, "learning_rate": 3.2663432663432665e-06, "loss": 0.0216, "step": 3397 }, { "epoch": 3.5322245322245323, "grad_norm": 11.844630241394043, "learning_rate": 3.2640332640332646e-06, "loss": 1.6668, "step": 3398 }, { "epoch": 3.5332640332640333, "grad_norm": 5.110901832580566, "learning_rate": 3.261723261723262e-06, "loss": 0.1361, "step": 3399 }, { "epoch": 3.5343035343035343, "grad_norm": 0.22296836972236633, "learning_rate": 3.2594132594132598e-06, "loss": 0.0056, "step": 3400 }, { "epoch": 3.5353430353430353, "grad_norm": 8.396090507507324, "learning_rate": 3.2571032571032575e-06, "loss": 0.3264, "step": 3401 }, { "epoch": 3.5363825363825363, "grad_norm": 0.7088819742202759, "learning_rate": 3.254793254793255e-06, "loss": 0.013, "step": 3402 }, { "epoch": 3.5374220374220373, "grad_norm": 1.2826210260391235, "learning_rate": 3.2524832524832526e-06, "loss": 0.0232, "step": 3403 }, { "epoch": 3.5384615384615383, "grad_norm": 8.47658634185791, "learning_rate": 3.2501732501732504e-06, "loss": 0.3357, "step": 3404 }, { "epoch": 3.5395010395010393, "grad_norm": 11.396049499511719, "learning_rate": 3.247863247863248e-06, "loss": 1.7146, "step": 3405 }, { "epoch": 3.5405405405405403, "grad_norm": 0.4763636291027069, "learning_rate": 3.2455532455532455e-06, "loss": 0.0211, "step": 3406 }, { "epoch": 3.5415800415800414, "grad_norm": 1.6495420932769775, "learning_rate": 3.2432432432432437e-06, "loss": 0.0316, "step": 3407 }, { "epoch": 3.5426195426195424, "grad_norm": 2.4308693408966064, "learning_rate": 3.240933240933241e-06, "loss": 0.0561, "step": 3408 }, { "epoch": 3.5436590436590434, "grad_norm": 8.214385032653809, "learning_rate": 3.2386232386232392e-06, "loss": 0.3248, "step": 3409 }, { "epoch": 3.544698544698545, "grad_norm": 0.19386696815490723, "learning_rate": 3.2363132363132366e-06, "loss": 0.0022, "step": 3410 }, 
{ "epoch": 3.545738045738046, "grad_norm": 6.6452107429504395, "learning_rate": 3.234003234003234e-06, "loss": 0.3384, "step": 3411 }, { "epoch": 3.546777546777547, "grad_norm": 7.752755165100098, "learning_rate": 3.231693231693232e-06, "loss": 0.7156, "step": 3412 }, { "epoch": 3.547817047817048, "grad_norm": 2.214608669281006, "learning_rate": 3.2293832293832295e-06, "loss": 0.0791, "step": 3413 }, { "epoch": 3.548856548856549, "grad_norm": 8.558451652526855, "learning_rate": 3.2270732270732276e-06, "loss": 0.9333, "step": 3414 }, { "epoch": 3.54989604989605, "grad_norm": 7.606333255767822, "learning_rate": 3.224763224763225e-06, "loss": 0.5964, "step": 3415 }, { "epoch": 3.550935550935551, "grad_norm": 3.464120388031006, "learning_rate": 3.2224532224532228e-06, "loss": 0.0738, "step": 3416 }, { "epoch": 3.551975051975052, "grad_norm": 12.014719009399414, "learning_rate": 3.2201432201432205e-06, "loss": 1.1626, "step": 3417 }, { "epoch": 3.553014553014553, "grad_norm": 16.28420639038086, "learning_rate": 3.2178332178332183e-06, "loss": 0.7914, "step": 3418 }, { "epoch": 3.554054054054054, "grad_norm": 0.35258999466896057, "learning_rate": 3.2155232155232156e-06, "loss": 0.0122, "step": 3419 }, { "epoch": 3.555093555093555, "grad_norm": 0.008318168111145496, "learning_rate": 3.2132132132132134e-06, "loss": 0.0002, "step": 3420 }, { "epoch": 3.556133056133056, "grad_norm": 0.10127261281013489, "learning_rate": 3.210903210903211e-06, "loss": 0.0022, "step": 3421 }, { "epoch": 3.5571725571725574, "grad_norm": 1.6699411869049072, "learning_rate": 3.2085932085932085e-06, "loss": 0.0236, "step": 3422 }, { "epoch": 3.5582120582120584, "grad_norm": 0.19638171792030334, "learning_rate": 3.2062832062832067e-06, "loss": 0.0037, "step": 3423 }, { "epoch": 3.5592515592515594, "grad_norm": 0.19892263412475586, "learning_rate": 3.203973203973204e-06, "loss": 0.0042, "step": 3424 }, { "epoch": 3.5602910602910605, "grad_norm": 6.505924224853516, "learning_rate": 
3.2016632016632022e-06, "loss": 0.1503, "step": 3425 }, { "epoch": 3.5613305613305615, "grad_norm": 0.04345586895942688, "learning_rate": 3.1993531993531996e-06, "loss": 0.0011, "step": 3426 }, { "epoch": 3.5623700623700625, "grad_norm": 7.214348793029785, "learning_rate": 3.1970431970431974e-06, "loss": 0.4071, "step": 3427 }, { "epoch": 3.5634095634095635, "grad_norm": 0.15629810094833374, "learning_rate": 3.194733194733195e-06, "loss": 0.0053, "step": 3428 }, { "epoch": 3.5644490644490645, "grad_norm": 7.527303695678711, "learning_rate": 3.1924231924231925e-06, "loss": 0.3416, "step": 3429 }, { "epoch": 3.5654885654885655, "grad_norm": 9.97258472442627, "learning_rate": 3.1901131901131902e-06, "loss": 0.3462, "step": 3430 }, { "epoch": 3.5665280665280665, "grad_norm": 1.385948896408081, "learning_rate": 3.187803187803188e-06, "loss": 0.0396, "step": 3431 }, { "epoch": 3.5675675675675675, "grad_norm": 1.7325775623321533, "learning_rate": 3.1854931854931858e-06, "loss": 0.0559, "step": 3432 }, { "epoch": 3.5686070686070686, "grad_norm": 2.2397871017456055, "learning_rate": 3.183183183183183e-06, "loss": 0.0434, "step": 3433 }, { "epoch": 3.5696465696465696, "grad_norm": 0.028923479840159416, "learning_rate": 3.1808731808731813e-06, "loss": 0.0007, "step": 3434 }, { "epoch": 3.5706860706860706, "grad_norm": 1.0595364570617676, "learning_rate": 3.1785631785631786e-06, "loss": 0.0283, "step": 3435 }, { "epoch": 3.5717255717255716, "grad_norm": 2.357923746109009, "learning_rate": 3.176253176253177e-06, "loss": 0.0457, "step": 3436 }, { "epoch": 3.5727650727650726, "grad_norm": 11.933266639709473, "learning_rate": 3.173943173943174e-06, "loss": 1.2562, "step": 3437 }, { "epoch": 3.5738045738045736, "grad_norm": 8.789453506469727, "learning_rate": 3.1716331716331715e-06, "loss": 0.8383, "step": 3438 }, { "epoch": 3.5748440748440746, "grad_norm": 8.93902587890625, "learning_rate": 3.1693231693231697e-06, "loss": 0.6527, "step": 3439 }, { "epoch": 3.5758835758835756, 
"grad_norm": 0.027036476880311966, "learning_rate": 3.167013167013167e-06, "loss": 0.0005, "step": 3440 }, { "epoch": 3.5769230769230766, "grad_norm": 0.13876822590827942, "learning_rate": 3.1647031647031652e-06, "loss": 0.0027, "step": 3441 }, { "epoch": 3.577962577962578, "grad_norm": 8.124290466308594, "learning_rate": 3.1623931623931626e-06, "loss": 0.6514, "step": 3442 }, { "epoch": 3.579002079002079, "grad_norm": 0.695641279220581, "learning_rate": 3.1600831600831604e-06, "loss": 0.0171, "step": 3443 }, { "epoch": 3.58004158004158, "grad_norm": 9.36235523223877, "learning_rate": 3.157773157773158e-06, "loss": 0.4286, "step": 3444 }, { "epoch": 3.581081081081081, "grad_norm": 0.017492927610874176, "learning_rate": 3.155463155463156e-06, "loss": 0.0003, "step": 3445 }, { "epoch": 3.582120582120582, "grad_norm": 5.884636402130127, "learning_rate": 3.1531531531531532e-06, "loss": 0.6446, "step": 3446 }, { "epoch": 3.583160083160083, "grad_norm": 13.913434028625488, "learning_rate": 3.150843150843151e-06, "loss": 0.6823, "step": 3447 }, { "epoch": 3.584199584199584, "grad_norm": 8.719658851623535, "learning_rate": 3.1485331485331488e-06, "loss": 0.5322, "step": 3448 }, { "epoch": 3.585239085239085, "grad_norm": 10.48001480102539, "learning_rate": 3.146223146223146e-06, "loss": 1.0481, "step": 3449 }, { "epoch": 3.586278586278586, "grad_norm": 6.402904510498047, "learning_rate": 3.1439131439131443e-06, "loss": 0.447, "step": 3450 }, { "epoch": 3.587318087318087, "grad_norm": 9.76417064666748, "learning_rate": 3.1416031416031416e-06, "loss": 0.6952, "step": 3451 }, { "epoch": 3.5883575883575882, "grad_norm": 2.7027227878570557, "learning_rate": 3.13929313929314e-06, "loss": 0.0502, "step": 3452 }, { "epoch": 3.5893970893970892, "grad_norm": 0.3677961528301239, "learning_rate": 3.136983136983137e-06, "loss": 0.0098, "step": 3453 }, { "epoch": 3.5904365904365907, "grad_norm": 0.6174468398094177, "learning_rate": 3.1346731346731354e-06, "loss": 0.0145, "step": 3454 }, 
{ "epoch": 3.5914760914760917, "grad_norm": 9.135771751403809, "learning_rate": 3.1323631323631327e-06, "loss": 0.2257, "step": 3455 }, { "epoch": 3.5925155925155927, "grad_norm": 0.7872626185417175, "learning_rate": 3.13005313005313e-06, "loss": 0.0139, "step": 3456 }, { "epoch": 3.5935550935550937, "grad_norm": 4.767543792724609, "learning_rate": 3.1277431277431283e-06, "loss": 0.1294, "step": 3457 }, { "epoch": 3.5945945945945947, "grad_norm": 9.243647575378418, "learning_rate": 3.1254331254331256e-06, "loss": 0.5878, "step": 3458 }, { "epoch": 3.5956340956340958, "grad_norm": 0.5455932021141052, "learning_rate": 3.1231231231231234e-06, "loss": 0.0145, "step": 3459 }, { "epoch": 3.5966735966735968, "grad_norm": 8.225960731506348, "learning_rate": 3.120813120813121e-06, "loss": 0.6251, "step": 3460 }, { "epoch": 3.5977130977130978, "grad_norm": 23.434959411621094, "learning_rate": 3.118503118503119e-06, "loss": 1.1641, "step": 3461 }, { "epoch": 3.598752598752599, "grad_norm": 5.3083720207214355, "learning_rate": 3.1161931161931162e-06, "loss": 0.1499, "step": 3462 }, { "epoch": 3.5997920997921, "grad_norm": 0.2932235300540924, "learning_rate": 3.1138831138831144e-06, "loss": 0.0054, "step": 3463 }, { "epoch": 3.600831600831601, "grad_norm": 0.014458829537034035, "learning_rate": 3.1115731115731118e-06, "loss": 0.0004, "step": 3464 }, { "epoch": 3.601871101871102, "grad_norm": 3.100857734680176, "learning_rate": 3.109263109263109e-06, "loss": 0.0848, "step": 3465 }, { "epoch": 3.602910602910603, "grad_norm": 3.241119384765625, "learning_rate": 3.1069531069531073e-06, "loss": 0.1285, "step": 3466 }, { "epoch": 3.603950103950104, "grad_norm": 1.9484933614730835, "learning_rate": 3.1046431046431047e-06, "loss": 0.0507, "step": 3467 }, { "epoch": 3.604989604989605, "grad_norm": 14.662372589111328, "learning_rate": 3.102333102333103e-06, "loss": 0.925, "step": 3468 }, { "epoch": 3.606029106029106, "grad_norm": 9.647832870483398, "learning_rate": 3.1000231000231e-06, 
"loss": 0.7678, "step": 3469 }, { "epoch": 3.607068607068607, "grad_norm": 2.403639316558838, "learning_rate": 3.097713097713098e-06, "loss": 0.1773, "step": 3470 }, { "epoch": 3.608108108108108, "grad_norm": 4.800213813781738, "learning_rate": 3.0954030954030957e-06, "loss": 0.1514, "step": 3471 }, { "epoch": 3.609147609147609, "grad_norm": 0.20564277470111847, "learning_rate": 3.0930930930930935e-06, "loss": 0.0037, "step": 3472 }, { "epoch": 3.61018711018711, "grad_norm": 0.821073055267334, "learning_rate": 3.090783090783091e-06, "loss": 0.0293, "step": 3473 }, { "epoch": 3.6112266112266114, "grad_norm": 1.197895884513855, "learning_rate": 3.0884730884730886e-06, "loss": 0.0228, "step": 3474 }, { "epoch": 3.6122661122661124, "grad_norm": 1.6000584363937378, "learning_rate": 3.0861630861630864e-06, "loss": 0.0328, "step": 3475 }, { "epoch": 3.6133056133056134, "grad_norm": 3.514110803604126, "learning_rate": 3.0838530838530837e-06, "loss": 0.0724, "step": 3476 }, { "epoch": 3.6143451143451144, "grad_norm": 11.626545906066895, "learning_rate": 3.081543081543082e-06, "loss": 0.9721, "step": 3477 }, { "epoch": 3.6153846153846154, "grad_norm": 7.133696556091309, "learning_rate": 3.0792330792330792e-06, "loss": 0.5373, "step": 3478 }, { "epoch": 3.6164241164241164, "grad_norm": 6.3263983726501465, "learning_rate": 3.0769230769230774e-06, "loss": 0.2538, "step": 3479 }, { "epoch": 3.6174636174636174, "grad_norm": 5.5645599365234375, "learning_rate": 3.0746130746130748e-06, "loss": 0.4539, "step": 3480 }, { "epoch": 3.6185031185031185, "grad_norm": 1.2195281982421875, "learning_rate": 3.072303072303073e-06, "loss": 0.0161, "step": 3481 }, { "epoch": 3.6195426195426195, "grad_norm": 11.107911109924316, "learning_rate": 3.0699930699930703e-06, "loss": 0.2437, "step": 3482 }, { "epoch": 3.6205821205821205, "grad_norm": 4.7963056564331055, "learning_rate": 3.0676830676830677e-06, "loss": 0.1774, "step": 3483 }, { "epoch": 3.6216216216216215, "grad_norm": 3.392280101776123, 
"learning_rate": 3.065373065373066e-06, "loss": 0.0688, "step": 3484 }, { "epoch": 3.6226611226611225, "grad_norm": 3.227189540863037, "learning_rate": 3.063063063063063e-06, "loss": 0.0418, "step": 3485 }, { "epoch": 3.623700623700624, "grad_norm": 2.165048360824585, "learning_rate": 3.060753060753061e-06, "loss": 0.0669, "step": 3486 }, { "epoch": 3.624740124740125, "grad_norm": 6.590515613555908, "learning_rate": 3.0584430584430587e-06, "loss": 0.1336, "step": 3487 }, { "epoch": 3.625779625779626, "grad_norm": 0.3319731056690216, "learning_rate": 3.0561330561330565e-06, "loss": 0.0038, "step": 3488 }, { "epoch": 3.626819126819127, "grad_norm": 7.134136199951172, "learning_rate": 3.053823053823054e-06, "loss": 0.3527, "step": 3489 }, { "epoch": 3.627858627858628, "grad_norm": 0.6251439452171326, "learning_rate": 3.051513051513052e-06, "loss": 0.0108, "step": 3490 }, { "epoch": 3.628898128898129, "grad_norm": 0.04119293764233589, "learning_rate": 3.0492030492030494e-06, "loss": 0.0009, "step": 3491 }, { "epoch": 3.62993762993763, "grad_norm": 4.981802463531494, "learning_rate": 3.0468930468930467e-06, "loss": 0.1096, "step": 3492 }, { "epoch": 3.630977130977131, "grad_norm": 0.7034261226654053, "learning_rate": 3.044583044583045e-06, "loss": 0.0151, "step": 3493 }, { "epoch": 3.632016632016632, "grad_norm": 2.9406309127807617, "learning_rate": 3.0422730422730422e-06, "loss": 0.0738, "step": 3494 }, { "epoch": 3.633056133056133, "grad_norm": 0.5052170753479004, "learning_rate": 3.0399630399630404e-06, "loss": 0.0072, "step": 3495 }, { "epoch": 3.634095634095634, "grad_norm": 0.02603205479681492, "learning_rate": 3.0376530376530378e-06, "loss": 0.0008, "step": 3496 }, { "epoch": 3.635135135135135, "grad_norm": 10.114480972290039, "learning_rate": 3.0353430353430356e-06, "loss": 1.4564, "step": 3497 }, { "epoch": 3.636174636174636, "grad_norm": 2.4543769359588623, "learning_rate": 3.0330330330330333e-06, "loss": 0.049, "step": 3498 }, { "epoch": 3.637214137214137, 
"grad_norm": 0.6489257216453552, "learning_rate": 3.030723030723031e-06, "loss": 0.0133, "step": 3499 }, { "epoch": 3.638253638253638, "grad_norm": 0.2170177847146988, "learning_rate": 3.028413028413029e-06, "loss": 0.0026, "step": 3500 }, { "epoch": 3.639293139293139, "grad_norm": 9.83189868927002, "learning_rate": 3.026103026103026e-06, "loss": 0.4348, "step": 3501 }, { "epoch": 3.64033264033264, "grad_norm": 0.019325939938426018, "learning_rate": 3.023793023793024e-06, "loss": 0.0002, "step": 3502 }, { "epoch": 3.641372141372141, "grad_norm": 1.3234479427337646, "learning_rate": 3.0214830214830217e-06, "loss": 0.0501, "step": 3503 }, { "epoch": 3.642411642411642, "grad_norm": 0.08140242099761963, "learning_rate": 3.0191730191730195e-06, "loss": 0.0018, "step": 3504 }, { "epoch": 3.643451143451143, "grad_norm": 14.23491382598877, "learning_rate": 3.016863016863017e-06, "loss": 1.7548, "step": 3505 }, { "epoch": 3.6444906444906446, "grad_norm": 2.7035605907440186, "learning_rate": 3.014553014553015e-06, "loss": 0.2122, "step": 3506 }, { "epoch": 3.6455301455301456, "grad_norm": 5.186462879180908, "learning_rate": 3.0122430122430124e-06, "loss": 0.213, "step": 3507 }, { "epoch": 3.6465696465696467, "grad_norm": 0.010343966074287891, "learning_rate": 3.0099330099330106e-06, "loss": 0.0003, "step": 3508 }, { "epoch": 3.6476091476091477, "grad_norm": 5.633244514465332, "learning_rate": 3.007623007623008e-06, "loss": 0.2276, "step": 3509 }, { "epoch": 3.6486486486486487, "grad_norm": 10.875540733337402, "learning_rate": 3.0053130053130053e-06, "loss": 0.4257, "step": 3510 }, { "epoch": 3.6496881496881497, "grad_norm": 1.6572810411453247, "learning_rate": 3.0030030030030034e-06, "loss": 0.0349, "step": 3511 }, { "epoch": 3.6507276507276507, "grad_norm": 0.6229730844497681, "learning_rate": 3.0006930006930008e-06, "loss": 0.0178, "step": 3512 }, { "epoch": 3.6517671517671517, "grad_norm": 5.359668731689453, "learning_rate": 2.9983829983829986e-06, "loss": 0.1755, "step": 
3513 }, { "epoch": 3.6528066528066527, "grad_norm": 4.23331880569458, "learning_rate": 2.9960729960729963e-06, "loss": 0.3545, "step": 3514 }, { "epoch": 3.6538461538461537, "grad_norm": 0.015081733465194702, "learning_rate": 2.993762993762994e-06, "loss": 0.0003, "step": 3515 }, { "epoch": 3.6548856548856548, "grad_norm": 1.0635371208190918, "learning_rate": 2.9914529914529914e-06, "loss": 0.025, "step": 3516 }, { "epoch": 3.6559251559251558, "grad_norm": 1.5098602771759033, "learning_rate": 2.9891429891429896e-06, "loss": 0.0284, "step": 3517 }, { "epoch": 3.6569646569646572, "grad_norm": 7.799502849578857, "learning_rate": 2.986832986832987e-06, "loss": 0.282, "step": 3518 }, { "epoch": 3.6580041580041582, "grad_norm": 5.349996566772461, "learning_rate": 2.9845229845229843e-06, "loss": 0.0988, "step": 3519 }, { "epoch": 3.6590436590436592, "grad_norm": 13.843528747558594, "learning_rate": 2.9822129822129825e-06, "loss": 1.0991, "step": 3520 }, { "epoch": 3.6600831600831603, "grad_norm": 8.505522727966309, "learning_rate": 2.97990297990298e-06, "loss": 0.2774, "step": 3521 }, { "epoch": 3.6611226611226613, "grad_norm": 7.561004161834717, "learning_rate": 2.977592977592978e-06, "loss": 0.4512, "step": 3522 }, { "epoch": 3.6621621621621623, "grad_norm": 1.7828177213668823, "learning_rate": 2.9752829752829754e-06, "loss": 0.2971, "step": 3523 }, { "epoch": 3.6632016632016633, "grad_norm": 3.8652234077453613, "learning_rate": 2.9729729729729736e-06, "loss": 0.0334, "step": 3524 }, { "epoch": 3.6642411642411643, "grad_norm": 3.4428791999816895, "learning_rate": 2.970662970662971e-06, "loss": 0.0855, "step": 3525 }, { "epoch": 3.6652806652806653, "grad_norm": 7.088212013244629, "learning_rate": 2.9683529683529687e-06, "loss": 0.5074, "step": 3526 }, { "epoch": 3.6663201663201663, "grad_norm": 0.06194361299276352, "learning_rate": 2.9660429660429664e-06, "loss": 0.0014, "step": 3527 }, { "epoch": 3.6673596673596673, "grad_norm": 0.017604293301701546, "learning_rate": 
2.963732963732964e-06, "loss": 0.0004, "step": 3528 }, { "epoch": 3.6683991683991684, "grad_norm": 0.9275417923927307, "learning_rate": 2.9614229614229616e-06, "loss": 0.0126, "step": 3529 }, { "epoch": 3.6694386694386694, "grad_norm": 3.7533137798309326, "learning_rate": 2.9591129591129593e-06, "loss": 0.0467, "step": 3530 }, { "epoch": 3.6704781704781704, "grad_norm": 12.880677223205566, "learning_rate": 2.956802956802957e-06, "loss": 1.2227, "step": 3531 }, { "epoch": 3.6715176715176714, "grad_norm": 2.595752000808716, "learning_rate": 2.9544929544929544e-06, "loss": 0.0513, "step": 3532 }, { "epoch": 3.6725571725571724, "grad_norm": 2.3721625804901123, "learning_rate": 2.9521829521829526e-06, "loss": 0.0381, "step": 3533 }, { "epoch": 3.6735966735966734, "grad_norm": 3.710249423980713, "learning_rate": 2.94987294987295e-06, "loss": 0.3444, "step": 3534 }, { "epoch": 3.6746361746361744, "grad_norm": 2.203608274459839, "learning_rate": 2.947562947562948e-06, "loss": 0.0293, "step": 3535 }, { "epoch": 3.6756756756756754, "grad_norm": 4.4015374183654785, "learning_rate": 2.9452529452529455e-06, "loss": 0.2799, "step": 3536 }, { "epoch": 3.6767151767151764, "grad_norm": 3.468798875808716, "learning_rate": 2.942942942942943e-06, "loss": 0.3088, "step": 3537 }, { "epoch": 3.677754677754678, "grad_norm": 3.099963903427124, "learning_rate": 2.940632940632941e-06, "loss": 0.1014, "step": 3538 }, { "epoch": 3.678794178794179, "grad_norm": 0.2617235779762268, "learning_rate": 2.9383229383229384e-06, "loss": 0.0064, "step": 3539 }, { "epoch": 3.67983367983368, "grad_norm": 1.5205084085464478, "learning_rate": 2.936012936012936e-06, "loss": 0.2208, "step": 3540 }, { "epoch": 3.680873180873181, "grad_norm": 0.017373790964484215, "learning_rate": 2.933702933702934e-06, "loss": 0.0002, "step": 3541 }, { "epoch": 3.681912681912682, "grad_norm": null, "learning_rate": 2.9313929313929317e-06, "loss": 0.0641, "step": 3542 }, { "epoch": 3.682952182952183, "grad_norm": 
0.20491603016853333, "learning_rate": 2.929082929082929e-06, "loss": 0.0043, "step": 3543 }, { "epoch": 3.683991683991684, "grad_norm": 0.0069188824854791164, "learning_rate": 2.9267729267729272e-06, "loss": 0.0002, "step": 3544 }, { "epoch": 3.685031185031185, "grad_norm": 4.759822845458984, "learning_rate": 2.9244629244629246e-06, "loss": 0.1093, "step": 3545 }, { "epoch": 3.686070686070686, "grad_norm": 3.06587553024292, "learning_rate": 2.9221529221529223e-06, "loss": 0.0937, "step": 3546 }, { "epoch": 3.687110187110187, "grad_norm": 9.3523588180542, "learning_rate": 2.91984291984292e-06, "loss": 0.472, "step": 3547 }, { "epoch": 3.688149688149688, "grad_norm": 0.035290349274873734, "learning_rate": 2.9175329175329174e-06, "loss": 0.0011, "step": 3548 }, { "epoch": 3.689189189189189, "grad_norm": 12.437941551208496, "learning_rate": 2.9152229152229156e-06, "loss": 0.4781, "step": 3549 }, { "epoch": 3.6902286902286905, "grad_norm": 14.929848670959473, "learning_rate": 2.912912912912913e-06, "loss": 1.4515, "step": 3550 }, { "epoch": 3.6912681912681915, "grad_norm": 8.693596839904785, "learning_rate": 2.910602910602911e-06, "loss": 0.6999, "step": 3551 }, { "epoch": 3.6923076923076925, "grad_norm": 1.6712979078292847, "learning_rate": 2.9082929082929085e-06, "loss": 0.2589, "step": 3552 }, { "epoch": 3.6933471933471935, "grad_norm": 3.1328914165496826, "learning_rate": 2.9059829059829063e-06, "loss": 0.1631, "step": 3553 }, { "epoch": 3.6943866943866945, "grad_norm": 4.731451511383057, "learning_rate": 2.903672903672904e-06, "loss": 0.1442, "step": 3554 }, { "epoch": 3.6954261954261955, "grad_norm": 4.9439520835876465, "learning_rate": 2.9013629013629014e-06, "loss": 0.3415, "step": 3555 }, { "epoch": 3.6964656964656966, "grad_norm": 9.034040451049805, "learning_rate": 2.899052899052899e-06, "loss": 0.5509, "step": 3556 }, { "epoch": 3.6975051975051976, "grad_norm": 0.06901997327804565, "learning_rate": 2.896742896742897e-06, "loss": 0.0023, "step": 3557 }, { 
"epoch": 3.6985446985446986, "grad_norm": 0.17667238414287567, "learning_rate": 2.8944328944328947e-06, "loss": 0.0032, "step": 3558 }, { "epoch": 3.6995841995841996, "grad_norm": 5.443127632141113, "learning_rate": 2.892122892122892e-06, "loss": 0.5829, "step": 3559 }, { "epoch": 3.7006237006237006, "grad_norm": 8.582060813903809, "learning_rate": 2.8898128898128902e-06, "loss": 0.3604, "step": 3560 }, { "epoch": 3.7016632016632016, "grad_norm": 2.5995655059814453, "learning_rate": 2.8875028875028876e-06, "loss": 0.0486, "step": 3561 }, { "epoch": 3.7027027027027026, "grad_norm": 0.021831559017300606, "learning_rate": 2.8851928851928858e-06, "loss": 0.0006, "step": 3562 }, { "epoch": 3.7037422037422036, "grad_norm": 3.39544677734375, "learning_rate": 2.882882882882883e-06, "loss": 0.0849, "step": 3563 }, { "epoch": 3.7047817047817047, "grad_norm": 0.7509887218475342, "learning_rate": 2.8805728805728804e-06, "loss": 0.0133, "step": 3564 }, { "epoch": 3.7058212058212057, "grad_norm": 12.253210067749023, "learning_rate": 2.8782628782628786e-06, "loss": 0.7792, "step": 3565 }, { "epoch": 3.7068607068607067, "grad_norm": 14.429997444152832, "learning_rate": 2.875952875952876e-06, "loss": 0.5612, "step": 3566 }, { "epoch": 3.7079002079002077, "grad_norm": 13.459504127502441, "learning_rate": 2.873642873642874e-06, "loss": 1.1039, "step": 3567 }, { "epoch": 3.7089397089397087, "grad_norm": 2.6319940090179443, "learning_rate": 2.8713328713328715e-06, "loss": 0.0435, "step": 3568 }, { "epoch": 3.7099792099792097, "grad_norm": 3.980374336242676, "learning_rate": 2.8690228690228693e-06, "loss": 0.0248, "step": 3569 }, { "epoch": 3.711018711018711, "grad_norm": 0.9240010976791382, "learning_rate": 2.866712866712867e-06, "loss": 0.0189, "step": 3570 }, { "epoch": 3.712058212058212, "grad_norm": 1.6734356880187988, "learning_rate": 2.864402864402865e-06, "loss": 0.0638, "step": 3571 }, { "epoch": 3.713097713097713, "grad_norm": 0.5525492429733276, "learning_rate": 
2.862092862092862e-06, "loss": 0.0075, "step": 3572 }, { "epoch": 3.714137214137214, "grad_norm": 1.808472752571106, "learning_rate": 2.85978285978286e-06, "loss": 0.052, "step": 3573 }, { "epoch": 3.715176715176715, "grad_norm": 1.0177438259124756, "learning_rate": 2.8574728574728577e-06, "loss": 0.0199, "step": 3574 }, { "epoch": 3.7162162162162162, "grad_norm": 1.5117632150650024, "learning_rate": 2.855162855162855e-06, "loss": 0.0383, "step": 3575 }, { "epoch": 3.7172557172557172, "grad_norm": 0.37571460008621216, "learning_rate": 2.8528528528528532e-06, "loss": 0.0113, "step": 3576 }, { "epoch": 3.7182952182952183, "grad_norm": 0.6104511022567749, "learning_rate": 2.8505428505428506e-06, "loss": 0.0209, "step": 3577 }, { "epoch": 3.7193347193347193, "grad_norm": 0.32047128677368164, "learning_rate": 2.8482328482328488e-06, "loss": 0.0072, "step": 3578 }, { "epoch": 3.7203742203742203, "grad_norm": 11.813075065612793, "learning_rate": 2.845922845922846e-06, "loss": 0.4715, "step": 3579 }, { "epoch": 3.7214137214137213, "grad_norm": 0.27141356468200684, "learning_rate": 2.843612843612844e-06, "loss": 0.0039, "step": 3580 }, { "epoch": 3.7224532224532223, "grad_norm": 10.95467758178711, "learning_rate": 2.8413028413028416e-06, "loss": 0.6977, "step": 3581 }, { "epoch": 3.7234927234927238, "grad_norm": 0.06070644408464432, "learning_rate": 2.838992838992839e-06, "loss": 0.001, "step": 3582 }, { "epoch": 3.7245322245322248, "grad_norm": 0.3406676650047302, "learning_rate": 2.8366828366828368e-06, "loss": 0.0073, "step": 3583 }, { "epoch": 3.725571725571726, "grad_norm": 3.348909378051758, "learning_rate": 2.8343728343728345e-06, "loss": 0.063, "step": 3584 }, { "epoch": 3.726611226611227, "grad_norm": 1.7469083070755005, "learning_rate": 2.8320628320628323e-06, "loss": 0.0402, "step": 3585 }, { "epoch": 3.727650727650728, "grad_norm": 7.333977222442627, "learning_rate": 2.8297528297528296e-06, "loss": 0.1989, "step": 3586 }, { "epoch": 3.728690228690229, 
"grad_norm": 10.271039962768555, "learning_rate": 2.827442827442828e-06, "loss": 0.8281, "step": 3587 }, { "epoch": 3.72972972972973, "grad_norm": 0.02635607309639454, "learning_rate": 2.825132825132825e-06, "loss": 0.0005, "step": 3588 }, { "epoch": 3.730769230769231, "grad_norm": 8.348944664001465, "learning_rate": 2.8228228228228234e-06, "loss": 0.4226, "step": 3589 }, { "epoch": 3.731808731808732, "grad_norm": 12.303922653198242, "learning_rate": 2.8205128205128207e-06, "loss": 1.1037, "step": 3590 }, { "epoch": 3.732848232848233, "grad_norm": 6.568941593170166, "learning_rate": 2.818202818202818e-06, "loss": 0.1263, "step": 3591 }, { "epoch": 3.733887733887734, "grad_norm": 11.41336441040039, "learning_rate": 2.8158928158928162e-06, "loss": 0.4161, "step": 3592 }, { "epoch": 3.734927234927235, "grad_norm": 8.585813522338867, "learning_rate": 2.8135828135828136e-06, "loss": 0.9174, "step": 3593 }, { "epoch": 3.735966735966736, "grad_norm": 8.427745819091797, "learning_rate": 2.8112728112728118e-06, "loss": 0.44, "step": 3594 }, { "epoch": 3.737006237006237, "grad_norm": 0.2848096787929535, "learning_rate": 2.808962808962809e-06, "loss": 0.0046, "step": 3595 }, { "epoch": 3.738045738045738, "grad_norm": 0.09448783099651337, "learning_rate": 2.806652806652807e-06, "loss": 0.0008, "step": 3596 }, { "epoch": 3.739085239085239, "grad_norm": 2.5192325115203857, "learning_rate": 2.8043428043428046e-06, "loss": 0.0636, "step": 3597 }, { "epoch": 3.74012474012474, "grad_norm": 1.4547226428985596, "learning_rate": 2.8020328020328024e-06, "loss": 0.035, "step": 3598 }, { "epoch": 3.741164241164241, "grad_norm": 8.2759370803833, "learning_rate": 2.7997227997227998e-06, "loss": 0.5974, "step": 3599 }, { "epoch": 3.742203742203742, "grad_norm": 0.031137343496084213, "learning_rate": 2.7974127974127975e-06, "loss": 0.0007, "step": 3600 }, { "epoch": 3.743243243243243, "grad_norm": 0.04575344920158386, "learning_rate": 2.7951027951027953e-06, "loss": 0.0014, "step": 3601 }, { 
"epoch": 3.7442827442827444, "grad_norm": 0.6326460242271423, "learning_rate": 2.7927927927927926e-06, "loss": 0.0223, "step": 3602 }, { "epoch": 3.7453222453222454, "grad_norm": 2.685260772705078, "learning_rate": 2.790482790482791e-06, "loss": 0.1627, "step": 3603 }, { "epoch": 3.7463617463617465, "grad_norm": 13.41769027709961, "learning_rate": 2.788172788172788e-06, "loss": 1.2246, "step": 3604 }, { "epoch": 3.7474012474012475, "grad_norm": 0.7796103358268738, "learning_rate": 2.7858627858627864e-06, "loss": 0.0158, "step": 3605 }, { "epoch": 3.7484407484407485, "grad_norm": 11.729232788085938, "learning_rate": 2.7835527835527837e-06, "loss": 1.0726, "step": 3606 }, { "epoch": 3.7494802494802495, "grad_norm": 3.69287371635437, "learning_rate": 2.781242781242782e-06, "loss": 0.0925, "step": 3607 }, { "epoch": 3.7505197505197505, "grad_norm": 0.11060631275177002, "learning_rate": 2.7789327789327792e-06, "loss": 0.0026, "step": 3608 }, { "epoch": 3.7515592515592515, "grad_norm": 8.287408828735352, "learning_rate": 2.776622776622777e-06, "loss": 0.2696, "step": 3609 }, { "epoch": 3.7525987525987525, "grad_norm": 0.1551632434129715, "learning_rate": 2.7743127743127748e-06, "loss": 0.0041, "step": 3610 }, { "epoch": 3.7536382536382535, "grad_norm": 3.1916396617889404, "learning_rate": 2.772002772002772e-06, "loss": 0.0784, "step": 3611 }, { "epoch": 3.7546777546777546, "grad_norm": 1.6691946983337402, "learning_rate": 2.76969276969277e-06, "loss": 0.0507, "step": 3612 }, { "epoch": 3.7557172557172556, "grad_norm": 4.951547145843506, "learning_rate": 2.7673827673827677e-06, "loss": 0.4273, "step": 3613 }, { "epoch": 3.756756756756757, "grad_norm": 6.517636299133301, "learning_rate": 2.7650727650727654e-06, "loss": 0.3348, "step": 3614 }, { "epoch": 3.757796257796258, "grad_norm": 4.4977006912231445, "learning_rate": 2.7627627627627628e-06, "loss": 0.0738, "step": 3615 }, { "epoch": 3.758835758835759, "grad_norm": 0.9126572012901306, "learning_rate": 
2.760452760452761e-06, "loss": 0.0165, "step": 3616 }, { "epoch": 3.75987525987526, "grad_norm": 6.097523212432861, "learning_rate": 2.7581427581427583e-06, "loss": 0.1942, "step": 3617 }, { "epoch": 3.760914760914761, "grad_norm": 9.230613708496094, "learning_rate": 2.7558327558327565e-06, "loss": 0.4356, "step": 3618 }, { "epoch": 3.761954261954262, "grad_norm": 1.694145917892456, "learning_rate": 2.753522753522754e-06, "loss": 0.0581, "step": 3619 }, { "epoch": 3.762993762993763, "grad_norm": 5.806911468505859, "learning_rate": 2.751212751212751e-06, "loss": 0.195, "step": 3620 }, { "epoch": 3.764033264033264, "grad_norm": 10.217206954956055, "learning_rate": 2.7489027489027494e-06, "loss": 0.9645, "step": 3621 }, { "epoch": 3.765072765072765, "grad_norm": 2.835890531539917, "learning_rate": 2.7465927465927467e-06, "loss": 0.0674, "step": 3622 }, { "epoch": 3.766112266112266, "grad_norm": 0.19575683772563934, "learning_rate": 2.7442827442827445e-06, "loss": 0.0026, "step": 3623 }, { "epoch": 3.767151767151767, "grad_norm": 1.5427911281585693, "learning_rate": 2.7419727419727422e-06, "loss": 0.0391, "step": 3624 }, { "epoch": 3.768191268191268, "grad_norm": 9.355785369873047, "learning_rate": 2.73966273966274e-06, "loss": 0.6473, "step": 3625 }, { "epoch": 3.769230769230769, "grad_norm": 0.5041698217391968, "learning_rate": 2.7373527373527374e-06, "loss": 0.015, "step": 3626 }, { "epoch": 3.77027027027027, "grad_norm": 1.6092287302017212, "learning_rate": 2.7350427350427355e-06, "loss": 0.0286, "step": 3627 }, { "epoch": 3.771309771309771, "grad_norm": 1.0374270677566528, "learning_rate": 2.732732732732733e-06, "loss": 0.026, "step": 3628 }, { "epoch": 3.772349272349272, "grad_norm": 5.282305717468262, "learning_rate": 2.7304227304227302e-06, "loss": 0.1368, "step": 3629 }, { "epoch": 3.773388773388773, "grad_norm": 1.2266831398010254, "learning_rate": 2.7281127281127284e-06, "loss": 0.0191, "step": 3630 }, { "epoch": 3.774428274428274, "grad_norm": 
0.8408501148223877, "learning_rate": 2.7258027258027258e-06, "loss": 0.0172, "step": 3631 }, { "epoch": 3.7754677754677752, "grad_norm": 0.9763440489768982, "learning_rate": 2.723492723492724e-06, "loss": 0.0166, "step": 3632 }, { "epoch": 3.7765072765072762, "grad_norm": 6.578399181365967, "learning_rate": 2.7211827211827213e-06, "loss": 0.0925, "step": 3633 }, { "epoch": 3.7775467775467777, "grad_norm": 0.2096863090991974, "learning_rate": 2.7188727188727195e-06, "loss": 0.0038, "step": 3634 }, { "epoch": 3.7785862785862787, "grad_norm": 1.804506540298462, "learning_rate": 2.716562716562717e-06, "loss": 0.0344, "step": 3635 }, { "epoch": 3.7796257796257797, "grad_norm": 0.49040916562080383, "learning_rate": 2.7142527142527146e-06, "loss": 0.0119, "step": 3636 }, { "epoch": 3.7806652806652807, "grad_norm": 11.101481437683105, "learning_rate": 2.7119427119427124e-06, "loss": 1.0053, "step": 3637 }, { "epoch": 3.7817047817047817, "grad_norm": 0.007577314507216215, "learning_rate": 2.7096327096327097e-06, "loss": 0.0001, "step": 3638 }, { "epoch": 3.7827442827442828, "grad_norm": 2.290045976638794, "learning_rate": 2.7073227073227075e-06, "loss": 0.0554, "step": 3639 }, { "epoch": 3.7837837837837838, "grad_norm": 0.050055865198373795, "learning_rate": 2.7050127050127052e-06, "loss": 0.0008, "step": 3640 }, { "epoch": 3.784823284823285, "grad_norm": 10.9364013671875, "learning_rate": 2.702702702702703e-06, "loss": 1.6398, "step": 3641 }, { "epoch": 3.785862785862786, "grad_norm": 0.030551081523299217, "learning_rate": 2.7003927003927004e-06, "loss": 0.0008, "step": 3642 }, { "epoch": 3.786902286902287, "grad_norm": 5.652614593505859, "learning_rate": 2.6980826980826985e-06, "loss": 0.0763, "step": 3643 }, { "epoch": 3.787941787941788, "grad_norm": 0.8022517561912537, "learning_rate": 2.695772695772696e-06, "loss": 0.0202, "step": 3644 }, { "epoch": 3.788981288981289, "grad_norm": 0.5187711715698242, "learning_rate": 2.693462693462694e-06, "loss": 0.0166, "step": 3645 
}, { "epoch": 3.7900207900207903, "grad_norm": 0.14020437002182007, "learning_rate": 2.6911526911526914e-06, "loss": 0.004, "step": 3646 }, { "epoch": 3.7910602910602913, "grad_norm": 0.45986345410346985, "learning_rate": 2.6888426888426888e-06, "loss": 0.0068, "step": 3647 }, { "epoch": 3.7920997920997923, "grad_norm": 11.233826637268066, "learning_rate": 2.686532686532687e-06, "loss": 0.6995, "step": 3648 }, { "epoch": 3.7931392931392933, "grad_norm": 2.0428266525268555, "learning_rate": 2.6842226842226843e-06, "loss": 0.0425, "step": 3649 }, { "epoch": 3.7941787941787943, "grad_norm": 2.4397714138031006, "learning_rate": 2.681912681912682e-06, "loss": 0.0566, "step": 3650 }, { "epoch": 3.7952182952182953, "grad_norm": 13.182563781738281, "learning_rate": 2.67960267960268e-06, "loss": 0.7635, "step": 3651 }, { "epoch": 3.7962577962577964, "grad_norm": 0.6618658900260925, "learning_rate": 2.6772926772926776e-06, "loss": 0.0128, "step": 3652 }, { "epoch": 3.7972972972972974, "grad_norm": 1.9590808153152466, "learning_rate": 2.6749826749826754e-06, "loss": 0.0343, "step": 3653 }, { "epoch": 3.7983367983367984, "grad_norm": 6.853428363800049, "learning_rate": 2.672672672672673e-06, "loss": 0.1584, "step": 3654 }, { "epoch": 3.7993762993762994, "grad_norm": 4.4361724853515625, "learning_rate": 2.6703626703626705e-06, "loss": 0.0722, "step": 3655 }, { "epoch": 3.8004158004158004, "grad_norm": 4.45872163772583, "learning_rate": 2.6680526680526683e-06, "loss": 0.0835, "step": 3656 }, { "epoch": 3.8014553014553014, "grad_norm": 4.59832239151001, "learning_rate": 2.665742665742666e-06, "loss": 0.1583, "step": 3657 }, { "epoch": 3.8024948024948024, "grad_norm": 12.465092658996582, "learning_rate": 2.6634326634326634e-06, "loss": 1.0707, "step": 3658 }, { "epoch": 3.8035343035343034, "grad_norm": 11.42121410369873, "learning_rate": 2.6611226611226616e-06, "loss": 0.7943, "step": 3659 }, { "epoch": 3.8045738045738045, "grad_norm": 7.620397567749023, "learning_rate": 
2.658812658812659e-06, "loss": 0.2346, "step": 3660 }, { "epoch": 3.8056133056133055, "grad_norm": 0.00030691453139297664, "learning_rate": 2.656502656502657e-06, "loss": 0.0, "step": 3661 }, { "epoch": 3.8066528066528065, "grad_norm": 2.865452766418457, "learning_rate": 2.6541926541926544e-06, "loss": 0.0559, "step": 3662 }, { "epoch": 3.8076923076923075, "grad_norm": 4.418657302856445, "learning_rate": 2.651882651882652e-06, "loss": 0.1019, "step": 3663 }, { "epoch": 3.8087318087318085, "grad_norm": 4.617337703704834, "learning_rate": 2.64957264957265e-06, "loss": 0.0996, "step": 3664 }, { "epoch": 3.8097713097713095, "grad_norm": 2.217773675918579, "learning_rate": 2.6472626472626473e-06, "loss": 0.0625, "step": 3665 }, { "epoch": 3.810810810810811, "grad_norm": 4.007612228393555, "learning_rate": 2.644952644952645e-06, "loss": 0.0961, "step": 3666 }, { "epoch": 3.811850311850312, "grad_norm": 8.661126136779785, "learning_rate": 2.642642642642643e-06, "loss": 0.5064, "step": 3667 }, { "epoch": 3.812889812889813, "grad_norm": 0.30983543395996094, "learning_rate": 2.6403326403326406e-06, "loss": 0.005, "step": 3668 }, { "epoch": 3.813929313929314, "grad_norm": 0.18076816201210022, "learning_rate": 2.638022638022638e-06, "loss": 0.0038, "step": 3669 }, { "epoch": 3.814968814968815, "grad_norm": 7.025464057922363, "learning_rate": 2.635712635712636e-06, "loss": 0.1497, "step": 3670 }, { "epoch": 3.816008316008316, "grad_norm": 12.076990127563477, "learning_rate": 2.6334026334026335e-06, "loss": 1.2221, "step": 3671 }, { "epoch": 3.817047817047817, "grad_norm": 6.618215084075928, "learning_rate": 2.6310926310926317e-06, "loss": 0.2126, "step": 3672 }, { "epoch": 3.818087318087318, "grad_norm": 1.5214687585830688, "learning_rate": 2.628782628782629e-06, "loss": 0.0272, "step": 3673 }, { "epoch": 3.819126819126819, "grad_norm": 7.2195281982421875, "learning_rate": 2.6264726264726264e-06, "loss": 0.4445, "step": 3674 }, { "epoch": 3.82016632016632, "grad_norm": 
8.39342975616455, "learning_rate": 2.6241626241626246e-06, "loss": 0.2886, "step": 3675 }, { "epoch": 3.821205821205821, "grad_norm": 0.33063164353370667, "learning_rate": 2.621852621852622e-06, "loss": 0.0077, "step": 3676 }, { "epoch": 3.822245322245322, "grad_norm": 12.577540397644043, "learning_rate": 2.61954261954262e-06, "loss": 0.26, "step": 3677 }, { "epoch": 3.8232848232848236, "grad_norm": 6.113277912139893, "learning_rate": 2.6172326172326174e-06, "loss": 0.1863, "step": 3678 }, { "epoch": 3.8243243243243246, "grad_norm": 8.560226440429688, "learning_rate": 2.614922614922615e-06, "loss": 0.355, "step": 3679 }, { "epoch": 3.8253638253638256, "grad_norm": 9.159148216247559, "learning_rate": 2.612612612612613e-06, "loss": 0.2278, "step": 3680 }, { "epoch": 3.8264033264033266, "grad_norm": 4.1118574142456055, "learning_rate": 2.6103026103026107e-06, "loss": 0.1376, "step": 3681 }, { "epoch": 3.8274428274428276, "grad_norm": 0.10904556512832642, "learning_rate": 2.607992607992608e-06, "loss": 0.0023, "step": 3682 }, { "epoch": 3.8284823284823286, "grad_norm": 0.13135097920894623, "learning_rate": 2.605682605682606e-06, "loss": 0.002, "step": 3683 }, { "epoch": 3.8295218295218296, "grad_norm": 13.160359382629395, "learning_rate": 2.6033726033726036e-06, "loss": 2.0968, "step": 3684 }, { "epoch": 3.8305613305613306, "grad_norm": 4.60862922668457, "learning_rate": 2.601062601062601e-06, "loss": 0.156, "step": 3685 }, { "epoch": 3.8316008316008316, "grad_norm": 1.0691585540771484, "learning_rate": 2.598752598752599e-06, "loss": 0.0158, "step": 3686 }, { "epoch": 3.8326403326403327, "grad_norm": 0.3119029700756073, "learning_rate": 2.5964425964425965e-06, "loss": 0.0057, "step": 3687 }, { "epoch": 3.8336798336798337, "grad_norm": 0.9196264743804932, "learning_rate": 2.5941325941325947e-06, "loss": 0.0313, "step": 3688 }, { "epoch": 3.8347193347193347, "grad_norm": 3.6135544776916504, "learning_rate": 2.591822591822592e-06, "loss": 0.0938, "step": 3689 }, { 
"epoch": 3.8357588357588357, "grad_norm": 5.493073463439941, "learning_rate": 2.58951258951259e-06, "loss": 0.2199, "step": 3690 }, { "epoch": 3.8367983367983367, "grad_norm": 0.0029417388141155243, "learning_rate": 2.5872025872025876e-06, "loss": 0.0001, "step": 3691 }, { "epoch": 3.8378378378378377, "grad_norm": 0.00937197357416153, "learning_rate": 2.584892584892585e-06, "loss": 0.0002, "step": 3692 }, { "epoch": 3.8388773388773387, "grad_norm": 7.578929424285889, "learning_rate": 2.5825825825825827e-06, "loss": 0.3299, "step": 3693 }, { "epoch": 3.8399168399168397, "grad_norm": 9.87551212310791, "learning_rate": 2.5802725802725804e-06, "loss": 0.6378, "step": 3694 }, { "epoch": 3.8409563409563408, "grad_norm": 0.6146271824836731, "learning_rate": 2.577962577962578e-06, "loss": 0.0142, "step": 3695 }, { "epoch": 3.8419958419958418, "grad_norm": 1.575995683670044, "learning_rate": 2.5756525756525756e-06, "loss": 0.0283, "step": 3696 }, { "epoch": 3.8430353430353428, "grad_norm": 0.2135927826166153, "learning_rate": 2.5733425733425737e-06, "loss": 0.0048, "step": 3697 }, { "epoch": 3.8440748440748442, "grad_norm": 0.813083827495575, "learning_rate": 2.571032571032571e-06, "loss": 0.0203, "step": 3698 }, { "epoch": 3.8451143451143452, "grad_norm": 6.919641971588135, "learning_rate": 2.5687225687225693e-06, "loss": 1.2358, "step": 3699 }, { "epoch": 3.8461538461538463, "grad_norm": 7.1949028968811035, "learning_rate": 2.5664125664125666e-06, "loss": 0.1928, "step": 3700 }, { "epoch": 3.8471933471933473, "grad_norm": 10.908730506896973, "learning_rate": 2.564102564102564e-06, "loss": 1.4967, "step": 3701 }, { "epoch": 3.8482328482328483, "grad_norm": 1.6860178709030151, "learning_rate": 2.561792561792562e-06, "loss": 0.1061, "step": 3702 }, { "epoch": 3.8492723492723493, "grad_norm": 1.1886787414550781, "learning_rate": 2.5594825594825595e-06, "loss": 0.0334, "step": 3703 }, { "epoch": 3.8503118503118503, "grad_norm": 0.08556528389453888, "learning_rate": 
2.5571725571725577e-06, "loss": 0.0013, "step": 3704 }, { "epoch": 3.8513513513513513, "grad_norm": 1.3318017721176147, "learning_rate": 2.554862554862555e-06, "loss": 0.0475, "step": 3705 }, { "epoch": 3.8523908523908523, "grad_norm": 0.05544210225343704, "learning_rate": 2.552552552552553e-06, "loss": 0.0007, "step": 3706 }, { "epoch": 3.8534303534303533, "grad_norm": 0.11199303716421127, "learning_rate": 2.5502425502425506e-06, "loss": 0.0022, "step": 3707 }, { "epoch": 3.8544698544698544, "grad_norm": 4.047089099884033, "learning_rate": 2.5479325479325483e-06, "loss": 0.0905, "step": 3708 }, { "epoch": 3.8555093555093554, "grad_norm": 4.6328935623168945, "learning_rate": 2.5456225456225457e-06, "loss": 0.074, "step": 3709 }, { "epoch": 3.856548856548857, "grad_norm": 15.714149475097656, "learning_rate": 2.5433125433125434e-06, "loss": 0.0865, "step": 3710 }, { "epoch": 3.857588357588358, "grad_norm": 0.0857136994600296, "learning_rate": 2.5410025410025412e-06, "loss": 0.0016, "step": 3711 }, { "epoch": 3.858627858627859, "grad_norm": 3.9635956287384033, "learning_rate": 2.5386925386925386e-06, "loss": 0.2832, "step": 3712 }, { "epoch": 3.85966735966736, "grad_norm": 2.428004741668701, "learning_rate": 2.5363825363825367e-06, "loss": 0.0717, "step": 3713 }, { "epoch": 3.860706860706861, "grad_norm": 0.019456250593066216, "learning_rate": 2.534072534072534e-06, "loss": 0.0002, "step": 3714 }, { "epoch": 3.861746361746362, "grad_norm": 0.16324344277381897, "learning_rate": 2.5317625317625323e-06, "loss": 0.0036, "step": 3715 }, { "epoch": 3.862785862785863, "grad_norm": 8.783102035522461, "learning_rate": 2.5294525294525296e-06, "loss": 0.4462, "step": 3716 }, { "epoch": 3.863825363825364, "grad_norm": 1.616220474243164, "learning_rate": 2.527142527142528e-06, "loss": 0.0132, "step": 3717 }, { "epoch": 3.864864864864865, "grad_norm": 0.14159680902957916, "learning_rate": 2.524832524832525e-06, "loss": 0.0042, "step": 3718 }, { "epoch": 3.865904365904366, 
"grad_norm": 1.1944466829299927, "learning_rate": 2.5225225225225225e-06, "loss": 0.0295, "step": 3719 }, { "epoch": 3.866943866943867, "grad_norm": 1.1400628089904785, "learning_rate": 2.5202125202125207e-06, "loss": 0.0242, "step": 3720 }, { "epoch": 3.867983367983368, "grad_norm": 0.20073829591274261, "learning_rate": 2.517902517902518e-06, "loss": 0.0039, "step": 3721 }, { "epoch": 3.869022869022869, "grad_norm": 5.438026428222656, "learning_rate": 2.515592515592516e-06, "loss": 0.1886, "step": 3722 }, { "epoch": 3.87006237006237, "grad_norm": 0.04271695762872696, "learning_rate": 2.5132825132825136e-06, "loss": 0.0006, "step": 3723 }, { "epoch": 3.871101871101871, "grad_norm": 4.7715654373168945, "learning_rate": 2.5109725109725113e-06, "loss": 0.3594, "step": 3724 }, { "epoch": 3.872141372141372, "grad_norm": 0.6592026352882385, "learning_rate": 2.5086625086625087e-06, "loss": 0.0126, "step": 3725 }, { "epoch": 3.873180873180873, "grad_norm": 0.3664993941783905, "learning_rate": 2.506352506352507e-06, "loss": 0.0105, "step": 3726 }, { "epoch": 3.874220374220374, "grad_norm": 7.50433349609375, "learning_rate": 2.5040425040425042e-06, "loss": 0.2574, "step": 3727 }, { "epoch": 3.875259875259875, "grad_norm": 9.310280799865723, "learning_rate": 2.5017325017325016e-06, "loss": 0.286, "step": 3728 }, { "epoch": 3.876299376299376, "grad_norm": 7.536202907562256, "learning_rate": 2.4994224994224998e-06, "loss": 0.3656, "step": 3729 }, { "epoch": 3.8773388773388775, "grad_norm": 0.292306512594223, "learning_rate": 2.4971124971124975e-06, "loss": 0.0058, "step": 3730 }, { "epoch": 3.8783783783783785, "grad_norm": 1.817397952079773, "learning_rate": 2.4948024948024953e-06, "loss": 0.0895, "step": 3731 }, { "epoch": 3.8794178794178795, "grad_norm": 3.750711679458618, "learning_rate": 2.4924924924924926e-06, "loss": 0.0859, "step": 3732 }, { "epoch": 3.8804573804573805, "grad_norm": 9.821932792663574, "learning_rate": 2.4901824901824904e-06, "loss": 0.7896, "step": 3733 
}, { "epoch": 3.8814968814968815, "grad_norm": 0.03397275507450104, "learning_rate": 2.487872487872488e-06, "loss": 0.0007, "step": 3734 }, { "epoch": 3.8825363825363826, "grad_norm": 7.19288969039917, "learning_rate": 2.485562485562486e-06, "loss": 0.468, "step": 3735 }, { "epoch": 3.8835758835758836, "grad_norm": 9.314663887023926, "learning_rate": 2.4832524832524833e-06, "loss": 0.7014, "step": 3736 }, { "epoch": 3.8846153846153846, "grad_norm": 3.0756399631500244, "learning_rate": 2.480942480942481e-06, "loss": 0.0745, "step": 3737 }, { "epoch": 3.8856548856548856, "grad_norm": 2.730370283126831, "learning_rate": 2.478632478632479e-06, "loss": 0.0803, "step": 3738 }, { "epoch": 3.8866943866943866, "grad_norm": 5.826650142669678, "learning_rate": 2.4763224763224766e-06, "loss": 0.6514, "step": 3739 }, { "epoch": 3.8877338877338876, "grad_norm": 2.1858022212982178, "learning_rate": 2.4740124740124743e-06, "loss": 0.0215, "step": 3740 }, { "epoch": 3.8887733887733886, "grad_norm": 3.4572198390960693, "learning_rate": 2.4717024717024717e-06, "loss": 0.0368, "step": 3741 }, { "epoch": 3.88981288981289, "grad_norm": 0.550865650177002, "learning_rate": 2.4693924693924695e-06, "loss": 0.0119, "step": 3742 }, { "epoch": 3.890852390852391, "grad_norm": 2.629605293273926, "learning_rate": 2.4670824670824672e-06, "loss": 0.0367, "step": 3743 }, { "epoch": 3.891891891891892, "grad_norm": 12.43010139465332, "learning_rate": 2.464772464772465e-06, "loss": 1.614, "step": 3744 }, { "epoch": 3.892931392931393, "grad_norm": 0.385886013507843, "learning_rate": 2.4624624624624628e-06, "loss": 0.0111, "step": 3745 }, { "epoch": 3.893970893970894, "grad_norm": 0.2466421276330948, "learning_rate": 2.4601524601524605e-06, "loss": 0.0066, "step": 3746 }, { "epoch": 3.895010395010395, "grad_norm": 0.9171250462532043, "learning_rate": 2.4578424578424583e-06, "loss": 0.2473, "step": 3747 }, { "epoch": 3.896049896049896, "grad_norm": 0.02100229635834694, "learning_rate": 
2.4555324555324556e-06, "loss": 0.0006, "step": 3748 }, { "epoch": 3.897089397089397, "grad_norm": 0.4232587218284607, "learning_rate": 2.4532224532224534e-06, "loss": 0.0099, "step": 3749 }, { "epoch": 3.898128898128898, "grad_norm": 0.06537114083766937, "learning_rate": 2.450912450912451e-06, "loss": 0.0018, "step": 3750 }, { "epoch": 3.899168399168399, "grad_norm": 0.0012716350611299276, "learning_rate": 2.4486024486024485e-06, "loss": 0.0, "step": 3751 }, { "epoch": 3.9002079002079, "grad_norm": 8.932744026184082, "learning_rate": 2.4462924462924463e-06, "loss": 1.0465, "step": 3752 }, { "epoch": 3.901247401247401, "grad_norm": 3.2070016860961914, "learning_rate": 2.443982443982444e-06, "loss": 0.0669, "step": 3753 }, { "epoch": 3.9022869022869022, "grad_norm": 0.04936572164297104, "learning_rate": 2.441672441672442e-06, "loss": 0.0012, "step": 3754 }, { "epoch": 3.9033264033264032, "grad_norm": 0.60624098777771, "learning_rate": 2.4393624393624396e-06, "loss": 0.0073, "step": 3755 }, { "epoch": 3.9043659043659042, "grad_norm": 3.0943517684936523, "learning_rate": 2.4370524370524373e-06, "loss": 0.0836, "step": 3756 }, { "epoch": 3.9054054054054053, "grad_norm": 0.1718333512544632, "learning_rate": 2.434742434742435e-06, "loss": 0.0023, "step": 3757 }, { "epoch": 3.9064449064449063, "grad_norm": 0.06529901921749115, "learning_rate": 2.432432432432433e-06, "loss": 0.0009, "step": 3758 }, { "epoch": 3.9074844074844073, "grad_norm": 11.549318313598633, "learning_rate": 2.4301224301224302e-06, "loss": 0.5639, "step": 3759 }, { "epoch": 3.9085239085239083, "grad_norm": 0.7153767347335815, "learning_rate": 2.427812427812428e-06, "loss": 0.0206, "step": 3760 }, { "epoch": 3.9095634095634093, "grad_norm": 1.1701185703277588, "learning_rate": 2.4255024255024258e-06, "loss": 0.0226, "step": 3761 }, { "epoch": 3.9106029106029108, "grad_norm": 1.5243420600891113, "learning_rate": 2.4231924231924235e-06, "loss": 0.0257, "step": 3762 }, { "epoch": 3.9116424116424118, 
"grad_norm": 9.948197364807129, "learning_rate": 2.4208824208824213e-06, "loss": 0.527, "step": 3763 }, { "epoch": 3.912681912681913, "grad_norm": 0.8991334438323975, "learning_rate": 2.4185724185724186e-06, "loss": 0.0239, "step": 3764 }, { "epoch": 3.913721413721414, "grad_norm": 22.073930740356445, "learning_rate": 2.4162624162624164e-06, "loss": 1.0042, "step": 3765 }, { "epoch": 3.914760914760915, "grad_norm": 0.10600987076759338, "learning_rate": 2.413952413952414e-06, "loss": 0.0013, "step": 3766 }, { "epoch": 3.915800415800416, "grad_norm": 0.7799251675605774, "learning_rate": 2.411642411642412e-06, "loss": 0.0164, "step": 3767 }, { "epoch": 3.916839916839917, "grad_norm": 8.539443969726562, "learning_rate": 2.4093324093324093e-06, "loss": 0.2256, "step": 3768 }, { "epoch": 3.917879417879418, "grad_norm": 8.018176078796387, "learning_rate": 2.407022407022407e-06, "loss": 0.567, "step": 3769 }, { "epoch": 3.918918918918919, "grad_norm": 2.124363422393799, "learning_rate": 2.404712404712405e-06, "loss": 0.3356, "step": 3770 }, { "epoch": 3.91995841995842, "grad_norm": 7.096327304840088, "learning_rate": 2.4024024024024026e-06, "loss": 0.3187, "step": 3771 }, { "epoch": 3.920997920997921, "grad_norm": 8.52831745147705, "learning_rate": 2.4000924000924004e-06, "loss": 0.6401, "step": 3772 }, { "epoch": 3.922037422037422, "grad_norm": 2.970491886138916, "learning_rate": 2.397782397782398e-06, "loss": 0.0845, "step": 3773 }, { "epoch": 3.9230769230769234, "grad_norm": 0.8159802556037903, "learning_rate": 2.395472395472396e-06, "loss": 0.0412, "step": 3774 }, { "epoch": 3.9241164241164244, "grad_norm": 0.5145953893661499, "learning_rate": 2.3931623931623937e-06, "loss": 0.0138, "step": 3775 }, { "epoch": 3.9251559251559254, "grad_norm": 0.3560482859611511, "learning_rate": 2.390852390852391e-06, "loss": 0.0075, "step": 3776 }, { "epoch": 3.9261954261954264, "grad_norm": 14.716246604919434, "learning_rate": 2.3885423885423888e-06, "loss": 1.043, "step": 3777 }, { 
"epoch": 3.9272349272349274, "grad_norm": 0.0786583423614502, "learning_rate": 2.3862323862323865e-06, "loss": 0.0011, "step": 3778 }, { "epoch": 3.9282744282744284, "grad_norm": 14.384025573730469, "learning_rate": 2.383922383922384e-06, "loss": 0.8149, "step": 3779 }, { "epoch": 3.9293139293139294, "grad_norm": 1.1967740058898926, "learning_rate": 2.3816123816123816e-06, "loss": 0.0347, "step": 3780 }, { "epoch": 3.9303534303534304, "grad_norm": 6.0211381912231445, "learning_rate": 2.3793023793023794e-06, "loss": 0.1471, "step": 3781 }, { "epoch": 3.9313929313929314, "grad_norm": 6.998178482055664, "learning_rate": 2.376992376992377e-06, "loss": 0.2397, "step": 3782 }, { "epoch": 3.9324324324324325, "grad_norm": 0.5392391085624695, "learning_rate": 2.374682374682375e-06, "loss": 0.011, "step": 3783 }, { "epoch": 3.9334719334719335, "grad_norm": 0.8353741765022278, "learning_rate": 2.3723723723723727e-06, "loss": 0.0045, "step": 3784 }, { "epoch": 3.9345114345114345, "grad_norm": 0.29350486397743225, "learning_rate": 2.3700623700623705e-06, "loss": 0.005, "step": 3785 }, { "epoch": 3.9355509355509355, "grad_norm": 0.05252784863114357, "learning_rate": 2.367752367752368e-06, "loss": 0.0017, "step": 3786 }, { "epoch": 3.9365904365904365, "grad_norm": 0.7203182578086853, "learning_rate": 2.3654423654423656e-06, "loss": 0.0167, "step": 3787 }, { "epoch": 3.9376299376299375, "grad_norm": 0.04804070293903351, "learning_rate": 2.3631323631323634e-06, "loss": 0.0011, "step": 3788 }, { "epoch": 3.9386694386694385, "grad_norm": 7.088498592376709, "learning_rate": 2.360822360822361e-06, "loss": 0.6737, "step": 3789 }, { "epoch": 3.9397089397089395, "grad_norm": 0.023920787498354912, "learning_rate": 2.358512358512359e-06, "loss": 0.0006, "step": 3790 }, { "epoch": 3.9407484407484406, "grad_norm": 15.3619966506958, "learning_rate": 2.3562023562023562e-06, "loss": 0.9882, "step": 3791 }, { "epoch": 3.9417879417879416, "grad_norm": 1.5771281719207764, "learning_rate": 
2.353892353892354e-06, "loss": 0.0176, "step": 3792 }, { "epoch": 3.9428274428274426, "grad_norm": 11.848381042480469, "learning_rate": 2.3515823515823518e-06, "loss": 1.0265, "step": 3793 }, { "epoch": 3.943866943866944, "grad_norm": 11.604540824890137, "learning_rate": 2.3492723492723495e-06, "loss": 1.1948, "step": 3794 }, { "epoch": 3.944906444906445, "grad_norm": 12.787267684936523, "learning_rate": 2.346962346962347e-06, "loss": 1.0989, "step": 3795 }, { "epoch": 3.945945945945946, "grad_norm": 5.049587249755859, "learning_rate": 2.3446523446523446e-06, "loss": 0.2074, "step": 3796 }, { "epoch": 3.946985446985447, "grad_norm": 0.019799979403614998, "learning_rate": 2.3423423423423424e-06, "loss": 0.0004, "step": 3797 }, { "epoch": 3.948024948024948, "grad_norm": 0.018526798114180565, "learning_rate": 2.34003234003234e-06, "loss": 0.0003, "step": 3798 }, { "epoch": 3.949064449064449, "grad_norm": 0.3677530884742737, "learning_rate": 2.337722337722338e-06, "loss": 0.0139, "step": 3799 }, { "epoch": 3.95010395010395, "grad_norm": 0.17251178622245789, "learning_rate": 2.3354123354123357e-06, "loss": 0.0039, "step": 3800 }, { "epoch": 3.951143451143451, "grad_norm": 9.902690887451172, "learning_rate": 2.3331023331023335e-06, "loss": 0.5293, "step": 3801 }, { "epoch": 3.952182952182952, "grad_norm": 0.6778705716133118, "learning_rate": 2.3307923307923313e-06, "loss": 0.0143, "step": 3802 }, { "epoch": 3.953222453222453, "grad_norm": 0.0005741470959037542, "learning_rate": 2.3284823284823286e-06, "loss": 0.0, "step": 3803 }, { "epoch": 3.954261954261954, "grad_norm": 15.404338836669922, "learning_rate": 2.3261723261723264e-06, "loss": 0.6453, "step": 3804 }, { "epoch": 3.955301455301455, "grad_norm": 7.858543395996094, "learning_rate": 2.323862323862324e-06, "loss": 0.296, "step": 3805 }, { "epoch": 3.9563409563409566, "grad_norm": 4.857236385345459, "learning_rate": 2.321552321552322e-06, "loss": 0.1043, "step": 3806 }, { "epoch": 3.9573804573804576, "grad_norm": 
1.5526775121688843, "learning_rate": 2.3192423192423192e-06, "loss": 0.0264, "step": 3807 }, { "epoch": 3.9584199584199586, "grad_norm": 1.5784658193588257, "learning_rate": 2.316932316932317e-06, "loss": 0.0316, "step": 3808 }, { "epoch": 3.9594594594594597, "grad_norm": 2.6113502979278564, "learning_rate": 2.3146223146223148e-06, "loss": 0.1145, "step": 3809 }, { "epoch": 3.9604989604989607, "grad_norm": 3.468583822250366, "learning_rate": 2.3123123123123125e-06, "loss": 0.1322, "step": 3810 }, { "epoch": 3.9615384615384617, "grad_norm": 0.004322944208979607, "learning_rate": 2.3100023100023103e-06, "loss": 0.0001, "step": 3811 }, { "epoch": 3.9625779625779627, "grad_norm": 0.10281362384557724, "learning_rate": 2.307692307692308e-06, "loss": 0.0021, "step": 3812 }, { "epoch": 3.9636174636174637, "grad_norm": 0.28737759590148926, "learning_rate": 2.3053823053823054e-06, "loss": 0.0046, "step": 3813 }, { "epoch": 3.9646569646569647, "grad_norm": 14.735072135925293, "learning_rate": 2.303072303072303e-06, "loss": 1.09, "step": 3814 }, { "epoch": 3.9656964656964657, "grad_norm": 19.965375900268555, "learning_rate": 2.300762300762301e-06, "loss": 0.5098, "step": 3815 }, { "epoch": 3.9667359667359667, "grad_norm": 0.02133842557668686, "learning_rate": 2.2984522984522987e-06, "loss": 0.0006, "step": 3816 }, { "epoch": 3.9677754677754677, "grad_norm": 14.559213638305664, "learning_rate": 2.2961422961422965e-06, "loss": 1.8074, "step": 3817 }, { "epoch": 3.9688149688149688, "grad_norm": 0.0009462753660045564, "learning_rate": 2.2938322938322943e-06, "loss": 0.0, "step": 3818 }, { "epoch": 3.9698544698544698, "grad_norm": 1.4533599615097046, "learning_rate": 2.2915222915222916e-06, "loss": 0.0346, "step": 3819 }, { "epoch": 3.970893970893971, "grad_norm": 2.8870737552642822, "learning_rate": 2.2892122892122894e-06, "loss": 0.0598, "step": 3820 }, { "epoch": 3.971933471933472, "grad_norm": 4.132802963256836, "learning_rate": 2.286902286902287e-06, "loss": 0.3874, "step": 
3821 }, { "epoch": 3.972972972972973, "grad_norm": 8.681702613830566, "learning_rate": 2.2845922845922845e-06, "loss": 0.3768, "step": 3822 }, { "epoch": 3.974012474012474, "grad_norm": 0.6327468156814575, "learning_rate": 2.2822822822822822e-06, "loss": 0.0134, "step": 3823 }, { "epoch": 3.975051975051975, "grad_norm": 2.971172332763672, "learning_rate": 2.27997227997228e-06, "loss": 0.0713, "step": 3824 }, { "epoch": 3.976091476091476, "grad_norm": 8.309932708740234, "learning_rate": 2.2776622776622778e-06, "loss": 0.5183, "step": 3825 }, { "epoch": 3.9771309771309773, "grad_norm": 13.4599027633667, "learning_rate": 2.2753522753522755e-06, "loss": 1.2856, "step": 3826 }, { "epoch": 3.9781704781704783, "grad_norm": 11.234210014343262, "learning_rate": 2.2730422730422733e-06, "loss": 0.8115, "step": 3827 }, { "epoch": 3.9792099792099793, "grad_norm": 0.08762773871421814, "learning_rate": 2.270732270732271e-06, "loss": 0.0017, "step": 3828 }, { "epoch": 3.9802494802494803, "grad_norm": 5.529712200164795, "learning_rate": 2.268422268422269e-06, "loss": 0.509, "step": 3829 }, { "epoch": 3.9812889812889813, "grad_norm": 2.618068218231201, "learning_rate": 2.2661122661122666e-06, "loss": 0.0958, "step": 3830 }, { "epoch": 3.9823284823284824, "grad_norm": 8.357830047607422, "learning_rate": 2.263802263802264e-06, "loss": 0.2174, "step": 3831 }, { "epoch": 3.9833679833679834, "grad_norm": 4.5352277755737305, "learning_rate": 2.2614922614922617e-06, "loss": 0.1564, "step": 3832 }, { "epoch": 3.9844074844074844, "grad_norm": 0.011746260337531567, "learning_rate": 2.2591822591822595e-06, "loss": 0.0002, "step": 3833 }, { "epoch": 3.9854469854469854, "grad_norm": 0.48385775089263916, "learning_rate": 2.256872256872257e-06, "loss": 0.0127, "step": 3834 }, { "epoch": 3.9864864864864864, "grad_norm": 5.331934452056885, "learning_rate": 2.2545622545622546e-06, "loss": 0.4049, "step": 3835 }, { "epoch": 3.9875259875259874, "grad_norm": 0.004386260639876127, "learning_rate": 
2.2522522522522524e-06, "loss": 0.0001, "step": 3836 }, { "epoch": 3.9885654885654884, "grad_norm": 24.71803092956543, "learning_rate": 2.24994224994225e-06, "loss": 0.0714, "step": 3837 }, { "epoch": 3.98960498960499, "grad_norm": 8.399441719055176, "learning_rate": 2.247632247632248e-06, "loss": 0.4843, "step": 3838 }, { "epoch": 3.990644490644491, "grad_norm": 6.994706153869629, "learning_rate": 2.2453222453222457e-06, "loss": 0.591, "step": 3839 }, { "epoch": 3.991683991683992, "grad_norm": 0.43649402260780334, "learning_rate": 2.243012243012243e-06, "loss": 0.0052, "step": 3840 }, { "epoch": 3.992723492723493, "grad_norm": 1.914015769958496, "learning_rate": 2.2407022407022408e-06, "loss": 0.074, "step": 3841 }, { "epoch": 3.993762993762994, "grad_norm": 0.054117415100336075, "learning_rate": 2.2383922383922386e-06, "loss": 0.001, "step": 3842 }, { "epoch": 3.994802494802495, "grad_norm": 3.9509143829345703, "learning_rate": 2.2360822360822363e-06, "loss": 0.0723, "step": 3843 }, { "epoch": 3.995841995841996, "grad_norm": 12.29439640045166, "learning_rate": 2.233772233772234e-06, "loss": 0.9938, "step": 3844 }, { "epoch": 3.996881496881497, "grad_norm": 0.023738661780953407, "learning_rate": 2.231462231462232e-06, "loss": 0.0005, "step": 3845 }, { "epoch": 3.997920997920998, "grad_norm": 8.44709587097168, "learning_rate": 2.229152229152229e-06, "loss": 0.3122, "step": 3846 }, { "epoch": 3.998960498960499, "grad_norm": 11.878024101257324, "learning_rate": 2.226842226842227e-06, "loss": 0.8965, "step": 3847 }, { "epoch": 4.0, "grad_norm": 0.6288987398147583, "learning_rate": 2.2245322245322247e-06, "loss": 0.0147, "step": 3848 }, { "epoch": 4.001039501039501, "grad_norm": 4.34573221206665, "learning_rate": 2.222222222222222e-06, "loss": 0.299, "step": 3849 }, { "epoch": 4.002079002079002, "grad_norm": 0.3988482654094696, "learning_rate": 2.21991221991222e-06, "loss": 0.0088, "step": 3850 }, { "epoch": 4.003118503118503, "grad_norm": 5.311618328094482, 
"learning_rate": 2.2176022176022176e-06, "loss": 0.1288, "step": 3851 }, { "epoch": 4.004158004158004, "grad_norm": 0.03252868354320526, "learning_rate": 2.2152922152922154e-06, "loss": 0.0008, "step": 3852 }, { "epoch": 4.005197505197505, "grad_norm": 1.65116286277771, "learning_rate": 2.212982212982213e-06, "loss": 0.0598, "step": 3853 }, { "epoch": 4.006237006237006, "grad_norm": 14.985928535461426, "learning_rate": 2.210672210672211e-06, "loss": 0.6218, "step": 3854 }, { "epoch": 4.007276507276507, "grad_norm": 9.496588706970215, "learning_rate": 2.2083622083622087e-06, "loss": 0.7732, "step": 3855 }, { "epoch": 4.008316008316008, "grad_norm": 7.977664947509766, "learning_rate": 2.2060522060522064e-06, "loss": 0.7576, "step": 3856 }, { "epoch": 4.009355509355509, "grad_norm": 6.168428421020508, "learning_rate": 2.2037422037422042e-06, "loss": 0.3512, "step": 3857 }, { "epoch": 4.01039501039501, "grad_norm": 7.742276668548584, "learning_rate": 2.2014322014322016e-06, "loss": 0.1884, "step": 3858 }, { "epoch": 4.011434511434511, "grad_norm": 0.12430943548679352, "learning_rate": 2.1991221991221993e-06, "loss": 0.0033, "step": 3859 }, { "epoch": 4.012474012474012, "grad_norm": 3.1924452781677246, "learning_rate": 2.196812196812197e-06, "loss": 0.0411, "step": 3860 }, { "epoch": 4.013513513513513, "grad_norm": 2.041194200515747, "learning_rate": 2.194502194502195e-06, "loss": 0.0927, "step": 3861 }, { "epoch": 4.014553014553014, "grad_norm": 0.478383332490921, "learning_rate": 2.192192192192192e-06, "loss": 0.0099, "step": 3862 }, { "epoch": 4.015592515592515, "grad_norm": 8.808903694152832, "learning_rate": 2.18988218988219e-06, "loss": 0.411, "step": 3863 }, { "epoch": 4.016632016632016, "grad_norm": 4.590777397155762, "learning_rate": 2.1875721875721877e-06, "loss": 0.0937, "step": 3864 }, { "epoch": 4.017671517671518, "grad_norm": 0.27158844470977783, "learning_rate": 2.1852621852621855e-06, "loss": 0.0058, "step": 3865 }, { "epoch": 4.018711018711019, 
"grad_norm": 6.849306106567383, "learning_rate": 2.1829521829521833e-06, "loss": 0.4817, "step": 3866 }, { "epoch": 4.01975051975052, "grad_norm": 2.9405770301818848, "learning_rate": 2.1806421806421806e-06, "loss": 0.0536, "step": 3867 }, { "epoch": 4.020790020790021, "grad_norm": 3.641545057296753, "learning_rate": 2.1783321783321784e-06, "loss": 0.1562, "step": 3868 }, { "epoch": 4.021829521829522, "grad_norm": 1.2189357280731201, "learning_rate": 2.176022176022176e-06, "loss": 0.029, "step": 3869 }, { "epoch": 4.022869022869023, "grad_norm": 0.2723861038684845, "learning_rate": 2.173712173712174e-06, "loss": 0.0076, "step": 3870 }, { "epoch": 4.023908523908524, "grad_norm": 3.9028191566467285, "learning_rate": 2.1714021714021717e-06, "loss": 0.1592, "step": 3871 }, { "epoch": 4.024948024948025, "grad_norm": 5.267368316650391, "learning_rate": 2.1690921690921694e-06, "loss": 0.1642, "step": 3872 }, { "epoch": 4.025987525987526, "grad_norm": 2.9735355377197266, "learning_rate": 2.1667821667821672e-06, "loss": 0.061, "step": 3873 }, { "epoch": 4.027027027027027, "grad_norm": 1.855345368385315, "learning_rate": 2.1644721644721646e-06, "loss": 0.0525, "step": 3874 }, { "epoch": 4.028066528066528, "grad_norm": 8.544869422912598, "learning_rate": 2.1621621621621623e-06, "loss": 0.6427, "step": 3875 }, { "epoch": 4.029106029106029, "grad_norm": 9.181122779846191, "learning_rate": 2.15985215985216e-06, "loss": 0.5065, "step": 3876 }, { "epoch": 4.03014553014553, "grad_norm": 11.528375625610352, "learning_rate": 2.1575421575421574e-06, "loss": 0.3951, "step": 3877 }, { "epoch": 4.031185031185031, "grad_norm": 0.14364181458950043, "learning_rate": 2.155232155232155e-06, "loss": 0.0039, "step": 3878 }, { "epoch": 4.032224532224532, "grad_norm": 10.91196060180664, "learning_rate": 2.152922152922153e-06, "loss": 0.5188, "step": 3879 }, { "epoch": 4.033264033264033, "grad_norm": 3.6201767921447754, "learning_rate": 2.1506121506121507e-06, "loss": 0.3075, "step": 3880 }, { 
"epoch": 4.034303534303534, "grad_norm": 1.5775871276855469, "learning_rate": 2.1483021483021485e-06, "loss": 0.0861, "step": 3881 }, { "epoch": 4.035343035343035, "grad_norm": 0.17233553528785706, "learning_rate": 2.1459921459921463e-06, "loss": 0.0046, "step": 3882 }, { "epoch": 4.036382536382536, "grad_norm": 1.6797047853469849, "learning_rate": 2.143682143682144e-06, "loss": 0.0766, "step": 3883 }, { "epoch": 4.037422037422037, "grad_norm": 0.03757861256599426, "learning_rate": 2.141372141372142e-06, "loss": 0.0012, "step": 3884 }, { "epoch": 4.038461538461538, "grad_norm": 6.65224552154541, "learning_rate": 2.139062139062139e-06, "loss": 0.3537, "step": 3885 }, { "epoch": 4.039501039501039, "grad_norm": 2.8057408332824707, "learning_rate": 2.136752136752137e-06, "loss": 0.1392, "step": 3886 }, { "epoch": 4.04054054054054, "grad_norm": 13.577767372131348, "learning_rate": 2.1344421344421347e-06, "loss": 0.825, "step": 3887 }, { "epoch": 4.041580041580041, "grad_norm": 21.758955001831055, "learning_rate": 2.1321321321321325e-06, "loss": 1.6505, "step": 3888 }, { "epoch": 4.042619542619542, "grad_norm": 0.7290350198745728, "learning_rate": 2.12982212982213e-06, "loss": 0.0148, "step": 3889 }, { "epoch": 4.043659043659043, "grad_norm": 4.572800159454346, "learning_rate": 2.1275121275121276e-06, "loss": 0.3039, "step": 3890 }, { "epoch": 4.044698544698544, "grad_norm": 1.6838501691818237, "learning_rate": 2.1252021252021253e-06, "loss": 0.2458, "step": 3891 }, { "epoch": 4.045738045738045, "grad_norm": 0.09158274531364441, "learning_rate": 2.122892122892123e-06, "loss": 0.0022, "step": 3892 }, { "epoch": 4.046777546777546, "grad_norm": 1.051876425743103, "learning_rate": 2.120582120582121e-06, "loss": 0.0301, "step": 3893 }, { "epoch": 4.047817047817047, "grad_norm": 7.565310955047607, "learning_rate": 2.118272118272118e-06, "loss": 0.5889, "step": 3894 }, { "epoch": 4.048856548856548, "grad_norm": 0.4519527852535248, "learning_rate": 2.115962115962116e-06, "loss": 
0.0051, "step": 3895 }, { "epoch": 4.04989604989605, "grad_norm": 11.864056587219238, "learning_rate": 2.1136521136521137e-06, "loss": 0.6653, "step": 3896 }, { "epoch": 4.050935550935551, "grad_norm": 3.6035194396972656, "learning_rate": 2.1113421113421115e-06, "loss": 0.0504, "step": 3897 }, { "epoch": 4.051975051975052, "grad_norm": 2.7916107177734375, "learning_rate": 2.1090321090321093e-06, "loss": 0.0709, "step": 3898 }, { "epoch": 4.053014553014553, "grad_norm": 0.2317170947790146, "learning_rate": 2.106722106722107e-06, "loss": 0.0072, "step": 3899 }, { "epoch": 4.054054054054054, "grad_norm": 3.6809520721435547, "learning_rate": 2.104412104412105e-06, "loss": 0.0592, "step": 3900 }, { "epoch": 4.055093555093555, "grad_norm": 12.056526184082031, "learning_rate": 2.102102102102102e-06, "loss": 1.1791, "step": 3901 }, { "epoch": 4.056133056133056, "grad_norm": 0.6267997622489929, "learning_rate": 2.0997920997921e-06, "loss": 0.026, "step": 3902 }, { "epoch": 4.057172557172557, "grad_norm": 7.745768070220947, "learning_rate": 2.0974820974820977e-06, "loss": 0.1447, "step": 3903 }, { "epoch": 4.058212058212058, "grad_norm": 0.046626631170511246, "learning_rate": 2.095172095172095e-06, "loss": 0.0009, "step": 3904 }, { "epoch": 4.0592515592515594, "grad_norm": 0.03431563451886177, "learning_rate": 2.092862092862093e-06, "loss": 0.001, "step": 3905 }, { "epoch": 4.0602910602910605, "grad_norm": 0.21163296699523926, "learning_rate": 2.0905520905520906e-06, "loss": 0.0064, "step": 3906 }, { "epoch": 4.0613305613305615, "grad_norm": 2.4047062397003174, "learning_rate": 2.0882420882420883e-06, "loss": 0.0353, "step": 3907 }, { "epoch": 4.0623700623700625, "grad_norm": 7.710663318634033, "learning_rate": 2.085932085932086e-06, "loss": 0.5675, "step": 3908 }, { "epoch": 4.0634095634095635, "grad_norm": 3.3957293033599854, "learning_rate": 2.083622083622084e-06, "loss": 0.1848, "step": 3909 }, { "epoch": 4.0644490644490645, "grad_norm": 8.47187328338623, 
"learning_rate": 2.0813120813120816e-06, "loss": 0.1675, "step": 3910 }, { "epoch": 4.0654885654885655, "grad_norm": 0.38779881596565247, "learning_rate": 2.0790020790020794e-06, "loss": 0.0084, "step": 3911 }, { "epoch": 4.0665280665280665, "grad_norm": 11.849868774414062, "learning_rate": 2.0766920766920767e-06, "loss": 1.1938, "step": 3912 }, { "epoch": 4.0675675675675675, "grad_norm": 7.814253330230713, "learning_rate": 2.0743820743820745e-06, "loss": 0.3056, "step": 3913 }, { "epoch": 4.0686070686070686, "grad_norm": 9.556112289428711, "learning_rate": 2.0720720720720723e-06, "loss": 0.4659, "step": 3914 }, { "epoch": 4.06964656964657, "grad_norm": 0.10459741204977036, "learning_rate": 2.06976206976207e-06, "loss": 0.0019, "step": 3915 }, { "epoch": 4.070686070686071, "grad_norm": 2.1530704498291016, "learning_rate": 2.067452067452068e-06, "loss": 0.0306, "step": 3916 }, { "epoch": 4.071725571725572, "grad_norm": 9.012895584106445, "learning_rate": 2.065142065142065e-06, "loss": 0.4619, "step": 3917 }, { "epoch": 4.072765072765073, "grad_norm": 10.304547309875488, "learning_rate": 2.062832062832063e-06, "loss": 0.4175, "step": 3918 }, { "epoch": 4.073804573804574, "grad_norm": 8.123601913452148, "learning_rate": 2.0605220605220607e-06, "loss": 0.4888, "step": 3919 }, { "epoch": 4.074844074844075, "grad_norm": 0.0011895884526893497, "learning_rate": 2.0582120582120585e-06, "loss": 0.0, "step": 3920 }, { "epoch": 4.075883575883576, "grad_norm": 1.604593276977539, "learning_rate": 2.055902055902056e-06, "loss": 0.0495, "step": 3921 }, { "epoch": 4.076923076923077, "grad_norm": 3.2594895362854004, "learning_rate": 2.0535920535920536e-06, "loss": 0.0281, "step": 3922 }, { "epoch": 4.077962577962578, "grad_norm": 0.15859843790531158, "learning_rate": 2.0512820512820513e-06, "loss": 0.004, "step": 3923 }, { "epoch": 4.079002079002079, "grad_norm": 0.02080628275871277, "learning_rate": 2.048972048972049e-06, "loss": 0.0004, "step": 3924 }, { "epoch": 4.08004158004158, 
"grad_norm": 0.42855557799339294, "learning_rate": 2.046662046662047e-06, "loss": 0.0045, "step": 3925 }, { "epoch": 4.081081081081081, "grad_norm": 4.763267517089844, "learning_rate": 2.0443520443520446e-06, "loss": 0.252, "step": 3926 }, { "epoch": 4.082120582120582, "grad_norm": 7.961658477783203, "learning_rate": 2.0420420420420424e-06, "loss": 0.4169, "step": 3927 }, { "epoch": 4.083160083160083, "grad_norm": 12.200848579406738, "learning_rate": 2.03973203973204e-06, "loss": 0.5576, "step": 3928 }, { "epoch": 4.084199584199585, "grad_norm": 0.2671249210834503, "learning_rate": 2.0374220374220375e-06, "loss": 0.0063, "step": 3929 }, { "epoch": 4.085239085239086, "grad_norm": 2.368168592453003, "learning_rate": 2.0351120351120353e-06, "loss": 0.0575, "step": 3930 }, { "epoch": 4.086278586278587, "grad_norm": 10.609644889831543, "learning_rate": 2.032802032802033e-06, "loss": 0.4751, "step": 3931 }, { "epoch": 4.087318087318088, "grad_norm": 7.265843391418457, "learning_rate": 2.0304920304920304e-06, "loss": 0.4997, "step": 3932 }, { "epoch": 4.088357588357589, "grad_norm": 10.27369213104248, "learning_rate": 2.028182028182028e-06, "loss": 0.3908, "step": 3933 }, { "epoch": 4.08939708939709, "grad_norm": 2.5010275840759277, "learning_rate": 2.025872025872026e-06, "loss": 0.2932, "step": 3934 }, { "epoch": 4.090436590436591, "grad_norm": 1.6605794429779053, "learning_rate": 2.0235620235620237e-06, "loss": 0.0376, "step": 3935 }, { "epoch": 4.091476091476092, "grad_norm": 4.947322845458984, "learning_rate": 2.0212520212520215e-06, "loss": 0.1477, "step": 3936 }, { "epoch": 4.092515592515593, "grad_norm": 0.015340354293584824, "learning_rate": 2.0189420189420192e-06, "loss": 0.0003, "step": 3937 }, { "epoch": 4.093555093555094, "grad_norm": 12.941651344299316, "learning_rate": 2.016632016632017e-06, "loss": 0.6778, "step": 3938 }, { "epoch": 4.094594594594595, "grad_norm": 9.275856971740723, "learning_rate": 2.0143220143220143e-06, "loss": 0.741, "step": 3939 }, { 
"epoch": 4.095634095634096, "grad_norm": 2.234531879425049, "learning_rate": 2.012012012012012e-06, "loss": 0.3126, "step": 3940 }, { "epoch": 4.096673596673597, "grad_norm": 4.1659369468688965, "learning_rate": 2.00970200970201e-06, "loss": 0.1181, "step": 3941 }, { "epoch": 4.097713097713098, "grad_norm": 10.8466215133667, "learning_rate": 2.0073920073920076e-06, "loss": 0.5613, "step": 3942 }, { "epoch": 4.098752598752599, "grad_norm": 1.559237003326416, "learning_rate": 2.0050820050820054e-06, "loss": 0.0277, "step": 3943 }, { "epoch": 4.0997920997921, "grad_norm": 7.026435375213623, "learning_rate": 2.0027720027720028e-06, "loss": 0.2141, "step": 3944 }, { "epoch": 4.100831600831601, "grad_norm": 6.664749622344971, "learning_rate": 2.0004620004620005e-06, "loss": 0.2036, "step": 3945 }, { "epoch": 4.101871101871102, "grad_norm": 2.792083501815796, "learning_rate": 1.9981519981519983e-06, "loss": 0.0606, "step": 3946 }, { "epoch": 4.102910602910603, "grad_norm": 6.02000093460083, "learning_rate": 1.995841995841996e-06, "loss": 0.1256, "step": 3947 }, { "epoch": 4.103950103950104, "grad_norm": 4.715527057647705, "learning_rate": 1.9935319935319934e-06, "loss": 0.1299, "step": 3948 }, { "epoch": 4.104989604989605, "grad_norm": 0.0066122873686254025, "learning_rate": 1.991221991221991e-06, "loss": 0.0001, "step": 3949 }, { "epoch": 4.106029106029106, "grad_norm": 3.474184274673462, "learning_rate": 1.988911988911989e-06, "loss": 0.2328, "step": 3950 }, { "epoch": 4.107068607068607, "grad_norm": 4.632596015930176, "learning_rate": 1.9866019866019867e-06, "loss": 0.1278, "step": 3951 }, { "epoch": 4.108108108108108, "grad_norm": 3.1452908515930176, "learning_rate": 1.9842919842919845e-06, "loss": 0.061, "step": 3952 }, { "epoch": 4.109147609147609, "grad_norm": 0.2267417013645172, "learning_rate": 1.9819819819819822e-06, "loss": 0.0059, "step": 3953 }, { "epoch": 4.11018711018711, "grad_norm": 1.7719359397888184, "learning_rate": 1.97967197967198e-06, "loss": 0.07, 
"step": 3954 }, { "epoch": 4.111226611226611, "grad_norm": 1.1216524839401245, "learning_rate": 1.9773619773619778e-06, "loss": 0.0213, "step": 3955 }, { "epoch": 4.112266112266112, "grad_norm": 5.114573001861572, "learning_rate": 1.975051975051975e-06, "loss": 0.1357, "step": 3956 }, { "epoch": 4.113305613305613, "grad_norm": 9.502123832702637, "learning_rate": 1.972741972741973e-06, "loss": 0.7736, "step": 3957 }, { "epoch": 4.114345114345114, "grad_norm": 6.611077308654785, "learning_rate": 1.9704319704319707e-06, "loss": 0.1888, "step": 3958 }, { "epoch": 4.115384615384615, "grad_norm": 3.405991792678833, "learning_rate": 1.968121968121968e-06, "loss": 0.0855, "step": 3959 }, { "epoch": 4.116424116424117, "grad_norm": 0.3084595501422882, "learning_rate": 1.9658119658119658e-06, "loss": 0.0088, "step": 3960 }, { "epoch": 4.117463617463618, "grad_norm": 0.11557632684707642, "learning_rate": 1.9635019635019635e-06, "loss": 0.0023, "step": 3961 }, { "epoch": 4.118503118503119, "grad_norm": 0.4615918695926666, "learning_rate": 1.9611919611919613e-06, "loss": 0.0136, "step": 3962 }, { "epoch": 4.11954261954262, "grad_norm": 8.216227531433105, "learning_rate": 1.958881958881959e-06, "loss": 0.5017, "step": 3963 }, { "epoch": 4.120582120582121, "grad_norm": 1.2277436256408691, "learning_rate": 1.956571956571957e-06, "loss": 0.0259, "step": 3964 }, { "epoch": 4.121621621621622, "grad_norm": 11.282058715820312, "learning_rate": 1.9542619542619546e-06, "loss": 0.7121, "step": 3965 }, { "epoch": 4.122661122661123, "grad_norm": 0.6728746294975281, "learning_rate": 1.951951951951952e-06, "loss": 0.02, "step": 3966 }, { "epoch": 4.123700623700624, "grad_norm": 0.007000439800322056, "learning_rate": 1.9496419496419497e-06, "loss": 0.0002, "step": 3967 }, { "epoch": 4.124740124740125, "grad_norm": 8.943163871765137, "learning_rate": 1.9473319473319475e-06, "loss": 0.5562, "step": 3968 }, { "epoch": 4.125779625779626, "grad_norm": 11.286050796508789, "learning_rate": 
1.9450219450219452e-06, "loss": 0.7542, "step": 3969 }, { "epoch": 4.126819126819127, "grad_norm": 0.1594606339931488, "learning_rate": 1.942711942711943e-06, "loss": 0.0057, "step": 3970 }, { "epoch": 4.127858627858628, "grad_norm": 3.0805113315582275, "learning_rate": 1.9404019404019408e-06, "loss": 0.1091, "step": 3971 }, { "epoch": 4.128898128898129, "grad_norm": 0.0031344450544565916, "learning_rate": 1.938091938091938e-06, "loss": 0.0001, "step": 3972 }, { "epoch": 4.12993762993763, "grad_norm": 5.24778938293457, "learning_rate": 1.935781935781936e-06, "loss": 0.1809, "step": 3973 }, { "epoch": 4.130977130977131, "grad_norm": 0.20770363509655, "learning_rate": 1.9334719334719337e-06, "loss": 0.0038, "step": 3974 }, { "epoch": 4.132016632016632, "grad_norm": 0.0760016217827797, "learning_rate": 1.9311619311619314e-06, "loss": 0.0012, "step": 3975 }, { "epoch": 4.133056133056133, "grad_norm": 1.8212653398513794, "learning_rate": 1.9288519288519288e-06, "loss": 0.0574, "step": 3976 }, { "epoch": 4.134095634095634, "grad_norm": 3.7411952018737793, "learning_rate": 1.9265419265419265e-06, "loss": 0.3133, "step": 3977 }, { "epoch": 4.135135135135135, "grad_norm": 8.470130920410156, "learning_rate": 1.9242319242319243e-06, "loss": 0.1549, "step": 3978 }, { "epoch": 4.136174636174636, "grad_norm": 13.471936225891113, "learning_rate": 1.921921921921922e-06, "loss": 1.9759, "step": 3979 }, { "epoch": 4.137214137214137, "grad_norm": 10.0439453125, "learning_rate": 1.91961191961192e-06, "loss": 0.5779, "step": 3980 }, { "epoch": 4.138253638253638, "grad_norm": 0.3061752915382385, "learning_rate": 1.9173019173019176e-06, "loss": 0.008, "step": 3981 }, { "epoch": 4.139293139293139, "grad_norm": 0.16787175834178925, "learning_rate": 1.9149919149919154e-06, "loss": 0.0025, "step": 3982 }, { "epoch": 4.14033264033264, "grad_norm": 0.1021217480301857, "learning_rate": 1.912681912681913e-06, "loss": 0.0022, "step": 3983 }, { "epoch": 4.141372141372141, "grad_norm": 
1.6589683294296265, "learning_rate": 1.9103719103719105e-06, "loss": 0.046, "step": 3984 }, { "epoch": 4.142411642411642, "grad_norm": 4.064008712768555, "learning_rate": 1.9080619080619082e-06, "loss": 0.0336, "step": 3985 }, { "epoch": 4.143451143451143, "grad_norm": 0.7545316815376282, "learning_rate": 1.9057519057519058e-06, "loss": 0.0158, "step": 3986 }, { "epoch": 4.144490644490644, "grad_norm": 1.8117564916610718, "learning_rate": 1.9034419034419036e-06, "loss": 0.4961, "step": 3987 }, { "epoch": 4.145530145530145, "grad_norm": 0.3270220160484314, "learning_rate": 1.9011319011319013e-06, "loss": 0.0122, "step": 3988 }, { "epoch": 4.146569646569646, "grad_norm": 5.634047031402588, "learning_rate": 1.8988218988218989e-06, "loss": 0.3619, "step": 3989 }, { "epoch": 4.147609147609147, "grad_norm": 0.004145979881286621, "learning_rate": 1.8965118965118967e-06, "loss": 0.0001, "step": 3990 }, { "epoch": 4.148648648648648, "grad_norm": 2.454077959060669, "learning_rate": 1.8942018942018944e-06, "loss": 0.0743, "step": 3991 }, { "epoch": 4.149688149688149, "grad_norm": 2.424099922180176, "learning_rate": 1.8918918918918922e-06, "loss": 0.0716, "step": 3992 }, { "epoch": 4.150727650727651, "grad_norm": 0.14816658198833466, "learning_rate": 1.88958188958189e-06, "loss": 0.0032, "step": 3993 }, { "epoch": 4.151767151767152, "grad_norm": 9.936027526855469, "learning_rate": 1.8872718872718873e-06, "loss": 0.9768, "step": 3994 }, { "epoch": 4.152806652806653, "grad_norm": 7.472232818603516, "learning_rate": 1.884961884961885e-06, "loss": 0.2945, "step": 3995 }, { "epoch": 4.153846153846154, "grad_norm": 5.426685810089111, "learning_rate": 1.8826518826518828e-06, "loss": 0.2832, "step": 3996 }, { "epoch": 4.154885654885655, "grad_norm": 2.724226951599121, "learning_rate": 1.8803418803418804e-06, "loss": 0.0439, "step": 3997 }, { "epoch": 4.155925155925156, "grad_norm": 1.9673157930374146, "learning_rate": 1.8780318780318782e-06, "loss": 0.0547, "step": 3998 }, { "epoch": 
4.156964656964657, "grad_norm": 7.403589725494385, "learning_rate": 1.875721875721876e-06, "loss": 0.4418, "step": 3999 }, { "epoch": 4.158004158004158, "grad_norm": 7.950428009033203, "learning_rate": 1.8734118734118737e-06, "loss": 0.4021, "step": 4000 }, { "epoch": 4.159043659043659, "grad_norm": 9.602352142333984, "learning_rate": 1.8711018711018713e-06, "loss": 0.8566, "step": 4001 }, { "epoch": 4.16008316008316, "grad_norm": 1.825050711631775, "learning_rate": 1.868791868791869e-06, "loss": 0.026, "step": 4002 }, { "epoch": 4.161122661122661, "grad_norm": 7.144789218902588, "learning_rate": 1.8664818664818666e-06, "loss": 0.1889, "step": 4003 }, { "epoch": 4.162162162162162, "grad_norm": 2.014277458190918, "learning_rate": 1.8641718641718643e-06, "loss": 0.041, "step": 4004 }, { "epoch": 4.163201663201663, "grad_norm": 0.017938906326889992, "learning_rate": 1.861861861861862e-06, "loss": 0.0004, "step": 4005 }, { "epoch": 4.164241164241164, "grad_norm": 5.936906814575195, "learning_rate": 1.8595518595518597e-06, "loss": 0.1381, "step": 4006 }, { "epoch": 4.165280665280665, "grad_norm": 5.884034633636475, "learning_rate": 1.8572418572418574e-06, "loss": 0.4302, "step": 4007 }, { "epoch": 4.166320166320166, "grad_norm": 0.4456760883331299, "learning_rate": 1.8549318549318552e-06, "loss": 0.0147, "step": 4008 }, { "epoch": 4.167359667359667, "grad_norm": 4.608760356903076, "learning_rate": 1.8526218526218528e-06, "loss": 0.1292, "step": 4009 }, { "epoch": 4.168399168399168, "grad_norm": 8.948893547058105, "learning_rate": 1.8503118503118505e-06, "loss": 0.9863, "step": 4010 }, { "epoch": 4.169438669438669, "grad_norm": 0.216671884059906, "learning_rate": 1.8480018480018483e-06, "loss": 0.0044, "step": 4011 }, { "epoch": 4.17047817047817, "grad_norm": 4.625119209289551, "learning_rate": 1.8456918456918456e-06, "loss": 0.2856, "step": 4012 }, { "epoch": 4.171517671517671, "grad_norm": 0.2241864949464798, "learning_rate": 1.8433818433818434e-06, "loss": 0.0061, 
"step": 4013 }, { "epoch": 4.172557172557172, "grad_norm": 4.816661357879639, "learning_rate": 1.8410718410718412e-06, "loss": 0.5286, "step": 4014 }, { "epoch": 4.173596673596673, "grad_norm": 6.056797027587891, "learning_rate": 1.838761838761839e-06, "loss": 0.2242, "step": 4015 }, { "epoch": 4.174636174636174, "grad_norm": 1.5220786333084106, "learning_rate": 1.8364518364518367e-06, "loss": 0.0316, "step": 4016 }, { "epoch": 4.175675675675675, "grad_norm": 9.600642204284668, "learning_rate": 1.8341418341418343e-06, "loss": 0.6818, "step": 4017 }, { "epoch": 4.1767151767151764, "grad_norm": 0.1177779883146286, "learning_rate": 1.831831831831832e-06, "loss": 0.0037, "step": 4018 }, { "epoch": 4.1777546777546775, "grad_norm": 4.937497615814209, "learning_rate": 1.8295218295218298e-06, "loss": 0.3238, "step": 4019 }, { "epoch": 4.1787941787941785, "grad_norm": 5.371255874633789, "learning_rate": 1.8272118272118276e-06, "loss": 0.23, "step": 4020 }, { "epoch": 4.1798336798336795, "grad_norm": 1.4247781038284302, "learning_rate": 1.824901824901825e-06, "loss": 0.0328, "step": 4021 }, { "epoch": 4.1808731808731805, "grad_norm": 0.15081067383289337, "learning_rate": 1.8225918225918227e-06, "loss": 0.0032, "step": 4022 }, { "epoch": 4.1819126819126815, "grad_norm": 2.7095346450805664, "learning_rate": 1.8202818202818204e-06, "loss": 0.1122, "step": 4023 }, { "epoch": 4.182952182952183, "grad_norm": 1.09393310546875, "learning_rate": 1.817971817971818e-06, "loss": 0.0206, "step": 4024 }, { "epoch": 4.183991683991684, "grad_norm": 10.313549995422363, "learning_rate": 1.8156618156618158e-06, "loss": 0.8821, "step": 4025 }, { "epoch": 4.185031185031185, "grad_norm": 6.645352840423584, "learning_rate": 1.8133518133518135e-06, "loss": 0.4741, "step": 4026 }, { "epoch": 4.186070686070686, "grad_norm": 1.595228910446167, "learning_rate": 1.8110418110418113e-06, "loss": 0.0237, "step": 4027 }, { "epoch": 4.1871101871101875, "grad_norm": 4.622249126434326, "learning_rate": 
1.808731808731809e-06, "loss": 0.1143, "step": 4028 }, { "epoch": 4.1881496881496885, "grad_norm": 18.4790096282959, "learning_rate": 1.8064218064218066e-06, "loss": 0.677, "step": 4029 }, { "epoch": 4.1891891891891895, "grad_norm": 7.102630615234375, "learning_rate": 1.8041118041118042e-06, "loss": 0.2497, "step": 4030 }, { "epoch": 4.1902286902286905, "grad_norm": 0.9145203828811646, "learning_rate": 1.801801801801802e-06, "loss": 0.0262, "step": 4031 }, { "epoch": 4.1912681912681915, "grad_norm": 0.6228851079940796, "learning_rate": 1.7994917994917995e-06, "loss": 0.0118, "step": 4032 }, { "epoch": 4.1923076923076925, "grad_norm": 0.08999498933553696, "learning_rate": 1.7971817971817973e-06, "loss": 0.0021, "step": 4033 }, { "epoch": 4.1933471933471935, "grad_norm": 1.8668020963668823, "learning_rate": 1.794871794871795e-06, "loss": 0.056, "step": 4034 }, { "epoch": 4.1943866943866945, "grad_norm": 0.015170056372880936, "learning_rate": 1.7925617925617928e-06, "loss": 0.0003, "step": 4035 }, { "epoch": 4.1954261954261955, "grad_norm": 3.824902057647705, "learning_rate": 1.7902517902517906e-06, "loss": 0.1182, "step": 4036 }, { "epoch": 4.196465696465697, "grad_norm": 4.108384609222412, "learning_rate": 1.7879417879417881e-06, "loss": 0.1205, "step": 4037 }, { "epoch": 4.197505197505198, "grad_norm": 1.546683669090271, "learning_rate": 1.7856317856317859e-06, "loss": 0.0357, "step": 4038 }, { "epoch": 4.198544698544699, "grad_norm": 0.24022161960601807, "learning_rate": 1.7833217833217834e-06, "loss": 0.005, "step": 4039 }, { "epoch": 4.1995841995842, "grad_norm": 4.08000373840332, "learning_rate": 1.781011781011781e-06, "loss": 0.0946, "step": 4040 }, { "epoch": 4.200623700623701, "grad_norm": 0.7225533127784729, "learning_rate": 1.7787017787017788e-06, "loss": 0.0199, "step": 4041 }, { "epoch": 4.201663201663202, "grad_norm": 4.201906204223633, "learning_rate": 1.7763917763917765e-06, "loss": 0.1265, "step": 4042 }, { "epoch": 4.202702702702703, "grad_norm": 
2.111260175704956, "learning_rate": 1.7740817740817743e-06, "loss": 0.0775, "step": 4043 }, { "epoch": 4.203742203742204, "grad_norm": 3.998500347137451, "learning_rate": 1.7717717717717719e-06, "loss": 0.3013, "step": 4044 }, { "epoch": 4.204781704781705, "grad_norm": 10.227681159973145, "learning_rate": 1.7694617694617696e-06, "loss": 0.6536, "step": 4045 }, { "epoch": 4.205821205821206, "grad_norm": 0.6396321058273315, "learning_rate": 1.7671517671517674e-06, "loss": 0.0144, "step": 4046 }, { "epoch": 4.206860706860707, "grad_norm": 3.9572525024414062, "learning_rate": 1.7648417648417652e-06, "loss": 0.2184, "step": 4047 }, { "epoch": 4.207900207900208, "grad_norm": 4.4264116287231445, "learning_rate": 1.7625317625317625e-06, "loss": 0.2336, "step": 4048 }, { "epoch": 4.208939708939709, "grad_norm": 0.20853319764137268, "learning_rate": 1.7602217602217603e-06, "loss": 0.004, "step": 4049 }, { "epoch": 4.20997920997921, "grad_norm": 1.5663752555847168, "learning_rate": 1.757911757911758e-06, "loss": 0.0281, "step": 4050 }, { "epoch": 4.211018711018711, "grad_norm": 0.04008062928915024, "learning_rate": 1.7556017556017558e-06, "loss": 0.0012, "step": 4051 }, { "epoch": 4.212058212058212, "grad_norm": 5.784019470214844, "learning_rate": 1.7532917532917534e-06, "loss": 0.2711, "step": 4052 }, { "epoch": 4.213097713097713, "grad_norm": 1.9803050756454468, "learning_rate": 1.7509817509817511e-06, "loss": 0.3233, "step": 4053 }, { "epoch": 4.214137214137214, "grad_norm": 2.890364408493042, "learning_rate": 1.7486717486717489e-06, "loss": 0.052, "step": 4054 }, { "epoch": 4.215176715176715, "grad_norm": 0.04758216068148613, "learning_rate": 1.7463617463617467e-06, "loss": 0.0013, "step": 4055 }, { "epoch": 4.216216216216216, "grad_norm": 0.18161475658416748, "learning_rate": 1.7440517440517444e-06, "loss": 0.0037, "step": 4056 }, { "epoch": 4.217255717255718, "grad_norm": 9.276385307312012, "learning_rate": 1.7417417417417418e-06, "loss": 0.9505, "step": 4057 }, { 
"epoch": 4.218295218295219, "grad_norm": 4.724182605743408, "learning_rate": 1.7394317394317395e-06, "loss": 0.1682, "step": 4058 }, { "epoch": 4.21933471933472, "grad_norm": 0.2898210883140564, "learning_rate": 1.7371217371217373e-06, "loss": 0.0085, "step": 4059 }, { "epoch": 4.220374220374221, "grad_norm": 4.5421833992004395, "learning_rate": 1.7348117348117349e-06, "loss": 0.4659, "step": 4060 }, { "epoch": 4.221413721413722, "grad_norm": 4.829350471496582, "learning_rate": 1.7325017325017326e-06, "loss": 0.409, "step": 4061 }, { "epoch": 4.222453222453223, "grad_norm": 15.786012649536133, "learning_rate": 1.7301917301917304e-06, "loss": 0.7211, "step": 4062 }, { "epoch": 4.223492723492724, "grad_norm": 0.1034480556845665, "learning_rate": 1.7278817278817282e-06, "loss": 0.0027, "step": 4063 }, { "epoch": 4.224532224532225, "grad_norm": 0.5985593795776367, "learning_rate": 1.7255717255717257e-06, "loss": 0.0105, "step": 4064 }, { "epoch": 4.225571725571726, "grad_norm": 3.111302375793457, "learning_rate": 1.7232617232617235e-06, "loss": 0.0978, "step": 4065 }, { "epoch": 4.226611226611227, "grad_norm": 5.8687872886657715, "learning_rate": 1.720951720951721e-06, "loss": 0.1263, "step": 4066 }, { "epoch": 4.227650727650728, "grad_norm": 0.045550454407930374, "learning_rate": 1.7186417186417186e-06, "loss": 0.0011, "step": 4067 }, { "epoch": 4.228690228690229, "grad_norm": 0.04059290140867233, "learning_rate": 1.7163317163317164e-06, "loss": 0.0006, "step": 4068 }, { "epoch": 4.22972972972973, "grad_norm": 2.0094802379608154, "learning_rate": 1.7140217140217141e-06, "loss": 0.0292, "step": 4069 }, { "epoch": 4.230769230769231, "grad_norm": 0.2362472116947174, "learning_rate": 1.711711711711712e-06, "loss": 0.0057, "step": 4070 }, { "epoch": 4.231808731808732, "grad_norm": 0.0013394836569204926, "learning_rate": 1.7094017094017097e-06, "loss": 0.0, "step": 4071 }, { "epoch": 4.232848232848233, "grad_norm": 7.218457221984863, "learning_rate": 1.7070917070917072e-06, 
"loss": 0.2895, "step": 4072 }, { "epoch": 4.233887733887734, "grad_norm": 1.518581509590149, "learning_rate": 1.704781704781705e-06, "loss": 0.0419, "step": 4073 }, { "epoch": 4.234927234927235, "grad_norm": 2.0033302307128906, "learning_rate": 1.7024717024717028e-06, "loss": 0.05, "step": 4074 }, { "epoch": 4.235966735966736, "grad_norm": 1.7973511219024658, "learning_rate": 1.7001617001617e-06, "loss": 0.0701, "step": 4075 }, { "epoch": 4.237006237006237, "grad_norm": 13.791241645812988, "learning_rate": 1.6978516978516979e-06, "loss": 0.3143, "step": 4076 }, { "epoch": 4.238045738045738, "grad_norm": 6.425209045410156, "learning_rate": 1.6955416955416956e-06, "loss": 0.0913, "step": 4077 }, { "epoch": 4.239085239085239, "grad_norm": 3.037710428237915, "learning_rate": 1.6932316932316934e-06, "loss": 0.0527, "step": 4078 }, { "epoch": 4.24012474012474, "grad_norm": 0.12332512438297272, "learning_rate": 1.6909216909216912e-06, "loss": 0.0032, "step": 4079 }, { "epoch": 4.241164241164241, "grad_norm": 1.4452285766601562, "learning_rate": 1.6886116886116887e-06, "loss": 0.0545, "step": 4080 }, { "epoch": 4.242203742203742, "grad_norm": 4.934514999389648, "learning_rate": 1.6863016863016865e-06, "loss": 0.0771, "step": 4081 }, { "epoch": 4.243243243243243, "grad_norm": 0.048850350081920624, "learning_rate": 1.6839916839916843e-06, "loss": 0.0013, "step": 4082 }, { "epoch": 4.244282744282744, "grad_norm": 5.821188926696777, "learning_rate": 1.681681681681682e-06, "loss": 0.3531, "step": 4083 }, { "epoch": 4.245322245322245, "grad_norm": 2.806713104248047, "learning_rate": 1.6793716793716794e-06, "loss": 0.0935, "step": 4084 }, { "epoch": 4.246361746361746, "grad_norm": 0.1255660057067871, "learning_rate": 1.6770616770616771e-06, "loss": 0.0027, "step": 4085 }, { "epoch": 4.247401247401247, "grad_norm": 7.8570404052734375, "learning_rate": 1.674751674751675e-06, "loss": 0.3277, "step": 4086 }, { "epoch": 4.248440748440748, "grad_norm": 7.057332515716553, 
"learning_rate": 1.6724416724416725e-06, "loss": 0.315, "step": 4087 }, { "epoch": 4.24948024948025, "grad_norm": 4.830037593841553, "learning_rate": 1.6701316701316702e-06, "loss": 0.272, "step": 4088 }, { "epoch": 4.25051975051975, "grad_norm": 9.737507820129395, "learning_rate": 1.667821667821668e-06, "loss": 0.5581, "step": 4089 }, { "epoch": 4.251559251559252, "grad_norm": 6.508758068084717, "learning_rate": 1.6655116655116658e-06, "loss": 0.1772, "step": 4090 }, { "epoch": 4.252598752598753, "grad_norm": 6.953061580657959, "learning_rate": 1.6632016632016635e-06, "loss": 0.1369, "step": 4091 }, { "epoch": 4.253638253638254, "grad_norm": 3.32177996635437, "learning_rate": 1.660891660891661e-06, "loss": 0.1571, "step": 4092 }, { "epoch": 4.254677754677755, "grad_norm": 5.463006496429443, "learning_rate": 1.6585816585816586e-06, "loss": 0.0908, "step": 4093 }, { "epoch": 4.255717255717256, "grad_norm": 0.16663727164268494, "learning_rate": 1.6562716562716564e-06, "loss": 0.0056, "step": 4094 }, { "epoch": 4.256756756756757, "grad_norm": 5.759579658508301, "learning_rate": 1.653961653961654e-06, "loss": 0.1231, "step": 4095 }, { "epoch": 4.257796257796258, "grad_norm": 2.817511558532715, "learning_rate": 1.6516516516516517e-06, "loss": 0.089, "step": 4096 }, { "epoch": 4.258835758835759, "grad_norm": 5.166245460510254, "learning_rate": 1.6493416493416495e-06, "loss": 0.1412, "step": 4097 }, { "epoch": 4.25987525987526, "grad_norm": 0.08398804813623428, "learning_rate": 1.6470316470316473e-06, "loss": 0.0015, "step": 4098 }, { "epoch": 4.260914760914761, "grad_norm": 3.944993734359741, "learning_rate": 1.6447216447216448e-06, "loss": 0.076, "step": 4099 }, { "epoch": 4.261954261954262, "grad_norm": 1.1812810897827148, "learning_rate": 1.6424116424116426e-06, "loss": 0.108, "step": 4100 }, { "epoch": 4.262993762993763, "grad_norm": 0.8852187991142273, "learning_rate": 1.6401016401016403e-06, "loss": 0.0227, "step": 4101 }, { "epoch": 4.264033264033264, "grad_norm": 
8.84179973602295, "learning_rate": 1.637791637791638e-06, "loss": 0.5302, "step": 4102 }, { "epoch": 4.265072765072765, "grad_norm": 1.2681457996368408, "learning_rate": 1.6354816354816355e-06, "loss": 0.0299, "step": 4103 }, { "epoch": 4.266112266112266, "grad_norm": 0.2751096189022064, "learning_rate": 1.6331716331716332e-06, "loss": 0.0046, "step": 4104 }, { "epoch": 4.267151767151767, "grad_norm": 2.4823291301727295, "learning_rate": 1.630861630861631e-06, "loss": 0.0709, "step": 4105 }, { "epoch": 4.268191268191268, "grad_norm": 6.608827590942383, "learning_rate": 1.6285516285516288e-06, "loss": 0.5985, "step": 4106 }, { "epoch": 4.269230769230769, "grad_norm": 0.49379026889801025, "learning_rate": 1.6262416262416263e-06, "loss": 0.0086, "step": 4107 }, { "epoch": 4.27027027027027, "grad_norm": 0.2650279104709625, "learning_rate": 1.623931623931624e-06, "loss": 0.004, "step": 4108 }, { "epoch": 4.271309771309771, "grad_norm": 0.6954243779182434, "learning_rate": 1.6216216216216219e-06, "loss": 0.2501, "step": 4109 }, { "epoch": 4.272349272349272, "grad_norm": 1.0016794204711914, "learning_rate": 1.6193116193116196e-06, "loss": 0.0508, "step": 4110 }, { "epoch": 4.273388773388773, "grad_norm": 0.9003065228462219, "learning_rate": 1.617001617001617e-06, "loss": 0.0106, "step": 4111 }, { "epoch": 4.274428274428274, "grad_norm": 0.20467722415924072, "learning_rate": 1.6146916146916147e-06, "loss": 0.0035, "step": 4112 }, { "epoch": 4.275467775467775, "grad_norm": 7.248074531555176, "learning_rate": 1.6123816123816125e-06, "loss": 0.8804, "step": 4113 }, { "epoch": 4.276507276507276, "grad_norm": 10.930346488952637, "learning_rate": 1.6100716100716103e-06, "loss": 0.6996, "step": 4114 }, { "epoch": 4.277546777546777, "grad_norm": 0.48625805974006653, "learning_rate": 1.6077616077616078e-06, "loss": 0.0195, "step": 4115 }, { "epoch": 4.278586278586278, "grad_norm": 0.8230060935020447, "learning_rate": 1.6054516054516056e-06, "loss": 0.0188, "step": 4116 }, { 
"epoch": 4.279625779625779, "grad_norm": 0.6208006143569946, "learning_rate": 1.6031416031416034e-06, "loss": 0.008, "step": 4117 }, { "epoch": 4.28066528066528, "grad_norm": 8.017951965332031, "learning_rate": 1.6008316008316011e-06, "loss": 0.3204, "step": 4118 }, { "epoch": 4.281704781704781, "grad_norm": 0.3794807493686676, "learning_rate": 1.5985215985215987e-06, "loss": 0.0127, "step": 4119 }, { "epoch": 4.282744282744282, "grad_norm": 1.9137473106384277, "learning_rate": 1.5962115962115962e-06, "loss": 0.04, "step": 4120 }, { "epoch": 4.283783783783784, "grad_norm": 2.983814239501953, "learning_rate": 1.593901593901594e-06, "loss": 0.1822, "step": 4121 }, { "epoch": 4.284823284823285, "grad_norm": 0.028326358646154404, "learning_rate": 1.5915915915915916e-06, "loss": 0.0007, "step": 4122 }, { "epoch": 4.285862785862786, "grad_norm": 0.3476923704147339, "learning_rate": 1.5892815892815893e-06, "loss": 0.005, "step": 4123 }, { "epoch": 4.286902286902287, "grad_norm": 5.795024394989014, "learning_rate": 1.586971586971587e-06, "loss": 0.209, "step": 4124 }, { "epoch": 4.287941787941788, "grad_norm": 11.100296974182129, "learning_rate": 1.5846615846615849e-06, "loss": 0.9185, "step": 4125 }, { "epoch": 4.288981288981289, "grad_norm": 0.1736786961555481, "learning_rate": 1.5823515823515826e-06, "loss": 0.0049, "step": 4126 }, { "epoch": 4.29002079002079, "grad_norm": 0.010620493441820145, "learning_rate": 1.5800415800415802e-06, "loss": 0.0002, "step": 4127 }, { "epoch": 4.291060291060291, "grad_norm": 0.0010055977618321776, "learning_rate": 1.577731577731578e-06, "loss": 0.0, "step": 4128 }, { "epoch": 4.292099792099792, "grad_norm": 12.3736572265625, "learning_rate": 1.5754215754215755e-06, "loss": 1.4063, "step": 4129 }, { "epoch": 4.293139293139293, "grad_norm": 0.18099883198738098, "learning_rate": 1.573111573111573e-06, "loss": 0.0055, "step": 4130 }, { "epoch": 4.294178794178794, "grad_norm": 0.0030559615697711706, "learning_rate": 1.5708015708015708e-06, 
"loss": 0.0001, "step": 4131 }, { "epoch": 4.295218295218295, "grad_norm": 0.39262622594833374, "learning_rate": 1.5684915684915686e-06, "loss": 0.0121, "step": 4132 }, { "epoch": 4.296257796257796, "grad_norm": 9.991240501403809, "learning_rate": 1.5661815661815664e-06, "loss": 0.5465, "step": 4133 }, { "epoch": 4.297297297297297, "grad_norm": 0.22802601754665375, "learning_rate": 1.5638715638715641e-06, "loss": 0.0045, "step": 4134 }, { "epoch": 4.298336798336798, "grad_norm": 5.7973198890686035, "learning_rate": 1.5615615615615617e-06, "loss": 0.2766, "step": 4135 }, { "epoch": 4.299376299376299, "grad_norm": 0.11742614209651947, "learning_rate": 1.5592515592515594e-06, "loss": 0.0024, "step": 4136 }, { "epoch": 4.3004158004158, "grad_norm": 8.053942680358887, "learning_rate": 1.5569415569415572e-06, "loss": 0.36, "step": 4137 }, { "epoch": 4.301455301455301, "grad_norm": 8.541512489318848, "learning_rate": 1.5546315546315546e-06, "loss": 0.683, "step": 4138 }, { "epoch": 4.302494802494802, "grad_norm": 0.5943676829338074, "learning_rate": 1.5523215523215523e-06, "loss": 0.0168, "step": 4139 }, { "epoch": 4.303534303534303, "grad_norm": 1.387974500656128, "learning_rate": 1.55001155001155e-06, "loss": 0.0199, "step": 4140 }, { "epoch": 4.3045738045738045, "grad_norm": 0.07895734906196594, "learning_rate": 1.5477015477015479e-06, "loss": 0.0018, "step": 4141 }, { "epoch": 4.3056133056133055, "grad_norm": 0.5686338543891907, "learning_rate": 1.5453915453915454e-06, "loss": 0.0096, "step": 4142 }, { "epoch": 4.3066528066528065, "grad_norm": 12.089524269104004, "learning_rate": 1.5430815430815432e-06, "loss": 0.7951, "step": 4143 }, { "epoch": 4.3076923076923075, "grad_norm": 6.247905731201172, "learning_rate": 1.540771540771541e-06, "loss": 0.175, "step": 4144 }, { "epoch": 4.3087318087318085, "grad_norm": 1.3312230110168457, "learning_rate": 1.5384615384615387e-06, "loss": 0.026, "step": 4145 }, { "epoch": 4.3097713097713095, "grad_norm": 8.780720710754395, 
"learning_rate": 1.5361515361515365e-06, "loss": 1.2595, "step": 4146 }, { "epoch": 4.3108108108108105, "grad_norm": 8.90927791595459, "learning_rate": 1.5338415338415338e-06, "loss": 0.3484, "step": 4147 }, { "epoch": 4.3118503118503115, "grad_norm": 5.132906436920166, "learning_rate": 1.5315315315315316e-06, "loss": 0.246, "step": 4148 }, { "epoch": 4.3128898128898125, "grad_norm": 2.474851608276367, "learning_rate": 1.5292215292215294e-06, "loss": 0.0545, "step": 4149 }, { "epoch": 4.313929313929314, "grad_norm": 6.656509876251221, "learning_rate": 1.526911526911527e-06, "loss": 0.3199, "step": 4150 }, { "epoch": 4.314968814968815, "grad_norm": 0.038685161620378494, "learning_rate": 1.5246015246015247e-06, "loss": 0.0009, "step": 4151 }, { "epoch": 4.3160083160083165, "grad_norm": 0.49684134125709534, "learning_rate": 1.5222915222915225e-06, "loss": 0.0098, "step": 4152 }, { "epoch": 4.317047817047817, "grad_norm": 0.23896606266498566, "learning_rate": 1.5199815199815202e-06, "loss": 0.0067, "step": 4153 }, { "epoch": 4.3180873180873185, "grad_norm": 7.358112812042236, "learning_rate": 1.5176715176715178e-06, "loss": 0.1341, "step": 4154 }, { "epoch": 4.3191268191268195, "grad_norm": 4.544765949249268, "learning_rate": 1.5153615153615155e-06, "loss": 0.1136, "step": 4155 }, { "epoch": 4.3201663201663205, "grad_norm": 1.8596322536468506, "learning_rate": 1.513051513051513e-06, "loss": 0.2945, "step": 4156 }, { "epoch": 4.3212058212058215, "grad_norm": 10.27206802368164, "learning_rate": 1.5107415107415109e-06, "loss": 0.6398, "step": 4157 }, { "epoch": 4.3222453222453225, "grad_norm": 8.524518013000488, "learning_rate": 1.5084315084315084e-06, "loss": 0.5755, "step": 4158 }, { "epoch": 4.3232848232848236, "grad_norm": 3.3253977298736572, "learning_rate": 1.5061215061215062e-06, "loss": 0.0561, "step": 4159 }, { "epoch": 4.324324324324325, "grad_norm": 0.016905181109905243, "learning_rate": 1.503811503811504e-06, "loss": 0.0005, "step": 4160 }, { "epoch": 
4.325363825363826, "grad_norm": 2.6460142135620117, "learning_rate": 1.5015015015015017e-06, "loss": 0.0686, "step": 4161 }, { "epoch": 4.326403326403327, "grad_norm": 11.561341285705566, "learning_rate": 1.4991914991914993e-06, "loss": 0.9557, "step": 4162 }, { "epoch": 4.327442827442828, "grad_norm": 4.719583988189697, "learning_rate": 1.496881496881497e-06, "loss": 0.3931, "step": 4163 }, { "epoch": 4.328482328482329, "grad_norm": 1.616438388824463, "learning_rate": 1.4945714945714948e-06, "loss": 0.0372, "step": 4164 }, { "epoch": 4.32952182952183, "grad_norm": 8.359436988830566, "learning_rate": 1.4922614922614922e-06, "loss": 0.1088, "step": 4165 }, { "epoch": 4.330561330561331, "grad_norm": 8.227208137512207, "learning_rate": 1.48995148995149e-06, "loss": 0.5338, "step": 4166 }, { "epoch": 4.331600831600832, "grad_norm": 1.4397103786468506, "learning_rate": 1.4876414876414877e-06, "loss": 0.0201, "step": 4167 }, { "epoch": 4.332640332640333, "grad_norm": 11.132816314697266, "learning_rate": 1.4853314853314855e-06, "loss": 0.3741, "step": 4168 }, { "epoch": 4.333679833679834, "grad_norm": 5.320253849029541, "learning_rate": 1.4830214830214832e-06, "loss": 0.2245, "step": 4169 }, { "epoch": 4.334719334719335, "grad_norm": 6.323849678039551, "learning_rate": 1.4807114807114808e-06, "loss": 0.287, "step": 4170 }, { "epoch": 4.335758835758836, "grad_norm": 6.769553184509277, "learning_rate": 1.4784014784014785e-06, "loss": 0.5321, "step": 4171 }, { "epoch": 4.336798336798337, "grad_norm": 0.191838338971138, "learning_rate": 1.4760914760914763e-06, "loss": 0.0044, "step": 4172 }, { "epoch": 4.337837837837838, "grad_norm": 7.636790752410889, "learning_rate": 1.473781473781474e-06, "loss": 0.5625, "step": 4173 }, { "epoch": 4.338877338877339, "grad_norm": 2.5185868740081787, "learning_rate": 1.4714714714714714e-06, "loss": 0.0744, "step": 4174 }, { "epoch": 4.33991683991684, "grad_norm": 1.3454338312149048, "learning_rate": 1.4691614691614692e-06, "loss": 0.0439, 
"step": 4175 }, { "epoch": 4.340956340956341, "grad_norm": 10.914813041687012, "learning_rate": 1.466851466851467e-06, "loss": 0.5639, "step": 4176 }, { "epoch": 4.341995841995842, "grad_norm": 0.06433071941137314, "learning_rate": 1.4645414645414645e-06, "loss": 0.0016, "step": 4177 }, { "epoch": 4.343035343035343, "grad_norm": 10.429954528808594, "learning_rate": 1.4622314622314623e-06, "loss": 0.484, "step": 4178 }, { "epoch": 4.344074844074844, "grad_norm": 7.5033955574035645, "learning_rate": 1.45992145992146e-06, "loss": 0.13, "step": 4179 }, { "epoch": 4.345114345114345, "grad_norm": 11.127370834350586, "learning_rate": 1.4576114576114578e-06, "loss": 0.1791, "step": 4180 }, { "epoch": 4.346153846153846, "grad_norm": 0.08277774602174759, "learning_rate": 1.4553014553014556e-06, "loss": 0.0026, "step": 4181 }, { "epoch": 4.347193347193347, "grad_norm": 11.606966018676758, "learning_rate": 1.4529914529914531e-06, "loss": 0.7642, "step": 4182 }, { "epoch": 4.348232848232848, "grad_norm": 0.2966299057006836, "learning_rate": 1.4506814506814507e-06, "loss": 0.0065, "step": 4183 }, { "epoch": 4.349272349272349, "grad_norm": 0.9831473231315613, "learning_rate": 1.4483714483714485e-06, "loss": 0.0312, "step": 4184 }, { "epoch": 4.350311850311851, "grad_norm": 0.03741445019841194, "learning_rate": 1.446061446061446e-06, "loss": 0.0009, "step": 4185 }, { "epoch": 4.351351351351352, "grad_norm": 1.2937564849853516, "learning_rate": 1.4437514437514438e-06, "loss": 0.2391, "step": 4186 }, { "epoch": 4.352390852390853, "grad_norm": 0.521981954574585, "learning_rate": 1.4414414414414416e-06, "loss": 0.0237, "step": 4187 }, { "epoch": 4.353430353430354, "grad_norm": 0.35736650228500366, "learning_rate": 1.4391314391314393e-06, "loss": 0.0087, "step": 4188 }, { "epoch": 4.354469854469855, "grad_norm": 1.3675105571746826, "learning_rate": 1.436821436821437e-06, "loss": 0.0442, "step": 4189 }, { "epoch": 4.355509355509356, "grad_norm": 4.082332134246826, "learning_rate": 
1.4345114345114346e-06, "loss": 0.3379, "step": 4190 }, { "epoch": 4.356548856548857, "grad_norm": 3.070690870285034, "learning_rate": 1.4322014322014324e-06, "loss": 0.349, "step": 4191 }, { "epoch": 4.357588357588358, "grad_norm": 0.032250870019197464, "learning_rate": 1.42989142989143e-06, "loss": 0.0008, "step": 4192 }, { "epoch": 4.358627858627859, "grad_norm": 7.135365962982178, "learning_rate": 1.4275814275814275e-06, "loss": 0.3598, "step": 4193 }, { "epoch": 4.35966735966736, "grad_norm": 0.18401283025741577, "learning_rate": 1.4252714252714253e-06, "loss": 0.0034, "step": 4194 }, { "epoch": 4.360706860706861, "grad_norm": 1.852621078491211, "learning_rate": 1.422961422961423e-06, "loss": 0.0596, "step": 4195 }, { "epoch": 4.361746361746362, "grad_norm": 2.5233609676361084, "learning_rate": 1.4206514206514208e-06, "loss": 0.0822, "step": 4196 }, { "epoch": 4.362785862785863, "grad_norm": 16.242647171020508, "learning_rate": 1.4183414183414184e-06, "loss": 2.2336, "step": 4197 }, { "epoch": 4.363825363825364, "grad_norm": 2.0675549507141113, "learning_rate": 1.4160314160314161e-06, "loss": 0.0724, "step": 4198 }, { "epoch": 4.364864864864865, "grad_norm": 1.6818451881408691, "learning_rate": 1.413721413721414e-06, "loss": 0.0555, "step": 4199 }, { "epoch": 4.365904365904366, "grad_norm": 1.094651699066162, "learning_rate": 1.4114114114114117e-06, "loss": 0.0243, "step": 4200 }, { "epoch": 4.366943866943867, "grad_norm": 0.190381720662117, "learning_rate": 1.409101409101409e-06, "loss": 0.0055, "step": 4201 }, { "epoch": 4.367983367983368, "grad_norm": 4.859400272369385, "learning_rate": 1.4067914067914068e-06, "loss": 0.1467, "step": 4202 }, { "epoch": 4.369022869022869, "grad_norm": 1.171924352645874, "learning_rate": 1.4044814044814046e-06, "loss": 0.013, "step": 4203 }, { "epoch": 4.37006237006237, "grad_norm": 1.6685501337051392, "learning_rate": 1.4021714021714023e-06, "loss": 0.0578, "step": 4204 }, { "epoch": 4.371101871101871, "grad_norm": 
10.510454177856445, "learning_rate": 1.3998613998613999e-06, "loss": 0.3318, "step": 4205 }, { "epoch": 4.372141372141372, "grad_norm": 1.731648325920105, "learning_rate": 1.3975513975513976e-06, "loss": 0.0509, "step": 4206 }, { "epoch": 4.373180873180873, "grad_norm": 3.0722243785858154, "learning_rate": 1.3952413952413954e-06, "loss": 0.0566, "step": 4207 }, { "epoch": 4.374220374220374, "grad_norm": 5.660260200500488, "learning_rate": 1.3929313929313932e-06, "loss": 0.2306, "step": 4208 }, { "epoch": 4.375259875259875, "grad_norm": 4.561119079589844, "learning_rate": 1.390621390621391e-06, "loss": 0.1269, "step": 4209 }, { "epoch": 4.376299376299376, "grad_norm": 0.12571902573108673, "learning_rate": 1.3883113883113885e-06, "loss": 0.0033, "step": 4210 }, { "epoch": 4.377338877338877, "grad_norm": 11.195829391479492, "learning_rate": 1.386001386001386e-06, "loss": 0.6889, "step": 4211 }, { "epoch": 4.378378378378378, "grad_norm": 3.5008299350738525, "learning_rate": 1.3836913836913838e-06, "loss": 0.1269, "step": 4212 }, { "epoch": 4.379417879417879, "grad_norm": 0.18679237365722656, "learning_rate": 1.3813813813813814e-06, "loss": 0.005, "step": 4213 }, { "epoch": 4.38045738045738, "grad_norm": 0.2088523656129837, "learning_rate": 1.3790713790713791e-06, "loss": 0.0034, "step": 4214 }, { "epoch": 4.381496881496881, "grad_norm": 0.37627220153808594, "learning_rate": 1.376761376761377e-06, "loss": 0.0109, "step": 4215 }, { "epoch": 4.382536382536383, "grad_norm": 0.010968077927827835, "learning_rate": 1.3744513744513747e-06, "loss": 0.0002, "step": 4216 }, { "epoch": 4.383575883575883, "grad_norm": 1.4702229499816895, "learning_rate": 1.3721413721413722e-06, "loss": 0.0225, "step": 4217 }, { "epoch": 4.384615384615385, "grad_norm": 0.6452034711837769, "learning_rate": 1.36983136983137e-06, "loss": 0.0138, "step": 4218 }, { "epoch": 4.385654885654886, "grad_norm": 1.4516841173171997, "learning_rate": 1.3675213675213678e-06, "loss": 0.3052, "step": 4219 }, { 
"epoch": 4.386694386694387, "grad_norm": 1.7895114421844482, "learning_rate": 1.3652113652113651e-06, "loss": 0.0439, "step": 4220 }, { "epoch": 4.387733887733888, "grad_norm": 0.15847811102867126, "learning_rate": 1.3629013629013629e-06, "loss": 0.0031, "step": 4221 }, { "epoch": 4.388773388773389, "grad_norm": 6.9709649085998535, "learning_rate": 1.3605913605913607e-06, "loss": 0.3052, "step": 4222 }, { "epoch": 4.38981288981289, "grad_norm": 1.299400806427002, "learning_rate": 1.3582813582813584e-06, "loss": 0.0293, "step": 4223 }, { "epoch": 4.390852390852391, "grad_norm": 11.985542297363281, "learning_rate": 1.3559713559713562e-06, "loss": 1.561, "step": 4224 }, { "epoch": 4.391891891891892, "grad_norm": 14.085444450378418, "learning_rate": 1.3536613536613537e-06, "loss": 0.8309, "step": 4225 }, { "epoch": 4.392931392931393, "grad_norm": 0.11452407389879227, "learning_rate": 1.3513513513513515e-06, "loss": 0.0021, "step": 4226 }, { "epoch": 4.393970893970894, "grad_norm": 3.1590518951416016, "learning_rate": 1.3490413490413493e-06, "loss": 0.1181, "step": 4227 }, { "epoch": 4.395010395010395, "grad_norm": 0.24588535726070404, "learning_rate": 1.346731346731347e-06, "loss": 0.0038, "step": 4228 }, { "epoch": 4.396049896049896, "grad_norm": 0.04706381633877754, "learning_rate": 1.3444213444213444e-06, "loss": 0.0013, "step": 4229 }, { "epoch": 4.397089397089397, "grad_norm": 0.4515562653541565, "learning_rate": 1.3421113421113422e-06, "loss": 0.0066, "step": 4230 }, { "epoch": 4.398128898128898, "grad_norm": 0.4878906011581421, "learning_rate": 1.33980133980134e-06, "loss": 0.0137, "step": 4231 }, { "epoch": 4.399168399168399, "grad_norm": 0.006022973917424679, "learning_rate": 1.3374913374913377e-06, "loss": 0.0001, "step": 4232 }, { "epoch": 4.4002079002079, "grad_norm": 1.3622678518295288, "learning_rate": 1.3351813351813352e-06, "loss": 0.0421, "step": 4233 }, { "epoch": 4.401247401247401, "grad_norm": 5.8447184562683105, "learning_rate": 
1.332871332871333e-06, "loss": 0.1775, "step": 4234 }, { "epoch": 4.402286902286902, "grad_norm": 8.86520767211914, "learning_rate": 1.3305613305613308e-06, "loss": 0.8891, "step": 4235 }, { "epoch": 4.403326403326403, "grad_norm": 3.6811683177948, "learning_rate": 1.3282513282513285e-06, "loss": 0.103, "step": 4236 }, { "epoch": 4.404365904365904, "grad_norm": 0.169471874833107, "learning_rate": 1.325941325941326e-06, "loss": 0.0038, "step": 4237 }, { "epoch": 4.405405405405405, "grad_norm": 0.944815456867218, "learning_rate": 1.3236313236313237e-06, "loss": 0.0175, "step": 4238 }, { "epoch": 4.406444906444906, "grad_norm": 0.005099491681903601, "learning_rate": 1.3213213213213214e-06, "loss": 0.0001, "step": 4239 }, { "epoch": 4.407484407484407, "grad_norm": 2.1067795753479004, "learning_rate": 1.319011319011319e-06, "loss": 0.0942, "step": 4240 }, { "epoch": 4.408523908523908, "grad_norm": 5.469954013824463, "learning_rate": 1.3167013167013167e-06, "loss": 0.126, "step": 4241 }, { "epoch": 4.409563409563409, "grad_norm": 0.5456253886222839, "learning_rate": 1.3143913143913145e-06, "loss": 0.0107, "step": 4242 }, { "epoch": 4.41060291060291, "grad_norm": 1.104061484336853, "learning_rate": 1.3120813120813123e-06, "loss": 0.0221, "step": 4243 }, { "epoch": 4.411642411642411, "grad_norm": 0.017622362822294235, "learning_rate": 1.30977130977131e-06, "loss": 0.0005, "step": 4244 }, { "epoch": 4.412681912681912, "grad_norm": 0.05500205606222153, "learning_rate": 1.3074613074613076e-06, "loss": 0.001, "step": 4245 }, { "epoch": 4.413721413721413, "grad_norm": 11.015450477600098, "learning_rate": 1.3051513051513054e-06, "loss": 1.0906, "step": 4246 }, { "epoch": 4.414760914760914, "grad_norm": 12.540595054626465, "learning_rate": 1.302841302841303e-06, "loss": 0.7457, "step": 4247 }, { "epoch": 4.415800415800415, "grad_norm": 0.19104458391666412, "learning_rate": 1.3005313005313005e-06, "loss": 0.0068, "step": 4248 }, { "epoch": 4.416839916839917, "grad_norm": 
0.00660668546333909, "learning_rate": 1.2982212982212982e-06, "loss": 0.0001, "step": 4249 }, { "epoch": 4.417879417879418, "grad_norm": 0.6729915142059326, "learning_rate": 1.295911295911296e-06, "loss": 0.0189, "step": 4250 }, { "epoch": 4.418918918918919, "grad_norm": 14.4362154006958, "learning_rate": 1.2936012936012938e-06, "loss": 0.6613, "step": 4251 }, { "epoch": 4.41995841995842, "grad_norm": 2.7787716388702393, "learning_rate": 1.2912912912912913e-06, "loss": 0.0493, "step": 4252 }, { "epoch": 4.420997920997921, "grad_norm": 5.139069557189941, "learning_rate": 1.288981288981289e-06, "loss": 0.6069, "step": 4253 }, { "epoch": 4.422037422037422, "grad_norm": 6.8310465812683105, "learning_rate": 1.2866712866712869e-06, "loss": 0.1973, "step": 4254 }, { "epoch": 4.423076923076923, "grad_norm": 1.6439793109893799, "learning_rate": 1.2843612843612846e-06, "loss": 0.0401, "step": 4255 }, { "epoch": 4.424116424116424, "grad_norm": 5.19002103805542, "learning_rate": 1.282051282051282e-06, "loss": 0.2427, "step": 4256 }, { "epoch": 4.425155925155925, "grad_norm": 6.832848072052002, "learning_rate": 1.2797412797412797e-06, "loss": 0.4795, "step": 4257 }, { "epoch": 4.426195426195426, "grad_norm": 10.052824974060059, "learning_rate": 1.2774312774312775e-06, "loss": 0.688, "step": 4258 }, { "epoch": 4.427234927234927, "grad_norm": 1.8471096754074097, "learning_rate": 1.2751212751212753e-06, "loss": 0.0632, "step": 4259 }, { "epoch": 4.428274428274428, "grad_norm": 2.3802993297576904, "learning_rate": 1.2728112728112728e-06, "loss": 0.0726, "step": 4260 }, { "epoch": 4.429313929313929, "grad_norm": 1.7276345491409302, "learning_rate": 1.2705012705012706e-06, "loss": 0.0318, "step": 4261 }, { "epoch": 4.43035343035343, "grad_norm": 1.354040503501892, "learning_rate": 1.2681912681912684e-06, "loss": 0.282, "step": 4262 }, { "epoch": 4.4313929313929314, "grad_norm": 4.793944358825684, "learning_rate": 1.2658812658812661e-06, "loss": 0.2987, "step": 4263 }, { "epoch": 
4.4324324324324325, "grad_norm": 5.113119125366211, "learning_rate": 1.263571263571264e-06, "loss": 0.1797, "step": 4264 }, { "epoch": 4.4334719334719335, "grad_norm": 4.361006259918213, "learning_rate": 1.2612612612612613e-06, "loss": 0.3196, "step": 4265 }, { "epoch": 4.4345114345114345, "grad_norm": 2.3230502605438232, "learning_rate": 1.258951258951259e-06, "loss": 0.0671, "step": 4266 }, { "epoch": 4.4355509355509355, "grad_norm": 8.467252731323242, "learning_rate": 1.2566412566412568e-06, "loss": 0.2399, "step": 4267 }, { "epoch": 4.4365904365904365, "grad_norm": 0.09298921376466751, "learning_rate": 1.2543312543312543e-06, "loss": 0.0017, "step": 4268 }, { "epoch": 4.4376299376299375, "grad_norm": 4.662631034851074, "learning_rate": 1.2520212520212521e-06, "loss": 0.2203, "step": 4269 }, { "epoch": 4.4386694386694385, "grad_norm": 7.328342914581299, "learning_rate": 1.2497112497112499e-06, "loss": 0.2689, "step": 4270 }, { "epoch": 4.4397089397089395, "grad_norm": 0.9401403665542603, "learning_rate": 1.2474012474012476e-06, "loss": 0.032, "step": 4271 }, { "epoch": 4.4407484407484406, "grad_norm": 6.420727729797363, "learning_rate": 1.2450912450912452e-06, "loss": 0.3668, "step": 4272 }, { "epoch": 4.441787941787942, "grad_norm": 3.3101179599761963, "learning_rate": 1.242781242781243e-06, "loss": 0.131, "step": 4273 }, { "epoch": 4.442827442827443, "grad_norm": 0.010817071422934532, "learning_rate": 1.2404712404712405e-06, "loss": 0.0002, "step": 4274 }, { "epoch": 4.443866943866944, "grad_norm": 6.34616756439209, "learning_rate": 1.2381612381612383e-06, "loss": 0.0606, "step": 4275 }, { "epoch": 4.444906444906445, "grad_norm": 4.0047712326049805, "learning_rate": 1.2358512358512358e-06, "loss": 0.4552, "step": 4276 }, { "epoch": 4.445945945945946, "grad_norm": 7.248794078826904, "learning_rate": 1.2335412335412336e-06, "loss": 0.2012, "step": 4277 }, { "epoch": 4.446985446985447, "grad_norm": 4.6254425048828125, "learning_rate": 1.2312312312312314e-06, 
"loss": 0.3375, "step": 4278 }, { "epoch": 4.448024948024948, "grad_norm": 4.371951103210449, "learning_rate": 1.2289212289212291e-06, "loss": 0.0718, "step": 4279 }, { "epoch": 4.4490644490644495, "grad_norm": 0.24949759244918823, "learning_rate": 1.2266112266112267e-06, "loss": 0.0044, "step": 4280 }, { "epoch": 4.45010395010395, "grad_norm": 0.3559669256210327, "learning_rate": 1.2243012243012243e-06, "loss": 0.004, "step": 4281 }, { "epoch": 4.451143451143452, "grad_norm": 3.888190984725952, "learning_rate": 1.221991221991222e-06, "loss": 0.04, "step": 4282 }, { "epoch": 4.452182952182953, "grad_norm": 6.422785758972168, "learning_rate": 1.2196812196812198e-06, "loss": 0.2481, "step": 4283 }, { "epoch": 4.453222453222454, "grad_norm": 1.392903447151184, "learning_rate": 1.2173712173712176e-06, "loss": 0.021, "step": 4284 }, { "epoch": 4.454261954261955, "grad_norm": 0.9166313409805298, "learning_rate": 1.2150612150612151e-06, "loss": 0.0304, "step": 4285 }, { "epoch": 4.455301455301456, "grad_norm": 0.4601767957210541, "learning_rate": 1.2127512127512129e-06, "loss": 0.0083, "step": 4286 }, { "epoch": 4.456340956340957, "grad_norm": 7.392816543579102, "learning_rate": 1.2104412104412106e-06, "loss": 0.3828, "step": 4287 }, { "epoch": 4.457380457380458, "grad_norm": 0.008619803935289383, "learning_rate": 1.2081312081312082e-06, "loss": 0.0002, "step": 4288 }, { "epoch": 4.458419958419959, "grad_norm": 0.06648194044828415, "learning_rate": 1.205821205821206e-06, "loss": 0.0024, "step": 4289 }, { "epoch": 4.45945945945946, "grad_norm": 0.06525490432977676, "learning_rate": 1.2035112035112035e-06, "loss": 0.0012, "step": 4290 }, { "epoch": 4.460498960498961, "grad_norm": 7.3383469581604, "learning_rate": 1.2012012012012013e-06, "loss": 0.4375, "step": 4291 }, { "epoch": 4.461538461538462, "grad_norm": 2.6314022541046143, "learning_rate": 1.198891198891199e-06, "loss": 0.0536, "step": 4292 }, { "epoch": 4.462577962577963, "grad_norm": 5.249322891235352, 
"learning_rate": 1.1965811965811968e-06, "loss": 0.1795, "step": 4293 }, { "epoch": 4.463617463617464, "grad_norm": 7.783726692199707, "learning_rate": 1.1942711942711944e-06, "loss": 0.5374, "step": 4294 }, { "epoch": 4.464656964656965, "grad_norm": 9.920394897460938, "learning_rate": 1.191961191961192e-06, "loss": 1.2068, "step": 4295 }, { "epoch": 4.465696465696466, "grad_norm": 0.939612627029419, "learning_rate": 1.1896511896511897e-06, "loss": 0.0164, "step": 4296 }, { "epoch": 4.466735966735967, "grad_norm": 0.01430198922753334, "learning_rate": 1.1873411873411875e-06, "loss": 0.0003, "step": 4297 }, { "epoch": 4.467775467775468, "grad_norm": 6.761784076690674, "learning_rate": 1.1850311850311852e-06, "loss": 0.298, "step": 4298 }, { "epoch": 4.468814968814969, "grad_norm": 0.29458633065223694, "learning_rate": 1.1827211827211828e-06, "loss": 0.0058, "step": 4299 }, { "epoch": 4.46985446985447, "grad_norm": 5.258695125579834, "learning_rate": 1.1804111804111806e-06, "loss": 0.0941, "step": 4300 }, { "epoch": 4.470893970893971, "grad_norm": 6.842070579528809, "learning_rate": 1.1781011781011781e-06, "loss": 0.3605, "step": 4301 }, { "epoch": 4.471933471933472, "grad_norm": 3.9812428951263428, "learning_rate": 1.1757911757911759e-06, "loss": 0.268, "step": 4302 }, { "epoch": 4.472972972972973, "grad_norm": 5.772032737731934, "learning_rate": 1.1734811734811734e-06, "loss": 0.3061, "step": 4303 }, { "epoch": 4.474012474012474, "grad_norm": 0.419314444065094, "learning_rate": 1.1711711711711712e-06, "loss": 0.0084, "step": 4304 }, { "epoch": 4.475051975051975, "grad_norm": 0.5556364059448242, "learning_rate": 1.168861168861169e-06, "loss": 0.0126, "step": 4305 }, { "epoch": 4.476091476091476, "grad_norm": 8.333703994750977, "learning_rate": 1.1665511665511667e-06, "loss": 1.235, "step": 4306 }, { "epoch": 4.477130977130977, "grad_norm": 1.7269006967544556, "learning_rate": 1.1642411642411643e-06, "loss": 0.0509, "step": 4307 }, { "epoch": 4.478170478170478, 
"grad_norm": 5.627634048461914, "learning_rate": 1.161931161931162e-06, "loss": 0.2021, "step": 4308 }, { "epoch": 4.479209979209979, "grad_norm": 0.252849280834198, "learning_rate": 1.1596211596211596e-06, "loss": 0.0046, "step": 4309 }, { "epoch": 4.48024948024948, "grad_norm": 0.008383556269109249, "learning_rate": 1.1573111573111574e-06, "loss": 0.0002, "step": 4310 }, { "epoch": 4.481288981288981, "grad_norm": 0.9866765141487122, "learning_rate": 1.1550011550011552e-06, "loss": 0.018, "step": 4311 }, { "epoch": 4.482328482328482, "grad_norm": 0.01083228550851345, "learning_rate": 1.1526911526911527e-06, "loss": 0.0003, "step": 4312 }, { "epoch": 4.483367983367984, "grad_norm": 0.04575052112340927, "learning_rate": 1.1503811503811505e-06, "loss": 0.0012, "step": 4313 }, { "epoch": 4.484407484407485, "grad_norm": 0.0022366337943822145, "learning_rate": 1.1480711480711482e-06, "loss": 0.0, "step": 4314 }, { "epoch": 4.485446985446986, "grad_norm": 8.27820873260498, "learning_rate": 1.1457611457611458e-06, "loss": 0.3146, "step": 4315 }, { "epoch": 4.486486486486487, "grad_norm": 0.008490752428770065, "learning_rate": 1.1434511434511436e-06, "loss": 0.0002, "step": 4316 }, { "epoch": 4.487525987525988, "grad_norm": 0.03850068524479866, "learning_rate": 1.1411411411411411e-06, "loss": 0.0005, "step": 4317 }, { "epoch": 4.488565488565489, "grad_norm": 0.014865309000015259, "learning_rate": 1.1388311388311389e-06, "loss": 0.0005, "step": 4318 }, { "epoch": 4.48960498960499, "grad_norm": 8.617644309997559, "learning_rate": 1.1365211365211367e-06, "loss": 0.2046, "step": 4319 }, { "epoch": 4.490644490644491, "grad_norm": 0.02923273667693138, "learning_rate": 1.1342111342111344e-06, "loss": 0.0004, "step": 4320 }, { "epoch": 4.491683991683992, "grad_norm": 2.4688708782196045, "learning_rate": 1.131901131901132e-06, "loss": 0.0451, "step": 4321 }, { "epoch": 4.492723492723493, "grad_norm": 4.8640546798706055, "learning_rate": 1.1295911295911297e-06, "loss": 0.1706, 
"step": 4322 }, { "epoch": 4.493762993762994, "grad_norm": 0.36775505542755127, "learning_rate": 1.1272811272811273e-06, "loss": 0.0089, "step": 4323 }, { "epoch": 4.494802494802495, "grad_norm": 11.362357139587402, "learning_rate": 1.124971124971125e-06, "loss": 0.3145, "step": 4324 }, { "epoch": 4.495841995841996, "grad_norm": 10.543671607971191, "learning_rate": 1.1226611226611228e-06, "loss": 0.6145, "step": 4325 }, { "epoch": 4.496881496881497, "grad_norm": 5.450479030609131, "learning_rate": 1.1203511203511204e-06, "loss": 0.1512, "step": 4326 }, { "epoch": 4.497920997920998, "grad_norm": 0.10827375948429108, "learning_rate": 1.1180411180411182e-06, "loss": 0.0024, "step": 4327 }, { "epoch": 4.498960498960499, "grad_norm": 8.659412384033203, "learning_rate": 1.115731115731116e-06, "loss": 0.9051, "step": 4328 }, { "epoch": 4.5, "grad_norm": 10.345586776733398, "learning_rate": 1.1134211134211135e-06, "loss": 1.4444, "step": 4329 }, { "epoch": 4.501039501039501, "grad_norm": 1.9703149795532227, "learning_rate": 1.111111111111111e-06, "loss": 0.0309, "step": 4330 }, { "epoch": 4.502079002079002, "grad_norm": 4.598181247711182, "learning_rate": 1.1088011088011088e-06, "loss": 0.2541, "step": 4331 }, { "epoch": 4.503118503118503, "grad_norm": 7.416522026062012, "learning_rate": 1.1064911064911066e-06, "loss": 0.2692, "step": 4332 }, { "epoch": 4.504158004158004, "grad_norm": 7.817275524139404, "learning_rate": 1.1041811041811043e-06, "loss": 0.6485, "step": 4333 }, { "epoch": 4.505197505197505, "grad_norm": 5.300344944000244, "learning_rate": 1.1018711018711021e-06, "loss": 0.4363, "step": 4334 }, { "epoch": 4.506237006237006, "grad_norm": 10.25655460357666, "learning_rate": 1.0995610995610997e-06, "loss": 0.5223, "step": 4335 }, { "epoch": 4.507276507276507, "grad_norm": 5.892490863800049, "learning_rate": 1.0972510972510974e-06, "loss": 0.3733, "step": 4336 }, { "epoch": 4.508316008316008, "grad_norm": 8.785927772521973, "learning_rate": 1.094941094941095e-06, 
"loss": 0.6725, "step": 4337 }, { "epoch": 4.509355509355509, "grad_norm": 0.9067697525024414, "learning_rate": 1.0926310926310928e-06, "loss": 0.0146, "step": 4338 }, { "epoch": 4.51039501039501, "grad_norm": 5.748349666595459, "learning_rate": 1.0903210903210903e-06, "loss": 0.636, "step": 4339 }, { "epoch": 4.511434511434511, "grad_norm": 0.0022521372884511948, "learning_rate": 1.088011088011088e-06, "loss": 0.0001, "step": 4340 }, { "epoch": 4.512474012474012, "grad_norm": 0.20115229487419128, "learning_rate": 1.0857010857010858e-06, "loss": 0.0045, "step": 4341 }, { "epoch": 4.513513513513513, "grad_norm": 4.627670764923096, "learning_rate": 1.0833910833910836e-06, "loss": 0.1453, "step": 4342 }, { "epoch": 4.514553014553014, "grad_norm": 4.924317359924316, "learning_rate": 1.0810810810810812e-06, "loss": 0.1042, "step": 4343 }, { "epoch": 4.515592515592516, "grad_norm": 2.5569543838500977, "learning_rate": 1.0787710787710787e-06, "loss": 0.0342, "step": 4344 }, { "epoch": 4.516632016632016, "grad_norm": 6.637704372406006, "learning_rate": 1.0764610764610765e-06, "loss": 0.6174, "step": 4345 }, { "epoch": 4.517671517671518, "grad_norm": 0.4843396246433258, "learning_rate": 1.0741510741510743e-06, "loss": 0.014, "step": 4346 }, { "epoch": 4.518711018711018, "grad_norm": 5.531055927276611, "learning_rate": 1.071841071841072e-06, "loss": 0.2442, "step": 4347 }, { "epoch": 4.51975051975052, "grad_norm": 2.8017818927764893, "learning_rate": 1.0695310695310696e-06, "loss": 0.0746, "step": 4348 }, { "epoch": 4.520790020790021, "grad_norm": 7.472479820251465, "learning_rate": 1.0672210672210673e-06, "loss": 0.1791, "step": 4349 }, { "epoch": 4.521829521829522, "grad_norm": 1.5929454565048218, "learning_rate": 1.064911064911065e-06, "loss": 0.0339, "step": 4350 }, { "epoch": 4.522869022869023, "grad_norm": 2.3107383251190186, "learning_rate": 1.0626010626010627e-06, "loss": 0.0812, "step": 4351 }, { "epoch": 4.523908523908524, "grad_norm": 9.258277893066406, 
"learning_rate": 1.0602910602910604e-06, "loss": 0.6373, "step": 4352 }, { "epoch": 4.524948024948025, "grad_norm": 20.639976501464844, "learning_rate": 1.057981057981058e-06, "loss": 0.1888, "step": 4353 }, { "epoch": 4.525987525987526, "grad_norm": 0.026088906452059746, "learning_rate": 1.0556710556710558e-06, "loss": 0.0006, "step": 4354 }, { "epoch": 4.527027027027027, "grad_norm": 8.579378128051758, "learning_rate": 1.0533610533610535e-06, "loss": 0.5171, "step": 4355 }, { "epoch": 4.528066528066528, "grad_norm": 5.382491588592529, "learning_rate": 1.051051051051051e-06, "loss": 0.0538, "step": 4356 }, { "epoch": 4.529106029106029, "grad_norm": 8.6734037399292, "learning_rate": 1.0487410487410488e-06, "loss": 0.9482, "step": 4357 }, { "epoch": 4.53014553014553, "grad_norm": 1.555879831314087, "learning_rate": 1.0464310464310464e-06, "loss": 0.0433, "step": 4358 }, { "epoch": 4.531185031185031, "grad_norm": 6.786050796508789, "learning_rate": 1.0441210441210442e-06, "loss": 0.147, "step": 4359 }, { "epoch": 4.532224532224532, "grad_norm": 5.747766017913818, "learning_rate": 1.041811041811042e-06, "loss": 0.132, "step": 4360 }, { "epoch": 4.533264033264033, "grad_norm": 1.9380532503128052, "learning_rate": 1.0395010395010397e-06, "loss": 0.2936, "step": 4361 }, { "epoch": 4.534303534303534, "grad_norm": 9.890674591064453, "learning_rate": 1.0371910371910373e-06, "loss": 0.1638, "step": 4362 }, { "epoch": 4.535343035343035, "grad_norm": 2.7257440090179443, "learning_rate": 1.034881034881035e-06, "loss": 0.0508, "step": 4363 }, { "epoch": 4.536382536382536, "grad_norm": 8.347380638122559, "learning_rate": 1.0325710325710326e-06, "loss": 0.7168, "step": 4364 }, { "epoch": 4.537422037422037, "grad_norm": 6.967242240905762, "learning_rate": 1.0302610302610303e-06, "loss": 0.3137, "step": 4365 }, { "epoch": 4.538461538461538, "grad_norm": 2.8173623085021973, "learning_rate": 1.027951027951028e-06, "loss": 0.092, "step": 4366 }, { "epoch": 4.539501039501039, 
"grad_norm": 0.10028092563152313, "learning_rate": 1.0256410256410257e-06, "loss": 0.002, "step": 4367 }, { "epoch": 4.54054054054054, "grad_norm": 4.162265300750732, "learning_rate": 1.0233310233310234e-06, "loss": 0.1445, "step": 4368 }, { "epoch": 4.541580041580041, "grad_norm": 0.7075111865997314, "learning_rate": 1.0210210210210212e-06, "loss": 0.0087, "step": 4369 }, { "epoch": 4.542619542619542, "grad_norm": 10.8870210647583, "learning_rate": 1.0187110187110188e-06, "loss": 1.9218, "step": 4370 }, { "epoch": 4.543659043659043, "grad_norm": 0.6903623938560486, "learning_rate": 1.0164010164010165e-06, "loss": 0.0273, "step": 4371 }, { "epoch": 4.544698544698544, "grad_norm": 8.723344802856445, "learning_rate": 1.014091014091014e-06, "loss": 0.5867, "step": 4372 }, { "epoch": 4.545738045738045, "grad_norm": 0.913175106048584, "learning_rate": 1.0117810117810119e-06, "loss": 0.0585, "step": 4373 }, { "epoch": 4.546777546777546, "grad_norm": 11.532629013061523, "learning_rate": 1.0094710094710096e-06, "loss": 0.5133, "step": 4374 }, { "epoch": 4.547817047817047, "grad_norm": 1.2006914615631104, "learning_rate": 1.0071610071610072e-06, "loss": 0.0428, "step": 4375 }, { "epoch": 4.548856548856548, "grad_norm": 2.0305368900299072, "learning_rate": 1.004851004851005e-06, "loss": 0.0362, "step": 4376 }, { "epoch": 4.54989604989605, "grad_norm": 0.8395141959190369, "learning_rate": 1.0025410025410027e-06, "loss": 0.0093, "step": 4377 }, { "epoch": 4.5509355509355505, "grad_norm": 0.9320726990699768, "learning_rate": 1.0002310002310003e-06, "loss": 0.0164, "step": 4378 }, { "epoch": 4.551975051975052, "grad_norm": 9.359164237976074, "learning_rate": 9.97920997920998e-07, "loss": 0.3419, "step": 4379 }, { "epoch": 4.553014553014553, "grad_norm": 7.8454508781433105, "learning_rate": 9.956109956109956e-07, "loss": 0.444, "step": 4380 }, { "epoch": 4.554054054054054, "grad_norm": 0.4756433665752411, "learning_rate": 9.933009933009934e-07, "loss": 0.012, "step": 4381 }, { 
"epoch": 4.555093555093555, "grad_norm": 0.028461869806051254, "learning_rate": 9.909909909909911e-07, "loss": 0.0007, "step": 4382 }, { "epoch": 4.556133056133056, "grad_norm": 10.92263126373291, "learning_rate": 9.886809886809889e-07, "loss": 0.7289, "step": 4383 }, { "epoch": 4.557172557172557, "grad_norm": 0.09633204340934753, "learning_rate": 9.863709863709864e-07, "loss": 0.0025, "step": 4384 }, { "epoch": 4.558212058212058, "grad_norm": 1.7298146486282349, "learning_rate": 9.84060984060984e-07, "loss": 0.0215, "step": 4385 }, { "epoch": 4.5592515592515594, "grad_norm": 5.163122177124023, "learning_rate": 9.817509817509818e-07, "loss": 0.2054, "step": 4386 }, { "epoch": 4.5602910602910605, "grad_norm": 0.41875556111335754, "learning_rate": 9.794409794409795e-07, "loss": 0.0072, "step": 4387 }, { "epoch": 4.5613305613305615, "grad_norm": 8.408621788024902, "learning_rate": 9.771309771309773e-07, "loss": 0.2993, "step": 4388 }, { "epoch": 4.5623700623700625, "grad_norm": 3.3478004932403564, "learning_rate": 9.748209748209749e-07, "loss": 0.083, "step": 4389 }, { "epoch": 4.5634095634095635, "grad_norm": 9.404569625854492, "learning_rate": 9.725109725109726e-07, "loss": 0.3557, "step": 4390 }, { "epoch": 4.5644490644490645, "grad_norm": 2.7835049629211426, "learning_rate": 9.702009702009704e-07, "loss": 0.0758, "step": 4391 }, { "epoch": 4.5654885654885655, "grad_norm": 7.355697154998779, "learning_rate": 9.67890967890968e-07, "loss": 0.2352, "step": 4392 }, { "epoch": 4.5665280665280665, "grad_norm": 0.16567951440811157, "learning_rate": 9.655809655809657e-07, "loss": 0.0044, "step": 4393 }, { "epoch": 4.5675675675675675, "grad_norm": 0.0037194276228547096, "learning_rate": 9.632709632709633e-07, "loss": 0.0001, "step": 4394 }, { "epoch": 4.5686070686070686, "grad_norm": 0.23683440685272217, "learning_rate": 9.60960960960961e-07, "loss": 0.0066, "step": 4395 }, { "epoch": 4.56964656964657, "grad_norm": 0.004315780010074377, "learning_rate": 
9.586509586509588e-07, "loss": 0.0001, "step": 4396 }, { "epoch": 4.570686070686071, "grad_norm": 0.0919046625494957, "learning_rate": 9.563409563409566e-07, "loss": 0.0021, "step": 4397 }, { "epoch": 4.571725571725572, "grad_norm": 0.9975956082344055, "learning_rate": 9.540309540309541e-07, "loss": 0.0538, "step": 4398 }, { "epoch": 4.572765072765073, "grad_norm": 0.11247812211513519, "learning_rate": 9.517209517209518e-07, "loss": 0.0032, "step": 4399 }, { "epoch": 4.573804573804574, "grad_norm": 4.416209697723389, "learning_rate": 9.494109494109494e-07, "loss": 0.1403, "step": 4400 }, { "epoch": 4.574844074844075, "grad_norm": 0.0890134945511818, "learning_rate": 9.471009471009472e-07, "loss": 0.0014, "step": 4401 }, { "epoch": 4.575883575883576, "grad_norm": 0.5805600881576538, "learning_rate": 9.44790944790945e-07, "loss": 0.0125, "step": 4402 }, { "epoch": 4.576923076923077, "grad_norm": 1.9322223663330078, "learning_rate": 9.424809424809425e-07, "loss": 0.0663, "step": 4403 }, { "epoch": 4.577962577962578, "grad_norm": 3.145690679550171, "learning_rate": 9.401709401709402e-07, "loss": 0.0971, "step": 4404 }, { "epoch": 4.579002079002079, "grad_norm": 2.712754011154175, "learning_rate": 9.37860937860938e-07, "loss": 0.1244, "step": 4405 }, { "epoch": 4.58004158004158, "grad_norm": 0.07731432467699051, "learning_rate": 9.355509355509356e-07, "loss": 0.0019, "step": 4406 }, { "epoch": 4.581081081081081, "grad_norm": 0.10027692466974258, "learning_rate": 9.332409332409333e-07, "loss": 0.0029, "step": 4407 }, { "epoch": 4.582120582120583, "grad_norm": 3.366645574569702, "learning_rate": 9.30930930930931e-07, "loss": 0.1166, "step": 4408 }, { "epoch": 4.583160083160083, "grad_norm": 8.035176277160645, "learning_rate": 9.286209286209287e-07, "loss": 0.2173, "step": 4409 }, { "epoch": 4.584199584199585, "grad_norm": 3.704331159591675, "learning_rate": 9.263109263109264e-07, "loss": 0.1451, "step": 4410 }, { "epoch": 4.585239085239085, "grad_norm": 1.452633023262024, 
"learning_rate": 9.240009240009241e-07, "loss": 0.0318, "step": 4411 }, { "epoch": 4.586278586278587, "grad_norm": 5.019649028778076, "learning_rate": 9.216909216909217e-07, "loss": 0.2807, "step": 4412 }, { "epoch": 4.587318087318088, "grad_norm": 5.823719501495361, "learning_rate": 9.193809193809195e-07, "loss": 0.1205, "step": 4413 }, { "epoch": 4.588357588357589, "grad_norm": 1.7746714353561401, "learning_rate": 9.170709170709171e-07, "loss": 0.0313, "step": 4414 }, { "epoch": 4.58939708939709, "grad_norm": 7.47543478012085, "learning_rate": 9.147609147609149e-07, "loss": 0.2132, "step": 4415 }, { "epoch": 4.590436590436591, "grad_norm": 0.004619097337126732, "learning_rate": 9.124509124509125e-07, "loss": 0.0001, "step": 4416 }, { "epoch": 4.591476091476092, "grad_norm": 2.1546518802642822, "learning_rate": 9.101409101409102e-07, "loss": 0.0622, "step": 4417 }, { "epoch": 4.592515592515593, "grad_norm": 0.25829172134399414, "learning_rate": 9.078309078309079e-07, "loss": 0.0048, "step": 4418 }, { "epoch": 4.593555093555094, "grad_norm": 6.955380916595459, "learning_rate": 9.055209055209056e-07, "loss": 0.1169, "step": 4419 }, { "epoch": 4.594594594594595, "grad_norm": 2.080986976623535, "learning_rate": 9.032109032109033e-07, "loss": 0.0479, "step": 4420 }, { "epoch": 4.595634095634096, "grad_norm": 19.40532875061035, "learning_rate": 9.00900900900901e-07, "loss": 0.5309, "step": 4421 }, { "epoch": 4.596673596673597, "grad_norm": 0.6829842925071716, "learning_rate": 8.985908985908986e-07, "loss": 0.0135, "step": 4422 }, { "epoch": 4.597713097713098, "grad_norm": 0.22129260003566742, "learning_rate": 8.962808962808964e-07, "loss": 0.0056, "step": 4423 }, { "epoch": 4.598752598752599, "grad_norm": 5.869988918304443, "learning_rate": 8.939708939708941e-07, "loss": 0.0672, "step": 4424 }, { "epoch": 4.5997920997921, "grad_norm": 9.109978675842285, "learning_rate": 8.916608916608917e-07, "loss": 0.4428, "step": 4425 }, { "epoch": 4.600831600831601, "grad_norm": 
3.746086359024048, "learning_rate": 8.893508893508894e-07, "loss": 0.2627, "step": 4426 }, { "epoch": 4.601871101871102, "grad_norm": 0.024450652301311493, "learning_rate": 8.870408870408871e-07, "loss": 0.0004, "step": 4427 }, { "epoch": 4.602910602910603, "grad_norm": 0.3670468330383301, "learning_rate": 8.847308847308848e-07, "loss": 0.0115, "step": 4428 }, { "epoch": 4.603950103950104, "grad_norm": 0.1896979957818985, "learning_rate": 8.824208824208826e-07, "loss": 0.0047, "step": 4429 }, { "epoch": 4.604989604989605, "grad_norm": 5.6438822746276855, "learning_rate": 8.801108801108801e-07, "loss": 0.5649, "step": 4430 }, { "epoch": 4.606029106029106, "grad_norm": 0.4361482858657837, "learning_rate": 8.778008778008779e-07, "loss": 0.0154, "step": 4431 }, { "epoch": 4.607068607068607, "grad_norm": 0.12333600968122482, "learning_rate": 8.754908754908756e-07, "loss": 0.0031, "step": 4432 }, { "epoch": 4.608108108108108, "grad_norm": 1.3953226804733276, "learning_rate": 8.731808731808733e-07, "loss": 0.0636, "step": 4433 }, { "epoch": 4.609147609147609, "grad_norm": 0.24091799557209015, "learning_rate": 8.708708708708709e-07, "loss": 0.0075, "step": 4434 }, { "epoch": 4.61018711018711, "grad_norm": 0.6747987270355225, "learning_rate": 8.685608685608687e-07, "loss": 0.014, "step": 4435 }, { "epoch": 4.611226611226611, "grad_norm": 2.1281895637512207, "learning_rate": 8.662508662508663e-07, "loss": 0.0412, "step": 4436 }, { "epoch": 4.612266112266112, "grad_norm": 1.3151452541351318, "learning_rate": 8.639408639408641e-07, "loss": 0.2408, "step": 4437 }, { "epoch": 4.613305613305613, "grad_norm": 5.143826961517334, "learning_rate": 8.616308616308617e-07, "loss": 0.1529, "step": 4438 }, { "epoch": 4.614345114345114, "grad_norm": 15.18270206451416, "learning_rate": 8.593208593208593e-07, "loss": 1.7764, "step": 4439 }, { "epoch": 4.615384615384615, "grad_norm": 6.709432125091553, "learning_rate": 8.570108570108571e-07, "loss": 0.1883, "step": 4440 }, { "epoch": 
4.616424116424117, "grad_norm": 0.1298711895942688, "learning_rate": 8.547008547008548e-07, "loss": 0.0029, "step": 4441 }, { "epoch": 4.617463617463617, "grad_norm": 8.232173919677734, "learning_rate": 8.523908523908525e-07, "loss": 0.6129, "step": 4442 }, { "epoch": 4.618503118503119, "grad_norm": 1.6797841787338257, "learning_rate": 8.5008085008085e-07, "loss": 0.0228, "step": 4443 }, { "epoch": 4.61954261954262, "grad_norm": 0.4660455286502838, "learning_rate": 8.477708477708478e-07, "loss": 0.0106, "step": 4444 }, { "epoch": 4.620582120582121, "grad_norm": 0.5190257430076599, "learning_rate": 8.454608454608456e-07, "loss": 0.0069, "step": 4445 }, { "epoch": 4.621621621621622, "grad_norm": 4.557394027709961, "learning_rate": 8.431508431508432e-07, "loss": 0.1984, "step": 4446 }, { "epoch": 4.622661122661123, "grad_norm": 3.3669612407684326, "learning_rate": 8.40840840840841e-07, "loss": 0.0821, "step": 4447 }, { "epoch": 4.623700623700624, "grad_norm": 7.057306289672852, "learning_rate": 8.385308385308386e-07, "loss": 0.1986, "step": 4448 }, { "epoch": 4.624740124740125, "grad_norm": 6.849547386169434, "learning_rate": 8.362208362208362e-07, "loss": 0.6603, "step": 4449 }, { "epoch": 4.625779625779626, "grad_norm": 1.4122143983840942, "learning_rate": 8.33910833910834e-07, "loss": 0.0221, "step": 4450 }, { "epoch": 4.626819126819127, "grad_norm": 0.011345437727868557, "learning_rate": 8.316008316008318e-07, "loss": 0.0002, "step": 4451 }, { "epoch": 4.627858627858628, "grad_norm": 2.111825942993164, "learning_rate": 8.292908292908293e-07, "loss": 0.0669, "step": 4452 }, { "epoch": 4.628898128898129, "grad_norm": 0.4430828094482422, "learning_rate": 8.26980826980827e-07, "loss": 0.008, "step": 4453 }, { "epoch": 4.62993762993763, "grad_norm": 4.777477264404297, "learning_rate": 8.246708246708247e-07, "loss": 0.3356, "step": 4454 }, { "epoch": 4.630977130977131, "grad_norm": 10.0228910446167, "learning_rate": 8.223608223608224e-07, "loss": 0.2197, "step": 4455 }, 
{ "epoch": 4.632016632016632, "grad_norm": 9.412625312805176, "learning_rate": 8.200508200508202e-07, "loss": 0.6145, "step": 4456 }, { "epoch": 4.633056133056133, "grad_norm": 2.350175619125366, "learning_rate": 8.177408177408177e-07, "loss": 0.0595, "step": 4457 }, { "epoch": 4.634095634095634, "grad_norm": 0.00910639762878418, "learning_rate": 8.154308154308155e-07, "loss": 0.0002, "step": 4458 }, { "epoch": 4.635135135135135, "grad_norm": 3.016007900238037, "learning_rate": 8.131208131208132e-07, "loss": 0.0865, "step": 4459 }, { "epoch": 4.636174636174636, "grad_norm": 5.475106716156006, "learning_rate": 8.108108108108109e-07, "loss": 0.1247, "step": 4460 }, { "epoch": 4.637214137214137, "grad_norm": 2.1070916652679443, "learning_rate": 8.085008085008085e-07, "loss": 0.0539, "step": 4461 }, { "epoch": 4.638253638253638, "grad_norm": 8.32010555267334, "learning_rate": 8.061908061908062e-07, "loss": 0.2201, "step": 4462 }, { "epoch": 4.639293139293139, "grad_norm": 11.642264366149902, "learning_rate": 8.038808038808039e-07, "loss": 0.3801, "step": 4463 }, { "epoch": 4.64033264033264, "grad_norm": 1.360060691833496, "learning_rate": 8.015708015708017e-07, "loss": 0.0294, "step": 4464 }, { "epoch": 4.641372141372141, "grad_norm": 3.858584403991699, "learning_rate": 7.992607992607993e-07, "loss": 0.0837, "step": 4465 }, { "epoch": 4.642411642411642, "grad_norm": 0.033442914485931396, "learning_rate": 7.96950796950797e-07, "loss": 0.0006, "step": 4466 }, { "epoch": 4.643451143451143, "grad_norm": 2.088732957839966, "learning_rate": 7.946407946407947e-07, "loss": 0.0696, "step": 4467 }, { "epoch": 4.644490644490644, "grad_norm": 14.464248657226562, "learning_rate": 7.923307923307924e-07, "loss": 0.8585, "step": 4468 }, { "epoch": 4.645530145530145, "grad_norm": 9.30983829498291, "learning_rate": 7.900207900207901e-07, "loss": 0.8977, "step": 4469 }, { "epoch": 4.646569646569646, "grad_norm": 0.00840518344193697, "learning_rate": 7.877107877107878e-07, "loss": 0.0003, 
"step": 4470 }, { "epoch": 4.647609147609147, "grad_norm": 0.9638694524765015, "learning_rate": 7.854007854007854e-07, "loss": 0.0188, "step": 4471 }, { "epoch": 4.648648648648649, "grad_norm": 9.1583833694458, "learning_rate": 7.830907830907832e-07, "loss": 0.3321, "step": 4472 }, { "epoch": 4.649688149688149, "grad_norm": 0.06451886147260666, "learning_rate": 7.807807807807808e-07, "loss": 0.0016, "step": 4473 }, { "epoch": 4.650727650727651, "grad_norm": 0.4256269335746765, "learning_rate": 7.784707784707786e-07, "loss": 0.0079, "step": 4474 }, { "epoch": 4.651767151767151, "grad_norm": 8.92230224609375, "learning_rate": 7.761607761607762e-07, "loss": 0.4972, "step": 4475 }, { "epoch": 4.652806652806653, "grad_norm": 0.922160267829895, "learning_rate": 7.738507738507739e-07, "loss": 0.0193, "step": 4476 }, { "epoch": 4.653846153846154, "grad_norm": 1.281999111175537, "learning_rate": 7.715407715407716e-07, "loss": 0.0445, "step": 4477 }, { "epoch": 4.654885654885655, "grad_norm": 1.8824506998062134, "learning_rate": 7.692307692307694e-07, "loss": 0.0331, "step": 4478 }, { "epoch": 4.655925155925156, "grad_norm": 5.897076606750488, "learning_rate": 7.669207669207669e-07, "loss": 0.1943, "step": 4479 }, { "epoch": 4.656964656964657, "grad_norm": 4.859743118286133, "learning_rate": 7.646107646107647e-07, "loss": 0.1367, "step": 4480 }, { "epoch": 4.658004158004158, "grad_norm": 0.13624556362628937, "learning_rate": 7.623007623007623e-07, "loss": 0.0035, "step": 4481 }, { "epoch": 4.659043659043659, "grad_norm": 2.1516993045806885, "learning_rate": 7.599907599907601e-07, "loss": 0.1007, "step": 4482 }, { "epoch": 4.66008316008316, "grad_norm": 0.3950836658477783, "learning_rate": 7.576807576807578e-07, "loss": 0.0109, "step": 4483 }, { "epoch": 4.661122661122661, "grad_norm": 4.261602401733398, "learning_rate": 7.553707553707554e-07, "loss": 0.195, "step": 4484 }, { "epoch": 4.662162162162162, "grad_norm": 11.667570114135742, "learning_rate": 7.530607530607531e-07, 
"loss": 0.7816, "step": 4485 }, { "epoch": 4.663201663201663, "grad_norm": 0.5048899054527283, "learning_rate": 7.507507507507509e-07, "loss": 0.0096, "step": 4486 }, { "epoch": 4.664241164241164, "grad_norm": 0.7986839413642883, "learning_rate": 7.484407484407485e-07, "loss": 0.0319, "step": 4487 }, { "epoch": 4.665280665280665, "grad_norm": 6.918520927429199, "learning_rate": 7.461307461307461e-07, "loss": 0.254, "step": 4488 }, { "epoch": 4.666320166320166, "grad_norm": 0.032249923795461655, "learning_rate": 7.438207438207438e-07, "loss": 0.0007, "step": 4489 }, { "epoch": 4.667359667359667, "grad_norm": 0.021945137530565262, "learning_rate": 7.415107415107416e-07, "loss": 0.0004, "step": 4490 }, { "epoch": 4.668399168399168, "grad_norm": 12.442795753479004, "learning_rate": 7.392007392007393e-07, "loss": 0.7439, "step": 4491 }, { "epoch": 4.669438669438669, "grad_norm": 2.157111883163452, "learning_rate": 7.36890736890737e-07, "loss": 0.0531, "step": 4492 }, { "epoch": 4.67047817047817, "grad_norm": 0.023635277524590492, "learning_rate": 7.345807345807346e-07, "loss": 0.0006, "step": 4493 }, { "epoch": 4.671517671517671, "grad_norm": 5.190340995788574, "learning_rate": 7.322707322707323e-07, "loss": 0.1485, "step": 4494 }, { "epoch": 4.672557172557172, "grad_norm": 0.08936876803636551, "learning_rate": 7.2996072996073e-07, "loss": 0.0018, "step": 4495 }, { "epoch": 4.673596673596673, "grad_norm": 7.4356465339660645, "learning_rate": 7.276507276507278e-07, "loss": 0.313, "step": 4496 }, { "epoch": 4.674636174636174, "grad_norm": 9.407391548156738, "learning_rate": 7.253407253407253e-07, "loss": 0.7526, "step": 4497 }, { "epoch": 4.675675675675675, "grad_norm": 1.671047568321228, "learning_rate": 7.23030723030723e-07, "loss": 0.0295, "step": 4498 }, { "epoch": 4.6767151767151764, "grad_norm": 9.059718132019043, "learning_rate": 7.207207207207208e-07, "loss": 0.9281, "step": 4499 }, { "epoch": 4.6777546777546775, "grad_norm": 0.3992685079574585, "learning_rate": 
7.184107184107185e-07, "loss": 0.0069, "step": 4500 }, { "epoch": 4.6787941787941785, "grad_norm": 10.279568672180176, "learning_rate": 7.161007161007162e-07, "loss": 0.6143, "step": 4501 }, { "epoch": 4.6798336798336795, "grad_norm": 3.262343168258667, "learning_rate": 7.137907137907138e-07, "loss": 0.3237, "step": 4502 }, { "epoch": 4.6808731808731805, "grad_norm": 3.7217352390289307, "learning_rate": 7.114807114807115e-07, "loss": 0.1122, "step": 4503 }, { "epoch": 4.6819126819126815, "grad_norm": 0.7044948935508728, "learning_rate": 7.091707091707092e-07, "loss": 0.0217, "step": 4504 }, { "epoch": 4.682952182952183, "grad_norm": 5.225720405578613, "learning_rate": 7.06860706860707e-07, "loss": 0.2687, "step": 4505 }, { "epoch": 4.6839916839916835, "grad_norm": 3.316580295562744, "learning_rate": 7.045507045507045e-07, "loss": 0.1959, "step": 4506 }, { "epoch": 4.685031185031185, "grad_norm": 16.33987045288086, "learning_rate": 7.022407022407023e-07, "loss": 1.5211, "step": 4507 }, { "epoch": 4.686070686070686, "grad_norm": 7.938897132873535, "learning_rate": 6.999306999306999e-07, "loss": 0.2641, "step": 4508 }, { "epoch": 4.6871101871101875, "grad_norm": 0.2571137845516205, "learning_rate": 6.976206976206977e-07, "loss": 0.0094, "step": 4509 }, { "epoch": 4.6881496881496885, "grad_norm": 6.694173336029053, "learning_rate": 6.953106953106955e-07, "loss": 0.6494, "step": 4510 }, { "epoch": 4.6891891891891895, "grad_norm": 0.22093908488750458, "learning_rate": 6.93000693000693e-07, "loss": 0.0045, "step": 4511 }, { "epoch": 4.6902286902286905, "grad_norm": 0.020347382873296738, "learning_rate": 6.906906906906907e-07, "loss": 0.0003, "step": 4512 }, { "epoch": 4.6912681912681915, "grad_norm": 5.891967296600342, "learning_rate": 6.883806883806885e-07, "loss": 0.3368, "step": 4513 }, { "epoch": 4.6923076923076925, "grad_norm": 3.0930402278900146, "learning_rate": 6.860706860706861e-07, "loss": 0.0542, "step": 4514 }, { "epoch": 4.6933471933471935, "grad_norm": 
11.057426452636719, "learning_rate": 6.837606837606839e-07, "loss": 0.6941, "step": 4515 }, { "epoch": 4.6943866943866945, "grad_norm": 0.2353089153766632, "learning_rate": 6.814506814506814e-07, "loss": 0.0044, "step": 4516 }, { "epoch": 4.6954261954261955, "grad_norm": 0.016913872212171555, "learning_rate": 6.791406791406792e-07, "loss": 0.0005, "step": 4517 }, { "epoch": 4.696465696465697, "grad_norm": 3.098769187927246, "learning_rate": 6.768306768306769e-07, "loss": 0.0645, "step": 4518 }, { "epoch": 4.697505197505198, "grad_norm": 0.6526622772216797, "learning_rate": 6.745206745206746e-07, "loss": 0.0176, "step": 4519 }, { "epoch": 4.698544698544699, "grad_norm": 1.5812019109725952, "learning_rate": 6.722106722106722e-07, "loss": 0.0398, "step": 4520 }, { "epoch": 4.6995841995842, "grad_norm": 0.003646696452051401, "learning_rate": 6.6990066990067e-07, "loss": 0.0001, "step": 4521 }, { "epoch": 4.700623700623701, "grad_norm": 1.0215805768966675, "learning_rate": 6.675906675906676e-07, "loss": 0.0511, "step": 4522 }, { "epoch": 4.701663201663202, "grad_norm": 2.810720682144165, "learning_rate": 6.652806652806654e-07, "loss": 0.0212, "step": 4523 }, { "epoch": 4.702702702702703, "grad_norm": 2.5942187309265137, "learning_rate": 6.62970662970663e-07, "loss": 0.043, "step": 4524 }, { "epoch": 4.703742203742204, "grad_norm": 0.024856634438037872, "learning_rate": 6.606606606606607e-07, "loss": 0.0006, "step": 4525 }, { "epoch": 4.704781704781705, "grad_norm": 8.373519897460938, "learning_rate": 6.583506583506584e-07, "loss": 0.7647, "step": 4526 }, { "epoch": 4.705821205821206, "grad_norm": 0.19453732669353485, "learning_rate": 6.560406560406561e-07, "loss": 0.0029, "step": 4527 }, { "epoch": 4.706860706860707, "grad_norm": 0.9466361999511719, "learning_rate": 6.537306537306538e-07, "loss": 0.0188, "step": 4528 }, { "epoch": 4.707900207900208, "grad_norm": 0.2725234627723694, "learning_rate": 6.514206514206515e-07, "loss": 0.0052, "step": 4529 }, { "epoch": 
4.708939708939709, "grad_norm": 5.90593957901001, "learning_rate": 6.491106491106491e-07, "loss": 0.2407, "step": 4530 }, { "epoch": 4.70997920997921, "grad_norm": 8.032899856567383, "learning_rate": 6.468006468006469e-07, "loss": 0.4303, "step": 4531 }, { "epoch": 4.711018711018711, "grad_norm": 3.0624608993530273, "learning_rate": 6.444906444906446e-07, "loss": 0.0978, "step": 4532 }, { "epoch": 4.712058212058212, "grad_norm": 2.9606127738952637, "learning_rate": 6.421806421806423e-07, "loss": 0.0711, "step": 4533 }, { "epoch": 4.713097713097713, "grad_norm": 1.1739325523376465, "learning_rate": 6.398706398706399e-07, "loss": 0.2751, "step": 4534 }, { "epoch": 4.714137214137214, "grad_norm": 2.5979039669036865, "learning_rate": 6.375606375606376e-07, "loss": 0.0699, "step": 4535 }, { "epoch": 4.715176715176716, "grad_norm": 6.968671798706055, "learning_rate": 6.352506352506353e-07, "loss": 0.432, "step": 4536 }, { "epoch": 4.716216216216216, "grad_norm": 9.019438743591309, "learning_rate": 6.329406329406331e-07, "loss": 0.3449, "step": 4537 }, { "epoch": 4.717255717255718, "grad_norm": 5.718798637390137, "learning_rate": 6.306306306306306e-07, "loss": 0.2865, "step": 4538 }, { "epoch": 4.718295218295218, "grad_norm": 2.0874598026275635, "learning_rate": 6.283206283206284e-07, "loss": 0.0485, "step": 4539 }, { "epoch": 4.71933471933472, "grad_norm": 2.445598602294922, "learning_rate": 6.260106260106261e-07, "loss": 0.0968, "step": 4540 }, { "epoch": 4.720374220374221, "grad_norm": 9.9757661819458, "learning_rate": 6.237006237006238e-07, "loss": 0.5018, "step": 4541 }, { "epoch": 4.721413721413722, "grad_norm": 3.8065030574798584, "learning_rate": 6.213906213906215e-07, "loss": 0.0682, "step": 4542 }, { "epoch": 4.722453222453223, "grad_norm": 2.1041741371154785, "learning_rate": 6.190806190806191e-07, "loss": 0.0431, "step": 4543 }, { "epoch": 4.723492723492724, "grad_norm": 9.70471477508545, "learning_rate": 6.167706167706168e-07, "loss": 0.6736, "step": 4544 }, 
{ "epoch": 4.724532224532225, "grad_norm": 4.165168285369873, "learning_rate": 6.144606144606146e-07, "loss": 0.201, "step": 4545 }, { "epoch": 4.725571725571726, "grad_norm": 1.6179542541503906, "learning_rate": 6.121506121506121e-07, "loss": 0.0314, "step": 4546 }, { "epoch": 4.726611226611227, "grad_norm": 6.723485946655273, "learning_rate": 6.098406098406099e-07, "loss": 0.2245, "step": 4547 }, { "epoch": 4.727650727650728, "grad_norm": 9.48215103149414, "learning_rate": 6.075306075306076e-07, "loss": 1.1988, "step": 4548 }, { "epoch": 4.728690228690229, "grad_norm": 2.4443297386169434, "learning_rate": 6.052206052206053e-07, "loss": 0.0589, "step": 4549 }, { "epoch": 4.72972972972973, "grad_norm": 0.08004553616046906, "learning_rate": 6.02910602910603e-07, "loss": 0.0017, "step": 4550 }, { "epoch": 4.730769230769231, "grad_norm": 6.5797200202941895, "learning_rate": 6.006006006006006e-07, "loss": 0.5225, "step": 4551 }, { "epoch": 4.731808731808732, "grad_norm": 1.0097525119781494, "learning_rate": 5.982905982905984e-07, "loss": 0.0388, "step": 4552 }, { "epoch": 4.732848232848233, "grad_norm": 0.07979341596364975, "learning_rate": 5.95980595980596e-07, "loss": 0.0026, "step": 4553 }, { "epoch": 4.733887733887734, "grad_norm": 9.43011474609375, "learning_rate": 5.936705936705937e-07, "loss": 0.4164, "step": 4554 }, { "epoch": 4.734927234927235, "grad_norm": 7.376195430755615, "learning_rate": 5.913605913605914e-07, "loss": 0.4523, "step": 4555 }, { "epoch": 4.735966735966736, "grad_norm": 1.2433668375015259, "learning_rate": 5.890505890505891e-07, "loss": 0.0614, "step": 4556 }, { "epoch": 4.737006237006237, "grad_norm": 4.30912971496582, "learning_rate": 5.867405867405867e-07, "loss": 0.1076, "step": 4557 }, { "epoch": 4.738045738045738, "grad_norm": 0.028791455551981926, "learning_rate": 5.844305844305845e-07, "loss": 0.0007, "step": 4558 }, { "epoch": 4.739085239085239, "grad_norm": 0.13918457925319672, "learning_rate": 5.821205821205821e-07, "loss": 
0.0031, "step": 4559 }, { "epoch": 4.74012474012474, "grad_norm": 3.475895643234253, "learning_rate": 5.798105798105798e-07, "loss": 0.3015, "step": 4560 }, { "epoch": 4.741164241164241, "grad_norm": 0.010119406506419182, "learning_rate": 5.775005775005776e-07, "loss": 0.0003, "step": 4561 }, { "epoch": 4.742203742203742, "grad_norm": 1.498502492904663, "learning_rate": 5.751905751905752e-07, "loss": 0.0301, "step": 4562 }, { "epoch": 4.743243243243243, "grad_norm": 9.394128799438477, "learning_rate": 5.728805728805729e-07, "loss": 0.4929, "step": 4563 }, { "epoch": 4.744282744282744, "grad_norm": 0.28972873091697693, "learning_rate": 5.705705705705706e-07, "loss": 0.0087, "step": 4564 }, { "epoch": 4.745322245322245, "grad_norm": 1.398381233215332, "learning_rate": 5.682605682605683e-07, "loss": 0.0394, "step": 4565 }, { "epoch": 4.746361746361746, "grad_norm": 5.117181777954102, "learning_rate": 5.65950565950566e-07, "loss": 0.2151, "step": 4566 }, { "epoch": 4.747401247401247, "grad_norm": 6.448148250579834, "learning_rate": 5.636405636405637e-07, "loss": 0.3025, "step": 4567 }, { "epoch": 4.748440748440748, "grad_norm": 0.009139902889728546, "learning_rate": 5.613305613305614e-07, "loss": 0.0002, "step": 4568 }, { "epoch": 4.74948024948025, "grad_norm": 2.174379825592041, "learning_rate": 5.590205590205591e-07, "loss": 0.0265, "step": 4569 }, { "epoch": 4.75051975051975, "grad_norm": 6.330443382263184, "learning_rate": 5.567105567105567e-07, "loss": 0.1853, "step": 4570 }, { "epoch": 4.751559251559252, "grad_norm": 1.084800124168396, "learning_rate": 5.544005544005544e-07, "loss": 0.0169, "step": 4571 }, { "epoch": 4.752598752598753, "grad_norm": 10.983011245727539, "learning_rate": 5.520905520905522e-07, "loss": 0.3542, "step": 4572 }, { "epoch": 4.753638253638254, "grad_norm": 7.211549758911133, "learning_rate": 5.497805497805498e-07, "loss": 0.2281, "step": 4573 }, { "epoch": 4.754677754677755, "grad_norm": 0.026495108380913734, "learning_rate": 
5.474705474705475e-07, "loss": 0.0007, "step": 4574 }, { "epoch": 4.755717255717256, "grad_norm": 1.1615225076675415, "learning_rate": 5.451605451605452e-07, "loss": 0.0235, "step": 4575 }, { "epoch": 4.756756756756757, "grad_norm": 0.4452331066131592, "learning_rate": 5.428505428505429e-07, "loss": 0.0107, "step": 4576 }, { "epoch": 4.757796257796258, "grad_norm": 0.0032271817326545715, "learning_rate": 5.405405405405406e-07, "loss": 0.0001, "step": 4577 }, { "epoch": 4.758835758835759, "grad_norm": 5.279394626617432, "learning_rate": 5.382305382305382e-07, "loss": 0.3176, "step": 4578 }, { "epoch": 4.75987525987526, "grad_norm": 10.044146537780762, "learning_rate": 5.35920535920536e-07, "loss": 0.4006, "step": 4579 }, { "epoch": 4.760914760914761, "grad_norm": 2.109079599380493, "learning_rate": 5.336105336105337e-07, "loss": 0.0414, "step": 4580 }, { "epoch": 4.761954261954262, "grad_norm": 3.126941204071045, "learning_rate": 5.313005313005313e-07, "loss": 0.0685, "step": 4581 }, { "epoch": 4.762993762993763, "grad_norm": 3.8069844245910645, "learning_rate": 5.28990528990529e-07, "loss": 0.1809, "step": 4582 }, { "epoch": 4.764033264033264, "grad_norm": 2.9434378147125244, "learning_rate": 5.266805266805268e-07, "loss": 0.252, "step": 4583 }, { "epoch": 4.765072765072765, "grad_norm": 0.6716585755348206, "learning_rate": 5.243705243705244e-07, "loss": 0.0158, "step": 4584 }, { "epoch": 4.766112266112266, "grad_norm": 5.513317108154297, "learning_rate": 5.220605220605221e-07, "loss": 0.1182, "step": 4585 }, { "epoch": 4.767151767151767, "grad_norm": 3.289609670639038, "learning_rate": 5.197505197505199e-07, "loss": 0.0477, "step": 4586 }, { "epoch": 4.768191268191268, "grad_norm": 0.6133396625518799, "learning_rate": 5.174405174405175e-07, "loss": 0.0119, "step": 4587 }, { "epoch": 4.769230769230769, "grad_norm": 4.571869373321533, "learning_rate": 5.151305151305152e-07, "loss": 0.5846, "step": 4588 }, { "epoch": 4.77027027027027, "grad_norm": 8.87611198425293, 
"learning_rate": 5.128205128205128e-07, "loss": 1.162, "step": 4589 }, { "epoch": 4.771309771309771, "grad_norm": 4.447170257568359, "learning_rate": 5.105105105105106e-07, "loss": 0.3333, "step": 4590 }, { "epoch": 4.772349272349272, "grad_norm": 10.814187049865723, "learning_rate": 5.082005082005083e-07, "loss": 0.506, "step": 4591 }, { "epoch": 4.773388773388773, "grad_norm": 8.977923393249512, "learning_rate": 5.058905058905059e-07, "loss": 0.5942, "step": 4592 }, { "epoch": 4.774428274428274, "grad_norm": 8.605423927307129, "learning_rate": 5.035805035805036e-07, "loss": 0.2784, "step": 4593 }, { "epoch": 4.775467775467775, "grad_norm": 0.4176355004310608, "learning_rate": 5.012705012705014e-07, "loss": 0.0105, "step": 4594 }, { "epoch": 4.776507276507276, "grad_norm": 11.354757308959961, "learning_rate": 4.98960498960499e-07, "loss": 0.7986, "step": 4595 }, { "epoch": 4.777546777546777, "grad_norm": 2.4823195934295654, "learning_rate": 4.966504966504967e-07, "loss": 0.0429, "step": 4596 }, { "epoch": 4.778586278586278, "grad_norm": 0.0783335343003273, "learning_rate": 4.943404943404944e-07, "loss": 0.0017, "step": 4597 }, { "epoch": 4.779625779625779, "grad_norm": 0.4593014121055603, "learning_rate": 4.92030492030492e-07, "loss": 0.0121, "step": 4598 }, { "epoch": 4.78066528066528, "grad_norm": 3.2849011421203613, "learning_rate": 4.897204897204898e-07, "loss": 0.0756, "step": 4599 }, { "epoch": 4.781704781704782, "grad_norm": 8.635833740234375, "learning_rate": 4.874104874104874e-07, "loss": 0.2125, "step": 4600 }, { "epoch": 4.782744282744282, "grad_norm": 1.040634274482727, "learning_rate": 4.851004851004852e-07, "loss": 0.0258, "step": 4601 }, { "epoch": 4.783783783783784, "grad_norm": 0.10529931634664536, "learning_rate": 4.827904827904829e-07, "loss": 0.0019, "step": 4602 }, { "epoch": 4.784823284823284, "grad_norm": 7.396679401397705, "learning_rate": 4.804804804804805e-07, "loss": 0.7434, "step": 4603 }, { "epoch": 4.785862785862786, "grad_norm": 
11.395238876342773, "learning_rate": 4.781704781704783e-07, "loss": 1.2236, "step": 4604 }, { "epoch": 4.786902286902287, "grad_norm": 0.009259616024792194, "learning_rate": 4.758604758604759e-07, "loss": 0.0002, "step": 4605 }, { "epoch": 4.787941787941788, "grad_norm": 7.3028788566589355, "learning_rate": 4.735504735504736e-07, "loss": 0.2805, "step": 4606 }, { "epoch": 4.788981288981289, "grad_norm": 0.01691441796720028, "learning_rate": 4.7124047124047127e-07, "loss": 0.0003, "step": 4607 }, { "epoch": 4.79002079002079, "grad_norm": 0.09052679687738419, "learning_rate": 4.68930468930469e-07, "loss": 0.0014, "step": 4608 }, { "epoch": 4.791060291060291, "grad_norm": 3.935920476913452, "learning_rate": 4.6662046662046664e-07, "loss": 0.0984, "step": 4609 }, { "epoch": 4.792099792099792, "grad_norm": 12.794758796691895, "learning_rate": 4.6431046431046436e-07, "loss": 0.9687, "step": 4610 }, { "epoch": 4.793139293139293, "grad_norm": 1.4812192916870117, "learning_rate": 4.6200046200046207e-07, "loss": 0.0423, "step": 4611 }, { "epoch": 4.794178794178794, "grad_norm": 2.92372989654541, "learning_rate": 4.5969045969045973e-07, "loss": 0.084, "step": 4612 }, { "epoch": 4.795218295218295, "grad_norm": 3.692225217819214, "learning_rate": 4.5738045738045745e-07, "loss": 0.0971, "step": 4613 }, { "epoch": 4.796257796257796, "grad_norm": 0.20043006539344788, "learning_rate": 4.550704550704551e-07, "loss": 0.0063, "step": 4614 }, { "epoch": 4.797297297297297, "grad_norm": 6.279969215393066, "learning_rate": 4.527604527604528e-07, "loss": 0.2496, "step": 4615 }, { "epoch": 4.798336798336798, "grad_norm": 0.7676145434379578, "learning_rate": 4.504504504504505e-07, "loss": 0.0142, "step": 4616 }, { "epoch": 4.799376299376299, "grad_norm": 0.9839637875556946, "learning_rate": 4.481404481404482e-07, "loss": 0.0507, "step": 4617 }, { "epoch": 4.8004158004158, "grad_norm": 1.1574064493179321, "learning_rate": 4.4583044583044586e-07, "loss": 0.2651, "step": 4618 }, { "epoch": 
4.801455301455301, "grad_norm": 2.5498995780944824, "learning_rate": 4.435204435204436e-07, "loss": 0.074, "step": 4619 }, { "epoch": 4.802494802494802, "grad_norm": 11.176398277282715, "learning_rate": 4.412104412104413e-07, "loss": 1.3905, "step": 4620 }, { "epoch": 4.803534303534303, "grad_norm": null, "learning_rate": 4.3890043890043895e-07, "loss": 0.0692, "step": 4621 }, { "epoch": 4.8045738045738045, "grad_norm": 0.02660849131643772, "learning_rate": 4.3659043659043666e-07, "loss": 0.0005, "step": 4622 }, { "epoch": 4.8056133056133055, "grad_norm": 15.595175743103027, "learning_rate": 4.342804342804343e-07, "loss": 0.5628, "step": 4623 }, { "epoch": 4.8066528066528065, "grad_norm": 0.4597373902797699, "learning_rate": 4.3197043197043204e-07, "loss": 0.0093, "step": 4624 }, { "epoch": 4.8076923076923075, "grad_norm": 6.716993808746338, "learning_rate": 4.2966042966042965e-07, "loss": 0.3927, "step": 4625 }, { "epoch": 4.8087318087318085, "grad_norm": 3.250091314315796, "learning_rate": 4.273504273504274e-07, "loss": 0.1009, "step": 4626 }, { "epoch": 4.8097713097713095, "grad_norm": 9.529396057128906, "learning_rate": 4.25040425040425e-07, "loss": 1.1618, "step": 4627 }, { "epoch": 4.8108108108108105, "grad_norm": 0.3871168792247772, "learning_rate": 4.227304227304228e-07, "loss": 0.0091, "step": 4628 }, { "epoch": 4.8118503118503115, "grad_norm": 0.20890498161315918, "learning_rate": 4.204204204204205e-07, "loss": 0.0053, "step": 4629 }, { "epoch": 4.8128898128898125, "grad_norm": 0.05548560619354248, "learning_rate": 4.181104181104181e-07, "loss": 0.001, "step": 4630 }, { "epoch": 4.813929313929314, "grad_norm": 4.1424078941345215, "learning_rate": 4.158004158004159e-07, "loss": 0.4125, "step": 4631 }, { "epoch": 4.814968814968815, "grad_norm": 6.571180820465088, "learning_rate": 4.134904134904135e-07, "loss": 0.1227, "step": 4632 }, { "epoch": 4.8160083160083165, "grad_norm": 1.9572172164916992, "learning_rate": 4.111804111804112e-07, "loss": 0.1683, 
"step": 4633 }, { "epoch": 4.817047817047817, "grad_norm": 0.7159575819969177, "learning_rate": 4.0887040887040887e-07, "loss": 0.0164, "step": 4634 }, { "epoch": 4.8180873180873185, "grad_norm": 4.800737380981445, "learning_rate": 4.065604065604066e-07, "loss": 0.1449, "step": 4635 }, { "epoch": 4.8191268191268195, "grad_norm": 0.29823213815689087, "learning_rate": 4.0425040425040424e-07, "loss": 0.0079, "step": 4636 }, { "epoch": 4.8201663201663205, "grad_norm": 9.392901420593262, "learning_rate": 4.0194040194040195e-07, "loss": 0.4268, "step": 4637 }, { "epoch": 4.8212058212058215, "grad_norm": 0.028579136356711388, "learning_rate": 3.9963039963039967e-07, "loss": 0.0008, "step": 4638 }, { "epoch": 4.8222453222453225, "grad_norm": 2.9507510662078857, "learning_rate": 3.9732039732039733e-07, "loss": 0.0765, "step": 4639 }, { "epoch": 4.8232848232848236, "grad_norm": 7.703818321228027, "learning_rate": 3.9501039501039504e-07, "loss": 0.338, "step": 4640 }, { "epoch": 4.824324324324325, "grad_norm": 4.047659397125244, "learning_rate": 3.927003927003927e-07, "loss": 0.2549, "step": 4641 }, { "epoch": 4.825363825363826, "grad_norm": 0.6981263756752014, "learning_rate": 3.903903903903904e-07, "loss": 0.0229, "step": 4642 }, { "epoch": 4.826403326403327, "grad_norm": 4.9491777420043945, "learning_rate": 3.880803880803881e-07, "loss": 0.1437, "step": 4643 }, { "epoch": 4.827442827442828, "grad_norm": 1.2429845333099365, "learning_rate": 3.857703857703858e-07, "loss": 0.039, "step": 4644 }, { "epoch": 4.828482328482329, "grad_norm": 0.000703688245266676, "learning_rate": 3.8346038346038346e-07, "loss": 0.0, "step": 4645 }, { "epoch": 4.82952182952183, "grad_norm": 6.003587245941162, "learning_rate": 3.8115038115038117e-07, "loss": 0.2291, "step": 4646 }, { "epoch": 4.830561330561331, "grad_norm": 9.933477401733398, "learning_rate": 3.788403788403789e-07, "loss": 0.7847, "step": 4647 }, { "epoch": 4.831600831600832, "grad_norm": 14.56813907623291, "learning_rate": 
3.7653037653037655e-07, "loss": 0.6007, "step": 4648 }, { "epoch": 4.832640332640333, "grad_norm": 4.413318157196045, "learning_rate": 3.7422037422037426e-07, "loss": 0.1072, "step": 4649 }, { "epoch": 4.833679833679834, "grad_norm": 1.5492770671844482, "learning_rate": 3.719103719103719e-07, "loss": 0.0357, "step": 4650 }, { "epoch": 4.834719334719335, "grad_norm": 9.484484672546387, "learning_rate": 3.6960036960036964e-07, "loss": 0.5245, "step": 4651 }, { "epoch": 4.835758835758836, "grad_norm": 0.01474912278354168, "learning_rate": 3.672903672903673e-07, "loss": 0.0004, "step": 4652 }, { "epoch": 4.836798336798337, "grad_norm": 10.440610885620117, "learning_rate": 3.64980364980365e-07, "loss": 1.6802, "step": 4653 }, { "epoch": 4.837837837837838, "grad_norm": 3.246966600418091, "learning_rate": 3.626703626703627e-07, "loss": 0.2942, "step": 4654 }, { "epoch": 4.838877338877339, "grad_norm": 8.040372848510742, "learning_rate": 3.603603603603604e-07, "loss": 0.2389, "step": 4655 }, { "epoch": 4.83991683991684, "grad_norm": 0.1538681983947754, "learning_rate": 3.580503580503581e-07, "loss": 0.0029, "step": 4656 }, { "epoch": 4.840956340956341, "grad_norm": 8.433476448059082, "learning_rate": 3.5574035574035576e-07, "loss": 0.7995, "step": 4657 }, { "epoch": 4.841995841995842, "grad_norm": 0.0016116875922307372, "learning_rate": 3.534303534303535e-07, "loss": 0.0, "step": 4658 }, { "epoch": 4.843035343035343, "grad_norm": 4.5516133308410645, "learning_rate": 3.5112035112035114e-07, "loss": 0.1731, "step": 4659 }, { "epoch": 4.844074844074844, "grad_norm": 0.5104297399520874, "learning_rate": 3.4881034881034885e-07, "loss": 0.0108, "step": 4660 }, { "epoch": 4.845114345114345, "grad_norm": 0.16368019580841064, "learning_rate": 3.465003465003465e-07, "loss": 0.0047, "step": 4661 }, { "epoch": 4.846153846153846, "grad_norm": 9.44383716583252, "learning_rate": 3.4419034419034423e-07, "loss": 0.2378, "step": 4662 }, { "epoch": 4.847193347193347, "grad_norm": 
0.10227713733911514, "learning_rate": 3.4188034188034194e-07, "loss": 0.0021, "step": 4663 }, { "epoch": 4.848232848232849, "grad_norm": 5.760284423828125, "learning_rate": 3.395703395703396e-07, "loss": 0.2681, "step": 4664 }, { "epoch": 4.849272349272349, "grad_norm": 5.341785907745361, "learning_rate": 3.372603372603373e-07, "loss": 0.2131, "step": 4665 }, { "epoch": 4.850311850311851, "grad_norm": 7.600803852081299, "learning_rate": 3.34950334950335e-07, "loss": 0.3075, "step": 4666 }, { "epoch": 4.851351351351351, "grad_norm": 0.27108874917030334, "learning_rate": 3.326403326403327e-07, "loss": 0.0066, "step": 4667 }, { "epoch": 4.852390852390853, "grad_norm": 26.737104415893555, "learning_rate": 3.3033033033033036e-07, "loss": 0.524, "step": 4668 }, { "epoch": 4.853430353430354, "grad_norm": 1.14380943775177, "learning_rate": 3.2802032802032807e-07, "loss": 0.0217, "step": 4669 }, { "epoch": 4.854469854469855, "grad_norm": 5.919650077819824, "learning_rate": 3.2571032571032573e-07, "loss": 0.3197, "step": 4670 }, { "epoch": 4.855509355509356, "grad_norm": 3.418506622314453, "learning_rate": 3.2340032340032345e-07, "loss": 0.1859, "step": 4671 }, { "epoch": 4.856548856548857, "grad_norm": 6.070278167724609, "learning_rate": 3.2109032109032116e-07, "loss": 0.1995, "step": 4672 }, { "epoch": 4.857588357588358, "grad_norm": 0.038140926510095596, "learning_rate": 3.187803187803188e-07, "loss": 0.0008, "step": 4673 }, { "epoch": 4.858627858627859, "grad_norm": 0.01792663149535656, "learning_rate": 3.1647031647031654e-07, "loss": 0.0003, "step": 4674 }, { "epoch": 4.85966735966736, "grad_norm": 11.694622039794922, "learning_rate": 3.141603141603142e-07, "loss": 0.6092, "step": 4675 }, { "epoch": 4.860706860706861, "grad_norm": 0.8555666208267212, "learning_rate": 3.118503118503119e-07, "loss": 0.018, "step": 4676 }, { "epoch": 4.861746361746362, "grad_norm": 1.2930259704589844, "learning_rate": 3.0954030954030957e-07, "loss": 0.0328, "step": 4677 }, { "epoch": 
4.862785862785863, "grad_norm": 0.6727796792984009, "learning_rate": 3.072303072303073e-07, "loss": 0.0152, "step": 4678 }, { "epoch": 4.863825363825364, "grad_norm": 0.08168227225542068, "learning_rate": 3.0492030492030495e-07, "loss": 0.0014, "step": 4679 }, { "epoch": 4.864864864864865, "grad_norm": 13.4307279586792, "learning_rate": 3.0261030261030266e-07, "loss": 1.2033, "step": 4680 }, { "epoch": 4.865904365904366, "grad_norm": 1.3976296186447144, "learning_rate": 3.003003003003003e-07, "loss": 0.0443, "step": 4681 }, { "epoch": 4.866943866943867, "grad_norm": 5.100250244140625, "learning_rate": 2.97990297990298e-07, "loss": 0.2364, "step": 4682 }, { "epoch": 4.867983367983368, "grad_norm": 1.120133399963379, "learning_rate": 2.956802956802957e-07, "loss": 0.0242, "step": 4683 }, { "epoch": 4.869022869022869, "grad_norm": 9.938117980957031, "learning_rate": 2.9337029337029336e-07, "loss": 0.483, "step": 4684 }, { "epoch": 4.87006237006237, "grad_norm": 0.07843750715255737, "learning_rate": 2.910602910602911e-07, "loss": 0.0017, "step": 4685 }, { "epoch": 4.871101871101871, "grad_norm": 4.453456401824951, "learning_rate": 2.887502887502888e-07, "loss": 0.113, "step": 4686 }, { "epoch": 4.872141372141372, "grad_norm": 1.9144773483276367, "learning_rate": 2.8644028644028645e-07, "loss": 0.0655, "step": 4687 }, { "epoch": 4.873180873180873, "grad_norm": 8.914125442504883, "learning_rate": 2.8413028413028416e-07, "loss": 0.7485, "step": 4688 }, { "epoch": 4.874220374220374, "grad_norm": 9.680907249450684, "learning_rate": 2.818202818202818e-07, "loss": 0.9015, "step": 4689 }, { "epoch": 4.875259875259875, "grad_norm": 5.181673526763916, "learning_rate": 2.7951027951027954e-07, "loss": 0.0536, "step": 4690 }, { "epoch": 4.876299376299376, "grad_norm": 7.63839864730835, "learning_rate": 2.772002772002772e-07, "loss": 0.5203, "step": 4691 }, { "epoch": 4.877338877338877, "grad_norm": 2.4442501068115234, "learning_rate": 2.748902748902749e-07, "loss": 0.069, "step": 
4692 }, { "epoch": 4.878378378378378, "grad_norm": 11.12451457977295, "learning_rate": 2.725802725802726e-07, "loss": 0.8014, "step": 4693 }, { "epoch": 4.879417879417879, "grad_norm": 1.978690505027771, "learning_rate": 2.702702702702703e-07, "loss": 0.054, "step": 4694 }, { "epoch": 4.88045738045738, "grad_norm": 0.7453691959381104, "learning_rate": 2.67960267960268e-07, "loss": 0.0075, "step": 4695 }, { "epoch": 4.881496881496881, "grad_norm": 0.621982753276825, "learning_rate": 2.6565026565026567e-07, "loss": 0.014, "step": 4696 }, { "epoch": 4.882536382536383, "grad_norm": 1.5242571830749512, "learning_rate": 2.633402633402634e-07, "loss": 0.0337, "step": 4697 }, { "epoch": 4.883575883575883, "grad_norm": 7.722758769989014, "learning_rate": 2.6103026103026104e-07, "loss": 0.3065, "step": 4698 }, { "epoch": 4.884615384615385, "grad_norm": 9.232809066772461, "learning_rate": 2.5872025872025876e-07, "loss": 0.5869, "step": 4699 }, { "epoch": 4.885654885654886, "grad_norm": 7.493601322174072, "learning_rate": 2.564102564102564e-07, "loss": 0.7301, "step": 4700 }, { "epoch": 4.886694386694387, "grad_norm": 3.2477588653564453, "learning_rate": 2.5410025410025413e-07, "loss": 0.059, "step": 4701 }, { "epoch": 4.887733887733888, "grad_norm": 2.048398017883301, "learning_rate": 2.517902517902518e-07, "loss": 0.0386, "step": 4702 }, { "epoch": 4.888773388773389, "grad_norm": 10.010037422180176, "learning_rate": 2.494802494802495e-07, "loss": 0.5694, "step": 4703 }, { "epoch": 4.88981288981289, "grad_norm": 5.604851245880127, "learning_rate": 2.471702471702472e-07, "loss": 0.4441, "step": 4704 }, { "epoch": 4.890852390852391, "grad_norm": 1.5705468654632568, "learning_rate": 2.448602448602449e-07, "loss": 0.2935, "step": 4705 }, { "epoch": 4.891891891891892, "grad_norm": 8.666454315185547, "learning_rate": 2.425502425502426e-07, "loss": 0.9524, "step": 4706 }, { "epoch": 4.892931392931393, "grad_norm": 3.345090389251709, "learning_rate": 2.4024024024024026e-07, "loss": 
0.096, "step": 4707 }, { "epoch": 4.893970893970894, "grad_norm": 0.24659332633018494, "learning_rate": 2.3793023793023795e-07, "loss": 0.0042, "step": 4708 }, { "epoch": 4.895010395010395, "grad_norm": 3.87328839302063, "learning_rate": 2.3562023562023563e-07, "loss": 0.1188, "step": 4709 }, { "epoch": 4.896049896049896, "grad_norm": 0.04412136226892471, "learning_rate": 2.3331023331023332e-07, "loss": 0.0009, "step": 4710 }, { "epoch": 4.897089397089397, "grad_norm": 0.9146865010261536, "learning_rate": 2.3100023100023104e-07, "loss": 0.0171, "step": 4711 }, { "epoch": 4.898128898128898, "grad_norm": 2.897141456604004, "learning_rate": 2.2869022869022872e-07, "loss": 0.0577, "step": 4712 }, { "epoch": 4.899168399168399, "grad_norm": 7.913065433502197, "learning_rate": 2.263802263802264e-07, "loss": 0.6139, "step": 4713 }, { "epoch": 4.9002079002079, "grad_norm": 21.14871597290039, "learning_rate": 2.240702240702241e-07, "loss": 1.5482, "step": 4714 }, { "epoch": 4.901247401247401, "grad_norm": 0.7635882496833801, "learning_rate": 2.217602217602218e-07, "loss": 0.0128, "step": 4715 }, { "epoch": 4.902286902286902, "grad_norm": 5.794342994689941, "learning_rate": 2.1945021945021948e-07, "loss": 0.1829, "step": 4716 }, { "epoch": 4.903326403326403, "grad_norm": 8.451240539550781, "learning_rate": 2.1714021714021716e-07, "loss": 0.1605, "step": 4717 }, { "epoch": 4.904365904365904, "grad_norm": 4.783844470977783, "learning_rate": 2.1483021483021482e-07, "loss": 0.0833, "step": 4718 }, { "epoch": 4.905405405405405, "grad_norm": 9.323697090148926, "learning_rate": 2.125202125202125e-07, "loss": 0.9744, "step": 4719 }, { "epoch": 4.906444906444906, "grad_norm": 0.004284221678972244, "learning_rate": 2.1021021021021025e-07, "loss": 0.0001, "step": 4720 }, { "epoch": 4.907484407484407, "grad_norm": 0.03907490149140358, "learning_rate": 2.0790020790020794e-07, "loss": 0.0011, "step": 4721 }, { "epoch": 4.908523908523908, "grad_norm": 1.798534631729126, "learning_rate": 
2.055902055902056e-07, "loss": 0.0782, "step": 4722 }, { "epoch": 4.909563409563409, "grad_norm": 0.10509315133094788, "learning_rate": 2.032802032802033e-07, "loss": 0.0033, "step": 4723 }, { "epoch": 4.91060291060291, "grad_norm": 0.04313625395298004, "learning_rate": 2.0097020097020098e-07, "loss": 0.0008, "step": 4724 }, { "epoch": 4.911642411642411, "grad_norm": 3.6045427322387695, "learning_rate": 1.9866019866019867e-07, "loss": 0.0667, "step": 4725 }, { "epoch": 4.912681912681912, "grad_norm": 0.20354896783828735, "learning_rate": 1.9635019635019635e-07, "loss": 0.0038, "step": 4726 }, { "epoch": 4.913721413721413, "grad_norm": 3.984830856323242, "learning_rate": 1.9404019404019404e-07, "loss": 0.2169, "step": 4727 }, { "epoch": 4.914760914760915, "grad_norm": 4.478424072265625, "learning_rate": 1.9173019173019173e-07, "loss": 0.2199, "step": 4728 }, { "epoch": 4.915800415800415, "grad_norm": 0.11140380799770355, "learning_rate": 1.8942018942018944e-07, "loss": 0.0027, "step": 4729 }, { "epoch": 4.916839916839917, "grad_norm": 2.437990665435791, "learning_rate": 1.8711018711018713e-07, "loss": 0.2274, "step": 4730 }, { "epoch": 4.917879417879417, "grad_norm": 11.711197853088379, "learning_rate": 1.8480018480018482e-07, "loss": 1.0555, "step": 4731 }, { "epoch": 4.918918918918919, "grad_norm": 13.96945571899414, "learning_rate": 1.824901824901825e-07, "loss": 1.3861, "step": 4732 }, { "epoch": 4.91995841995842, "grad_norm": 8.046520233154297, "learning_rate": 1.801801801801802e-07, "loss": 0.5197, "step": 4733 }, { "epoch": 4.920997920997921, "grad_norm": 3.5241611003875732, "learning_rate": 1.7787017787017788e-07, "loss": 0.1781, "step": 4734 }, { "epoch": 4.922037422037422, "grad_norm": 0.1690087765455246, "learning_rate": 1.7556017556017557e-07, "loss": 0.0038, "step": 4735 }, { "epoch": 4.923076923076923, "grad_norm": 1.308184027671814, "learning_rate": 1.7325017325017326e-07, "loss": 0.0249, "step": 4736 }, { "epoch": 4.924116424116424, "grad_norm": 
0.6014237999916077, "learning_rate": 1.7094017094017097e-07, "loss": 0.0142, "step": 4737 }, { "epoch": 4.925155925155925, "grad_norm": 3.1152658462524414, "learning_rate": 1.6863016863016866e-07, "loss": 0.0834, "step": 4738 }, { "epoch": 4.926195426195426, "grad_norm": 3.200228452682495, "learning_rate": 1.6632016632016635e-07, "loss": 0.0566, "step": 4739 }, { "epoch": 4.927234927234927, "grad_norm": 6.555817127227783, "learning_rate": 1.6401016401016403e-07, "loss": 0.1928, "step": 4740 }, { "epoch": 4.928274428274428, "grad_norm": 10.922245979309082, "learning_rate": 1.6170016170016172e-07, "loss": 1.2626, "step": 4741 }, { "epoch": 4.929313929313929, "grad_norm": 8.992452621459961, "learning_rate": 1.593901593901594e-07, "loss": 0.381, "step": 4742 }, { "epoch": 4.93035343035343, "grad_norm": 1.0346816778182983, "learning_rate": 1.570801570801571e-07, "loss": 0.0261, "step": 4743 }, { "epoch": 4.9313929313929314, "grad_norm": 0.0066374437883496284, "learning_rate": 1.5477015477015479e-07, "loss": 0.0002, "step": 4744 }, { "epoch": 4.9324324324324325, "grad_norm": 0.47477760910987854, "learning_rate": 1.5246015246015247e-07, "loss": 0.0115, "step": 4745 }, { "epoch": 4.9334719334719335, "grad_norm": 10.687585830688477, "learning_rate": 1.5015015015015016e-07, "loss": 1.1373, "step": 4746 }, { "epoch": 4.9345114345114345, "grad_norm": 7.225226879119873, "learning_rate": 1.4784014784014785e-07, "loss": 0.2633, "step": 4747 }, { "epoch": 4.9355509355509355, "grad_norm": 2.706840991973877, "learning_rate": 1.4553014553014554e-07, "loss": 0.0638, "step": 4748 }, { "epoch": 4.9365904365904365, "grad_norm": 4.533247470855713, "learning_rate": 1.4322014322014322e-07, "loss": 0.2218, "step": 4749 }, { "epoch": 4.9376299376299375, "grad_norm": 9.23600959777832, "learning_rate": 1.409101409101409e-07, "loss": 0.2363, "step": 4750 }, { "epoch": 4.9386694386694385, "grad_norm": 4.492886543273926, "learning_rate": 1.386001386001386e-07, "loss": 0.1644, "step": 4751 }, { 
"epoch": 4.9397089397089395, "grad_norm": 1.5420405864715576, "learning_rate": 1.362901362901363e-07, "loss": 0.2551, "step": 4752 }, { "epoch": 4.9407484407484406, "grad_norm": 2.434460163116455, "learning_rate": 1.33980133980134e-07, "loss": 0.0508, "step": 4753 }, { "epoch": 4.941787941787942, "grad_norm": 6.422155857086182, "learning_rate": 1.316701316701317e-07, "loss": 0.2384, "step": 4754 }, { "epoch": 4.942827442827443, "grad_norm": 0.10697789490222931, "learning_rate": 1.2936012936012938e-07, "loss": 0.0013, "step": 4755 }, { "epoch": 4.943866943866944, "grad_norm": 4.407161712646484, "learning_rate": 1.2705012705012707e-07, "loss": 0.2158, "step": 4756 }, { "epoch": 4.944906444906445, "grad_norm": 1.721889853477478, "learning_rate": 1.2474012474012475e-07, "loss": 0.0384, "step": 4757 }, { "epoch": 4.945945945945946, "grad_norm": 4.189394950866699, "learning_rate": 1.2243012243012244e-07, "loss": 0.1161, "step": 4758 }, { "epoch": 4.946985446985447, "grad_norm": 7.549504280090332, "learning_rate": 1.2012012012012013e-07, "loss": 0.2183, "step": 4759 }, { "epoch": 4.948024948024948, "grad_norm": 5.708642959594727, "learning_rate": 1.1781011781011782e-07, "loss": 0.4252, "step": 4760 }, { "epoch": 4.9490644490644495, "grad_norm": 2.221500873565674, "learning_rate": 1.1550011550011552e-07, "loss": 0.0502, "step": 4761 }, { "epoch": 4.95010395010395, "grad_norm": 0.014133865013718605, "learning_rate": 1.131901131901132e-07, "loss": 0.0003, "step": 4762 }, { "epoch": 4.951143451143452, "grad_norm": 0.6203381419181824, "learning_rate": 1.108801108801109e-07, "loss": 0.0177, "step": 4763 }, { "epoch": 4.952182952182953, "grad_norm": 8.374246597290039, "learning_rate": 1.0857010857010858e-07, "loss": 0.7584, "step": 4764 }, { "epoch": 4.953222453222454, "grad_norm": 5.350039482116699, "learning_rate": 1.0626010626010626e-07, "loss": 0.1403, "step": 4765 }, { "epoch": 4.954261954261955, "grad_norm": 0.00029671346419490874, "learning_rate": 1.0395010395010397e-07, 
"loss": 0.0, "step": 4766 }, { "epoch": 4.955301455301456, "grad_norm": 0.12956593930721283, "learning_rate": 1.0164010164010164e-07, "loss": 0.0037, "step": 4767 }, { "epoch": 4.956340956340957, "grad_norm": 6.041904449462891, "learning_rate": 9.933009933009933e-08, "loss": 0.1916, "step": 4768 }, { "epoch": 4.957380457380458, "grad_norm": 2.301347255706787, "learning_rate": 9.702009702009702e-08, "loss": 0.0397, "step": 4769 }, { "epoch": 4.958419958419959, "grad_norm": 11.286805152893066, "learning_rate": 9.471009471009472e-08, "loss": 0.6097, "step": 4770 }, { "epoch": 4.95945945945946, "grad_norm": 1.3557261228561401, "learning_rate": 9.240009240009241e-08, "loss": 0.0239, "step": 4771 }, { "epoch": 4.960498960498961, "grad_norm": 1.0138343572616577, "learning_rate": 9.00900900900901e-08, "loss": 0.0236, "step": 4772 }, { "epoch": 4.961538461538462, "grad_norm": 4.4559006690979, "learning_rate": 8.778008778008778e-08, "loss": 0.1399, "step": 4773 }, { "epoch": 4.962577962577963, "grad_norm": 5.926146984100342, "learning_rate": 8.547008547008549e-08, "loss": 0.3067, "step": 4774 }, { "epoch": 4.963617463617464, "grad_norm": 9.967759132385254, "learning_rate": 8.316008316008317e-08, "loss": 0.5148, "step": 4775 }, { "epoch": 4.964656964656965, "grad_norm": 1.4153497219085693, "learning_rate": 8.085008085008086e-08, "loss": 0.024, "step": 4776 }, { "epoch": 4.965696465696466, "grad_norm": 0.16361723840236664, "learning_rate": 7.854007854007855e-08, "loss": 0.0024, "step": 4777 }, { "epoch": 4.966735966735967, "grad_norm": 0.07358971983194351, "learning_rate": 7.623007623007624e-08, "loss": 0.0019, "step": 4778 }, { "epoch": 4.967775467775468, "grad_norm": 5.493592739105225, "learning_rate": 7.392007392007392e-08, "loss": 0.1629, "step": 4779 }, { "epoch": 4.968814968814969, "grad_norm": 6.8674139976501465, "learning_rate": 7.161007161007161e-08, "loss": 0.2934, "step": 4780 }, { "epoch": 4.96985446985447, "grad_norm": 3.8765203952789307, "learning_rate": 
6.93000693000693e-08, "loss": 0.117, "step": 4781 }, { "epoch": 4.970893970893971, "grad_norm": 4.589378833770752, "learning_rate": 6.6990066990067e-08, "loss": 0.3599, "step": 4782 }, { "epoch": 4.971933471933472, "grad_norm": 0.12670215964317322, "learning_rate": 6.468006468006469e-08, "loss": 0.0035, "step": 4783 }, { "epoch": 4.972972972972973, "grad_norm": 6.151996612548828, "learning_rate": 6.237006237006238e-08, "loss": 0.2022, "step": 4784 }, { "epoch": 4.974012474012474, "grad_norm": 10.151179313659668, "learning_rate": 6.006006006006006e-08, "loss": 0.5903, "step": 4785 }, { "epoch": 4.975051975051975, "grad_norm": 3.344877004623413, "learning_rate": 5.775005775005776e-08, "loss": 0.0535, "step": 4786 }, { "epoch": 4.976091476091476, "grad_norm": 6.596832752227783, "learning_rate": 5.544005544005545e-08, "loss": 0.5492, "step": 4787 }, { "epoch": 4.977130977130977, "grad_norm": 2.9688796997070312, "learning_rate": 5.313005313005313e-08, "loss": 0.0638, "step": 4788 }, { "epoch": 4.978170478170478, "grad_norm": 1.099982500076294, "learning_rate": 5.082005082005082e-08, "loss": 0.0387, "step": 4789 }, { "epoch": 4.979209979209979, "grad_norm": 4.487499713897705, "learning_rate": 4.851004851004851e-08, "loss": 0.1387, "step": 4790 }, { "epoch": 4.98024948024948, "grad_norm": 8.446113586425781, "learning_rate": 4.6200046200046205e-08, "loss": 1.1905, "step": 4791 }, { "epoch": 4.981288981288982, "grad_norm": 8.844257354736328, "learning_rate": 4.389004389004389e-08, "loss": 0.4875, "step": 4792 }, { "epoch": 4.982328482328482, "grad_norm": 0.06120000034570694, "learning_rate": 4.158004158004159e-08, "loss": 0.0015, "step": 4793 }, { "epoch": 4.983367983367984, "grad_norm": 6.0494279861450195, "learning_rate": 3.9270039270039275e-08, "loss": 0.3082, "step": 4794 }, { "epoch": 4.984407484407484, "grad_norm": 9.200035095214844, "learning_rate": 3.696003696003696e-08, "loss": 0.4815, "step": 4795 }, { "epoch": 4.985446985446986, "grad_norm": 0.027429446578025818, 
"learning_rate": 3.465003465003465e-08, "loss": 0.0007, "step": 4796 }, { "epoch": 4.986486486486487, "grad_norm": 2.307527542114258, "learning_rate": 3.2340032340032345e-08, "loss": 0.0316, "step": 4797 }, { "epoch": 4.987525987525988, "grad_norm": 4.906373500823975, "learning_rate": 3.003003003003003e-08, "loss": 0.1492, "step": 4798 }, { "epoch": 4.988565488565489, "grad_norm": 1.1830755472183228, "learning_rate": 2.7720027720027723e-08, "loss": 0.0295, "step": 4799 }, { "epoch": 4.98960498960499, "grad_norm": 7.704885482788086, "learning_rate": 2.541002541002541e-08, "loss": 0.4183, "step": 4800 }, { "epoch": 4.990644490644491, "grad_norm": 1.4846118688583374, "learning_rate": 2.3100023100023102e-08, "loss": 0.0445, "step": 4801 }, { "epoch": 4.991683991683992, "grad_norm": 0.034488894045352936, "learning_rate": 2.0790020790020793e-08, "loss": 0.0008, "step": 4802 }, { "epoch": 4.992723492723493, "grad_norm": 1.4385161399841309, "learning_rate": 1.848001848001848e-08, "loss": 0.0268, "step": 4803 }, { "epoch": 4.993762993762994, "grad_norm": 10.849470138549805, "learning_rate": 1.6170016170016172e-08, "loss": 0.7787, "step": 4804 }, { "epoch": 4.994802494802495, "grad_norm": 0.08578181266784668, "learning_rate": 1.3860013860013862e-08, "loss": 0.0029, "step": 4805 }, { "epoch": 4.995841995841996, "grad_norm": 1.4152899980545044, "learning_rate": 1.1550011550011551e-08, "loss": 0.0174, "step": 4806 }, { "epoch": 4.996881496881497, "grad_norm": 8.894526481628418, "learning_rate": 9.24000924000924e-09, "loss": 0.4251, "step": 4807 }, { "epoch": 4.997920997920998, "grad_norm": 0.022568803280591965, "learning_rate": 6.930006930006931e-09, "loss": 0.0005, "step": 4808 }, { "epoch": 4.998960498960499, "grad_norm": 5.386369228363037, "learning_rate": 4.62000462000462e-09, "loss": 0.1613, "step": 4809 }, { "epoch": 5.0, "grad_norm": 5.23478364944458, "learning_rate": 2.31000231000231e-09, "loss": 0.1585, "step": 4810 }, { "epoch": 5.0, "step": 4810, "total_flos": 0.0, 
"train_loss": 0.37849533693159876, "train_runtime": 13899.499, "train_samples_per_second": 2.768, "train_steps_per_second": 0.346 } ], "logging_steps": 1.0, "max_steps": 4810, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }