{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 3147, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009534024550113217, "grad_norm": 53.393329759395314, "learning_rate": 0.0, "loss": 4.7998, "step": 1 }, { "epoch": 0.0019068049100226434, "grad_norm": 60.262461056130604, "learning_rate": 3.174603174603175e-08, "loss": 4.8186, "step": 2 }, { "epoch": 0.002860207365033965, "grad_norm": 54.39964133478734, "learning_rate": 6.34920634920635e-08, "loss": 4.8369, "step": 3 }, { "epoch": 0.003813609820045287, "grad_norm": 56.76879051973779, "learning_rate": 9.523809523809525e-08, "loss": 4.841, "step": 4 }, { "epoch": 0.004767012275056608, "grad_norm": 47.2038250907207, "learning_rate": 1.26984126984127e-07, "loss": 4.5383, "step": 5 }, { "epoch": 0.00572041473006793, "grad_norm": 49.627189033467495, "learning_rate": 1.5873015873015874e-07, "loss": 4.6592, "step": 6 }, { "epoch": 0.006673817185079252, "grad_norm": 56.906412143809526, "learning_rate": 1.904761904761905e-07, "loss": 4.8986, "step": 7 }, { "epoch": 0.007627219640090574, "grad_norm": 78.78347732427558, "learning_rate": 2.2222222222222224e-07, "loss": 5.1051, "step": 8 }, { "epoch": 0.008580622095101895, "grad_norm": 42.1819003138915, "learning_rate": 2.53968253968254e-07, "loss": 4.674, "step": 9 }, { "epoch": 0.009534024550113216, "grad_norm": 40.23999513431063, "learning_rate": 2.8571428571428575e-07, "loss": 4.572, "step": 10 }, { "epoch": 0.010487427005124539, "grad_norm": 57.59064272602256, "learning_rate": 3.174603174603175e-07, "loss": 4.8437, "step": 11 }, { "epoch": 0.01144082946013586, "grad_norm": 53.90374033245987, "learning_rate": 3.492063492063492e-07, "loss": 4.9861, "step": 12 }, { "epoch": 0.012394231915147182, "grad_norm": 46.81537531829387, "learning_rate": 3.80952380952381e-07, "loss": 4.8131, "step": 13 }, { "epoch": 0.013347634370158503, "grad_norm": 51.145443272814205, "learning_rate": 4.126984126984127e-07, "loss": 4.6762, "step": 14 }, { "epoch": 0.014301036825169824, "grad_norm": 56.757914232219875, "learning_rate": 4.444444444444445e-07, "loss": 4.9432, "step": 15 }, { "epoch": 0.015254439280181147, "grad_norm": 40.426102550779106, "learning_rate": 4.7619047619047623e-07, "loss": 4.5409, "step": 16 }, { "epoch": 0.016207841735192467, "grad_norm": 47.69865239392605, "learning_rate": 5.07936507936508e-07, "loss": 4.8296, "step": 17 }, { "epoch": 0.01716124419020379, "grad_norm": 46.99488136446772, "learning_rate": 5.396825396825398e-07, "loss": 4.6262, "step": 18 }, { "epoch": 0.018114646645215112, "grad_norm": 55.236561715688104, "learning_rate": 5.714285714285715e-07, "loss": 4.6588, "step": 19 }, { "epoch": 0.01906804910022643, "grad_norm": 51.29207128464149, "learning_rate": 6.031746031746032e-07, "loss": 4.7611, "step": 20 }, { "epoch": 0.020021451555237754, "grad_norm": 53.97242664444237, "learning_rate": 6.34920634920635e-07, "loss": 4.7534, "step": 21 }, { "epoch": 0.020974854010249077, "grad_norm": 45.88966513764988, "learning_rate": 6.666666666666667e-07, "loss": 4.6266, "step": 22 }, { "epoch": 0.021928256465260396, "grad_norm": 61.713040202586775, "learning_rate": 6.984126984126984e-07, "loss": 4.7537, "step": 23 }, { "epoch": 0.02288165892027172, "grad_norm": 49.621617375772054, "learning_rate": 7.301587301587303e-07, "loss": 4.3054, "step": 24 }, { "epoch": 0.023835061375283042, "grad_norm": 51.39821924920083, "learning_rate": 7.61904761904762e-07, "loss": 4.7478, "step": 25 }, { "epoch": 0.024788463830294365, "grad_norm": 48.06786067251509, "learning_rate": 7.936507936507937e-07, "loss": 4.6197, "step": 26 }, { "epoch": 0.025741866285305684, "grad_norm": 63.0033364156634, "learning_rate": 8.253968253968254e-07, "loss": 4.8118, "step": 27 }, { "epoch": 0.026695268740317007, "grad_norm": 52.62154979696581, "learning_rate": 8.571428571428572e-07, "loss": 4.676, "step": 28 }, { "epoch": 0.02764867119532833, "grad_norm": 41.788739986643456, "learning_rate": 8.88888888888889e-07, "loss": 4.6958, "step": 29 }, { "epoch": 0.02860207365033965, "grad_norm": 70.23728901185979, "learning_rate": 9.206349206349208e-07, "loss": 5.0378, "step": 30 }, { "epoch": 0.02955547610535097, "grad_norm": 40.67890400239111, "learning_rate": 9.523809523809525e-07, "loss": 4.3834, "step": 31 }, { "epoch": 0.030508878560362294, "grad_norm": 68.97728345917278, "learning_rate": 9.84126984126984e-07, "loss": 4.8255, "step": 32 }, { "epoch": 0.031462281015373614, "grad_norm": 50.53434715681306, "learning_rate": 1.015873015873016e-06, "loss": 4.6184, "step": 33 }, { "epoch": 0.03241568347038493, "grad_norm": 52.56755185587121, "learning_rate": 1.0476190476190478e-06, "loss": 4.6559, "step": 34 }, { "epoch": 0.03336908592539626, "grad_norm": 41.40501715285961, "learning_rate": 1.0793650793650795e-06, "loss": 4.4155, "step": 35 }, { "epoch": 0.03432248838040758, "grad_norm": 69.69473410414915, "learning_rate": 1.111111111111111e-06, "loss": 4.6984, "step": 36 }, { "epoch": 0.0352758908354189, "grad_norm": 40.825035669527416, "learning_rate": 1.142857142857143e-06, "loss": 4.7524, "step": 37 }, { "epoch": 0.036229293290430224, "grad_norm": 40.97955880825247, "learning_rate": 1.1746031746031747e-06, "loss": 4.3261, "step": 38 }, { "epoch": 0.037182695745441544, "grad_norm": 37.689162105786906, "learning_rate": 1.2063492063492065e-06, "loss": 4.4312, "step": 39 }, { "epoch": 0.03813609820045286, "grad_norm": 47.4021932571967, "learning_rate": 1.2380952380952382e-06, "loss": 4.5223, "step": 40 }, { "epoch": 0.03908950065546419, "grad_norm": 54.280336743224225, "learning_rate": 1.26984126984127e-06, "loss": 4.5006, "step": 41 }, { "epoch": 0.04004290311047551, "grad_norm": 46.908088196299154, "learning_rate": 1.3015873015873019e-06, "loss": 4.6186, "step": 42 }, { "epoch": 0.04099630556548683, "grad_norm": 42.41701250202182, "learning_rate": 1.3333333333333334e-06, "loss": 4.2628, "step": 43 }, { "epoch": 0.041949708020498154, "grad_norm": 45.47516801846036, "learning_rate": 1.3650793650793652e-06, "loss": 4.4519, "step": 44 }, { "epoch": 0.04290311047550947, "grad_norm": 45.61375367801291, "learning_rate": 1.3968253968253969e-06, "loss": 4.4334, "step": 45 }, { "epoch": 0.04385651293052079, "grad_norm": 49.888014687260224, "learning_rate": 1.4285714285714286e-06, "loss": 4.5774, "step": 46 }, { "epoch": 0.04480991538553212, "grad_norm": 34.10295675936898, "learning_rate": 1.4603174603174606e-06, "loss": 4.2928, "step": 47 }, { "epoch": 0.04576331784054344, "grad_norm": 40.86416869646028, "learning_rate": 1.492063492063492e-06, "loss": 4.297, "step": 48 }, { "epoch": 0.046716720295554764, "grad_norm": 40.86506614925927, "learning_rate": 1.523809523809524e-06, "loss": 4.2778, "step": 49 }, { "epoch": 0.047670122750566084, "grad_norm": 42.23149274852975, "learning_rate": 1.5555555555555558e-06, "loss": 4.1276, "step": 50 }, { "epoch": 0.0486235252055774, "grad_norm": 51.85162673017363, "learning_rate": 1.5873015873015873e-06, "loss": 4.5038, "step": 51 }, { "epoch": 0.04957692766058873, "grad_norm": 38.606695973322864, "learning_rate": 1.6190476190476193e-06, "loss": 4.1797, "step": 52 }, { "epoch": 0.05053033011560005, "grad_norm": 33.95817780818485, "learning_rate": 1.6507936507936508e-06, "loss": 4.1247, "step": 53 }, { "epoch": 0.05148373257061137, "grad_norm": 41.73698850436416, "learning_rate": 1.6825396825396827e-06, "loss": 4.0811, "step": 54 }, { "epoch": 0.052437135025622694, "grad_norm": 34.771668477695876, "learning_rate": 1.7142857142857145e-06, "loss": 4.0516, "step": 55 }, { "epoch": 0.053390537480634014, "grad_norm": 35.14518607886963, "learning_rate": 1.746031746031746e-06, "loss": 4.0843, "step": 56 }, { "epoch": 0.05434393993564533, "grad_norm": 32.498805677813785, "learning_rate": 1.777777777777778e-06, "loss": 3.9998, "step": 57 }, { "epoch": 0.05529734239065666, "grad_norm": 40.10656147582167, "learning_rate": 1.8095238095238097e-06, "loss": 4.3555, "step": 58 }, { "epoch": 0.05625074484566798, "grad_norm": 38.79383986324083, "learning_rate": 1.8412698412698416e-06, "loss": 4.2186, "step": 59 }, { "epoch": 0.0572041473006793, "grad_norm": 36.27663520182929, "learning_rate": 1.8730158730158732e-06, "loss": 4.2333, "step": 60 }, { "epoch": 0.058157549755690624, "grad_norm": 28.58023237972772, "learning_rate": 1.904761904761905e-06, "loss": 4.1511, "step": 61 }, { "epoch": 0.05911095221070194, "grad_norm": 34.80467692455818, "learning_rate": 1.936507936507937e-06, "loss": 4.1563, "step": 62 }, { "epoch": 0.06006435466571326, "grad_norm": 29.68210272566548, "learning_rate": 1.968253968253968e-06, "loss": 3.8818, "step": 63 }, { "epoch": 0.06101775712072459, "grad_norm": 27.770478612733267, "learning_rate": 2.0000000000000003e-06, "loss": 3.8984, "step": 64 }, { "epoch": 0.06197115957573591, "grad_norm": 28.551660473347688, "learning_rate": 2.031746031746032e-06, "loss": 4.0822, "step": 65 }, { "epoch": 0.06292456203074723, "grad_norm": 33.25377762157382, "learning_rate": 2.0634920634920634e-06, "loss": 4.2152, "step": 66 }, { "epoch": 0.06387796448575855, "grad_norm": 33.154869723541445, "learning_rate": 2.0952380952380955e-06, "loss": 4.1193, "step": 67 }, { "epoch": 0.06483136694076987, "grad_norm": 30.62376673465911, "learning_rate": 2.1269841269841273e-06, "loss": 4.1785, "step": 68 }, { "epoch": 0.06578476939578119, "grad_norm": 24.38794501856989, "learning_rate": 2.158730158730159e-06, "loss": 4.1628, "step": 69 }, { "epoch": 0.06673817185079252, "grad_norm": 26.619568343195763, "learning_rate": 2.1904761904761908e-06, "loss": 4.1912, "step": 70 }, { "epoch": 0.06769157430580383, "grad_norm": 31.974603179712904, "learning_rate": 2.222222222222222e-06, "loss": 3.8771, "step": 71 }, { "epoch": 0.06864497676081516, "grad_norm": 33.568152542373205, "learning_rate": 2.2539682539682542e-06, "loss": 4.113, "step": 72 }, { "epoch": 0.06959837921582648, "grad_norm": 28.0684357588321, "learning_rate": 2.285714285714286e-06, "loss": 4.3359, "step": 73 }, { "epoch": 0.0705517816708378, "grad_norm": 26.321523783267757, "learning_rate": 2.3174603174603177e-06, "loss": 3.8204, "step": 74 }, { "epoch": 0.07150518412584912, "grad_norm": 24.497592979867328, "learning_rate": 2.3492063492063494e-06, "loss": 3.9844, "step": 75 }, { "epoch": 0.07245858658086045, "grad_norm": 25.968655564948275, "learning_rate": 2.380952380952381e-06, "loss": 3.972, "step": 76 }, { "epoch": 0.07341198903587176, "grad_norm": 28.901455141756294, "learning_rate": 2.412698412698413e-06, "loss": 3.9385, "step": 77 }, { "epoch": 0.07436539149088309, "grad_norm": 30.604414409538187, "learning_rate": 2.4444444444444447e-06, "loss": 3.9611, "step": 78 }, { "epoch": 0.07531879394589441, "grad_norm": 26.82281755045421, "learning_rate": 2.4761904761904764e-06, "loss": 3.8841, "step": 79 }, { "epoch": 0.07627219640090573, "grad_norm": 24.24209382159052, "learning_rate": 2.507936507936508e-06, "loss": 3.8414, "step": 80 }, { "epoch": 0.07722559885591705, "grad_norm": 28.329806766278825, "learning_rate": 2.53968253968254e-06, "loss": 4.0798, "step": 81 }, { "epoch": 0.07817900131092838, "grad_norm": 35.399572785384194, "learning_rate": 2.571428571428571e-06, "loss": 3.9481, "step": 82 }, { "epoch": 0.07913240376593969, "grad_norm": 26.783962476264428, "learning_rate": 2.6031746031746038e-06, "loss": 3.8993, "step": 83 }, { "epoch": 0.08008580622095102, "grad_norm": 27.359427343547683, "learning_rate": 2.634920634920635e-06, "loss": 3.9117, "step": 84 }, { "epoch": 0.08103920867596234, "grad_norm": 26.61992100557692, "learning_rate": 2.666666666666667e-06, "loss": 3.695, "step": 85 }, { "epoch": 0.08199261113097366, "grad_norm": 30.90070940550583, "learning_rate": 2.6984126984126986e-06, "loss": 4.0111, "step": 86 }, { "epoch": 0.08294601358598498, "grad_norm": 32.15109011244054, "learning_rate": 2.7301587301587303e-06, "loss": 3.9185, "step": 87 }, { "epoch": 0.08389941604099631, "grad_norm": 23.150267264163155, "learning_rate": 2.7619047619047625e-06, "loss": 3.8434, "step": 88 }, { "epoch": 0.08485281849600762, "grad_norm": 27.414991755375276, "learning_rate": 2.7936507936507938e-06, "loss": 4.0571, "step": 89 }, { "epoch": 0.08580622095101895, "grad_norm": 31.786762235330727, "learning_rate": 2.8253968253968255e-06, "loss": 4.2549, "step": 90 }, { "epoch": 0.08675962340603027, "grad_norm": 28.846430848228547, "learning_rate": 2.8571428571428573e-06, "loss": 4.0321, "step": 91 }, { "epoch": 0.08771302586104159, "grad_norm": 28.71736620066419, "learning_rate": 2.888888888888889e-06, "loss": 3.7418, "step": 92 }, { "epoch": 0.08866642831605291, "grad_norm": 28.213932284850564, "learning_rate": 2.920634920634921e-06, "loss": 3.7745, "step": 93 }, { "epoch": 0.08961983077106424, "grad_norm": 22.46904975843183, "learning_rate": 2.9523809523809525e-06, "loss": 3.9922, "step": 94 }, { "epoch": 0.09057323322607556, "grad_norm": 21.871674819675853, "learning_rate": 2.984126984126984e-06, "loss": 3.6767, "step": 95 }, { "epoch": 0.09152663568108688, "grad_norm": 27.60499311626371, "learning_rate": 3.015873015873016e-06, "loss": 3.9218, "step": 96 }, { "epoch": 0.0924800381360982, "grad_norm": 30.685542497632316, "learning_rate": 3.047619047619048e-06, "loss": 3.7618, "step": 97 }, { "epoch": 0.09343344059110953, "grad_norm": 26.93077366944968, "learning_rate": 3.07936507936508e-06, "loss": 3.9583, "step": 98 }, { "epoch": 0.09438684304612084, "grad_norm": 22.955651842163224, "learning_rate": 3.1111111111111116e-06, "loss": 3.8155, "step": 99 }, { "epoch": 0.09534024550113217, "grad_norm": 25.24923157083308, "learning_rate": 3.142857142857143e-06, "loss": 4.0243, "step": 100 }, { "epoch": 0.0962936479561435, "grad_norm": 24.064936336720663, "learning_rate": 3.1746031746031746e-06, "loss": 3.9655, "step": 101 }, { "epoch": 0.0972470504111548, "grad_norm": 27.176148492807407, "learning_rate": 3.206349206349207e-06, "loss": 3.9429, "step": 102 }, { "epoch": 0.09820045286616613, "grad_norm": 29.132110111126774, "learning_rate": 3.2380952380952385e-06, "loss": 3.8571, "step": 103 }, { "epoch": 0.09915385532117746, "grad_norm": 25.88012233533854, "learning_rate": 3.2698412698412703e-06, "loss": 4.0144, "step": 104 }, { "epoch": 0.10010725777618877, "grad_norm": 25.15204798169861, "learning_rate": 3.3015873015873016e-06, "loss": 3.8942, "step": 105 }, { "epoch": 0.1010606602312001, "grad_norm": 22.679284769150485, "learning_rate": 3.3333333333333333e-06, "loss": 3.831, "step": 106 }, { "epoch": 0.10201406268621142, "grad_norm": 22.20021296198107, "learning_rate": 3.3650793650793655e-06, "loss": 3.5648, "step": 107 }, { "epoch": 0.10296746514122274, "grad_norm": 25.232617783569065, "learning_rate": 3.3968253968253972e-06, "loss": 3.945, "step": 108 }, { "epoch": 0.10392086759623406, "grad_norm": 20.86378516651486, "learning_rate": 3.428571428571429e-06, "loss": 3.7693, "step": 109 }, { "epoch": 0.10487427005124539, "grad_norm": 23.268549841522013, "learning_rate": 3.4603174603174607e-06, "loss": 3.9512, "step": 110 }, { "epoch": 0.1058276725062567, "grad_norm": 23.927841179208563, "learning_rate": 3.492063492063492e-06, "loss": 3.5792, "step": 111 }, { "epoch": 0.10678107496126803, "grad_norm": 24.34002373237191, "learning_rate": 3.523809523809524e-06, "loss": 3.8919, "step": 112 }, { "epoch": 0.10773447741627935, "grad_norm": 24.66885445115125, "learning_rate": 3.555555555555556e-06, "loss": 4.0227, "step": 113 }, { "epoch": 0.10868787987129067, "grad_norm": 25.67973058568013, "learning_rate": 3.5873015873015877e-06, "loss": 3.8348, "step": 114 }, { "epoch": 0.10964128232630199, "grad_norm": 29.847055250614908, "learning_rate": 3.6190476190476194e-06, "loss": 4.2077, "step": 115 }, { "epoch": 0.11059468478131332, "grad_norm": 27.802859340812812, "learning_rate": 3.6507936507936507e-06, "loss": 4.0937, "step": 116 }, { "epoch": 0.11154808723632463, "grad_norm": 21.5399550976174, "learning_rate": 3.6825396825396833e-06, "loss": 3.7123, "step": 117 }, { "epoch": 0.11250148969133596, "grad_norm": 23.737488252930223, "learning_rate": 3.7142857142857146e-06, "loss": 4.0476, "step": 118 }, { "epoch": 0.11345489214634728, "grad_norm": 21.509174941635244, "learning_rate": 3.7460317460317463e-06, "loss": 3.7852, "step": 119 }, { "epoch": 0.1144082946013586, "grad_norm": 21.527092707399444, "learning_rate": 3.777777777777778e-06, "loss": 3.6394, "step": 120 }, { "epoch": 0.11536169705636992, "grad_norm": 23.019290229097887, "learning_rate": 3.80952380952381e-06, "loss": 3.8127, "step": 121 }, { "epoch": 0.11631509951138125, "grad_norm": 25.02057487820951, "learning_rate": 3.841269841269842e-06, "loss": 3.9095, "step": 122 }, { "epoch": 0.11726850196639256, "grad_norm": 28.547061376830843, "learning_rate": 3.873015873015874e-06, "loss": 4.009, "step": 123 }, { "epoch": 0.11822190442140389, "grad_norm": 20.75279765642302, "learning_rate": 3.9047619047619055e-06, "loss": 3.6674, "step": 124 }, { "epoch": 0.11917530687641521, "grad_norm": 19.405879377389915, "learning_rate": 3.936507936507936e-06, "loss": 3.4702, "step": 125 }, { "epoch": 0.12012870933142653, "grad_norm": 21.30598367566384, "learning_rate": 3.968253968253968e-06, "loss": 3.8163, "step": 126 }, { "epoch": 0.12108211178643785, "grad_norm": 18.616199299189628, "learning_rate": 4.000000000000001e-06, "loss": 3.8188, "step": 127 }, { "epoch": 0.12203551424144918, "grad_norm": 27.71077380371966, "learning_rate": 4.031746031746032e-06, "loss": 3.9603, "step": 128 }, { "epoch": 0.12298891669646049, "grad_norm": 25.10449465269353, "learning_rate": 4.063492063492064e-06, "loss": 3.7683, "step": 129 }, { "epoch": 0.12394231915147182, "grad_norm": 19.04544752125123, "learning_rate": 4.095238095238096e-06, "loss": 3.9902, "step": 130 }, { "epoch": 0.12489572160648314, "grad_norm": 20.704961086581616, "learning_rate": 4.126984126984127e-06, "loss": 3.7554, "step": 131 }, { "epoch": 0.12584912406149446, "grad_norm": 22.667897713584644, "learning_rate": 4.158730158730159e-06, "loss": 3.907, "step": 132 }, { "epoch": 0.12680252651650578, "grad_norm": 22.3666386170698, "learning_rate": 4.190476190476191e-06, "loss": 3.6729, "step": 133 }, { "epoch": 0.1277559289715171, "grad_norm": 30.55556570686289, "learning_rate": 4.222222222222223e-06, "loss": 4.0549, "step": 134 }, { "epoch": 0.12870933142652843, "grad_norm": 20.209961425170558, "learning_rate": 4.2539682539682546e-06, "loss": 3.8258, "step": 135 }, { "epoch": 0.12966273388153973, "grad_norm": 25.69967061159014, "learning_rate": 4.2857142857142855e-06, "loss": 3.8382, "step": 136 }, { "epoch": 0.13061613633655106, "grad_norm": 25.118106024729506, "learning_rate": 4.317460317460318e-06, "loss": 3.9146, "step": 137 }, { "epoch": 0.13156953879156238, "grad_norm": 20.276492996090195, "learning_rate": 4.34920634920635e-06, "loss": 3.501, "step": 138 }, { "epoch": 0.1325229412465737, "grad_norm": 24.988742249531686, "learning_rate": 4.3809523809523815e-06, "loss": 3.7569, "step": 139 }, { "epoch": 0.13347634370158504, "grad_norm": 21.72404715720259, "learning_rate": 4.412698412698413e-06, "loss": 3.6888, "step": 140 }, { "epoch": 0.13442974615659636, "grad_norm": 20.113489886385704, "learning_rate": 4.444444444444444e-06, "loss": 3.8521, "step": 141 }, { "epoch": 0.13538314861160766, "grad_norm": 20.629490925105973, "learning_rate": 4.476190476190477e-06, "loss": 3.9141, "step": 142 }, { "epoch": 0.136336551066619, "grad_norm": 24.50293759901704, "learning_rate": 4.5079365079365085e-06, "loss": 3.8515, "step": 143 }, { "epoch": 0.13728995352163031, "grad_norm": 20.333618604873255, "learning_rate": 4.53968253968254e-06, "loss": 3.8909, "step": 144 }, { "epoch": 0.13824335597664164, "grad_norm": 24.155497291385217, "learning_rate": 4.571428571428572e-06, "loss": 3.808, "step": 145 }, { "epoch": 0.13919675843165297, "grad_norm": 22.923723936268267, "learning_rate": 4.603174603174604e-06, "loss": 3.6558, "step": 146 }, { "epoch": 0.1401501608866643, "grad_norm": 21.405424380910766, "learning_rate": 4.634920634920635e-06, "loss": 3.5827, "step": 147 }, { "epoch": 0.1411035633416756, "grad_norm": 28.00273078037717, "learning_rate": 4.666666666666667e-06, "loss": 4.0737, "step": 148 }, { "epoch": 0.14205696579668692, "grad_norm": 24.855058036599203, "learning_rate": 4.698412698412699e-06, "loss": 3.7761, "step": 149 }, { "epoch": 0.14301036825169824, "grad_norm": 22.036910296757714, "learning_rate": 4.730158730158731e-06, "loss": 3.7127, "step": 150 }, { "epoch": 0.14396377070670957, "grad_norm": 26.955373153487272, "learning_rate": 4.761904761904762e-06, "loss": 3.8495, "step": 151 }, { "epoch": 0.1449171731617209, "grad_norm": 18.802814399036045, "learning_rate": 4.793650793650794e-06, "loss": 3.8444, "step": 152 }, { "epoch": 0.14587057561673222, "grad_norm": 23.706500957827767, "learning_rate": 4.825396825396826e-06, "loss": 3.8304, "step": 153 }, { "epoch": 0.14682397807174352, "grad_norm": 23.56771743687887, "learning_rate": 4.857142857142858e-06, "loss": 4.0708, "step": 154 }, { "epoch": 0.14777738052675485, "grad_norm": 25.78653288021449, "learning_rate": 4.888888888888889e-06, "loss": 3.8364, "step": 155 }, { "epoch": 0.14873078298176617, "grad_norm": 20.34115676591284, "learning_rate": 4.920634920634921e-06, "loss": 3.5674, "step": 156 }, { "epoch": 0.1496841854367775, "grad_norm": 24.933685952931093, "learning_rate": 4.952380952380953e-06, "loss": 3.9145, "step": 157 }, { "epoch": 0.15063758789178883, "grad_norm": 23.412401712656685, "learning_rate": 4.9841269841269845e-06, "loss": 3.6203, "step": 158 }, { "epoch": 0.15159099034680015, "grad_norm": 22.56714758264083, "learning_rate": 5.015873015873016e-06, "loss": 3.5453, "step": 159 }, { "epoch": 0.15254439280181145, "grad_norm": 26.575258928602246, "learning_rate": 5.047619047619048e-06, "loss": 4.0379, "step": 160 }, { "epoch": 0.15349779525682278, "grad_norm": 22.238019068363663, "learning_rate": 5.07936507936508e-06, "loss": 3.8499, "step": 161 }, { "epoch": 0.1544511977118341, "grad_norm": 20.44856732393176, "learning_rate": 5.1111111111111115e-06, "loss": 3.7187, "step": 162 }, { "epoch": 0.15540460016684543, "grad_norm": 23.45219105634945, "learning_rate": 5.142857142857142e-06, "loss": 3.855, "step": 163 }, { "epoch": 0.15635800262185676, "grad_norm": 20.95966229210069, "learning_rate": 5.174603174603176e-06, "loss": 3.8146, "step": 164 }, { "epoch": 0.15731140507686808, "grad_norm": 22.707595831509003, "learning_rate": 5.2063492063492076e-06, "loss": 3.6553, "step": 165 }, { "epoch": 0.15826480753187938, "grad_norm": 20.22169286715092, "learning_rate": 5.2380952380952384e-06, "loss": 3.7334, "step": 166 }, { "epoch": 0.1592182099868907, "grad_norm": 20.937379348850833, "learning_rate": 5.26984126984127e-06, "loss": 3.6961, "step": 167 }, { "epoch": 0.16017161244190203, "grad_norm": 23.756566607708734, "learning_rate": 5.301587301587302e-06, "loss": 3.9069, "step": 168 }, { "epoch": 0.16112501489691336, "grad_norm": 24.056510909248573, "learning_rate": 5.333333333333334e-06, "loss": 3.8917, "step": 169 }, { "epoch": 0.1620784173519247, "grad_norm": 27.56180328692935, "learning_rate": 5.365079365079365e-06, "loss": 3.8608, "step": 170 }, { "epoch": 0.163031819806936, "grad_norm": 22.986602288349673, "learning_rate": 5.396825396825397e-06, "loss": 3.6046, "step": 171 }, { "epoch": 0.1639852222619473, "grad_norm": 30.99545546329986, "learning_rate": 5.428571428571429e-06, "loss": 3.963, "step": 172 }, { "epoch": 0.16493862471695864, "grad_norm": 23.5552405209786, "learning_rate": 5.460317460317461e-06, "loss": 3.8758, "step": 173 }, { "epoch": 0.16589202717196996, "grad_norm": 25.592836605632208, "learning_rate": 5.492063492063493e-06, "loss": 3.9228, "step": 174 }, { "epoch": 0.1668454296269813, "grad_norm": 22.927252140886708, "learning_rate": 5.523809523809525e-06, "loss": 3.7904, "step": 175 }, { "epoch": 0.16779883208199262, "grad_norm": 17.6417021437888, "learning_rate": 5.555555555555557e-06, "loss": 3.7261, "step": 176 }, { "epoch": 0.16875223453700394, "grad_norm": 23.918860563225564, "learning_rate": 5.5873015873015876e-06, "loss": 3.6772, "step": 177 }, { "epoch": 0.16970563699201524, "grad_norm": 20.37703743682148, "learning_rate": 5.619047619047619e-06, "loss": 3.7753, "step": 178 }, { "epoch": 0.17065903944702657, "grad_norm": 20.093449211894583, "learning_rate": 5.650793650793651e-06, "loss": 3.9107, "step": 179 }, { "epoch": 0.1716124419020379, "grad_norm": 26.35304063457244, "learning_rate": 5.682539682539683e-06, "loss": 3.6644, "step": 180 }, { "epoch": 0.17256584435704922, "grad_norm": 20.35237839701343, "learning_rate": 5.7142857142857145e-06, "loss": 3.9009, "step": 181 }, { "epoch": 0.17351924681206055, "grad_norm": 25.70720464877836, "learning_rate": 5.746031746031746e-06, "loss": 3.7263, "step": 182 }, { "epoch": 0.17447264926707187, "grad_norm": 28.167699097422226, "learning_rate": 5.777777777777778e-06, "loss": 3.6065, "step": 183 }, { "epoch": 0.17542605172208317, "grad_norm": 31.001414622270225, "learning_rate": 5.8095238095238106e-06, "loss": 3.8104, "step": 184 }, { "epoch": 0.1763794541770945, "grad_norm": 20.779279419267468, "learning_rate": 5.841269841269842e-06, "loss": 3.5852, "step": 185 }, { "epoch": 0.17733285663210582, "grad_norm": 21.11092488940955, "learning_rate": 5.873015873015874e-06, "loss": 3.7531, "step": 186 }, { "epoch": 0.17828625908711715, "grad_norm": 25.680426399411502, "learning_rate": 5.904761904761905e-06, "loss": 3.7282, "step": 187 }, { "epoch": 0.17923966154212848, "grad_norm": 18.794963280821545, "learning_rate": 5.936507936507937e-06, "loss": 3.6728, "step": 188 }, { "epoch": 0.1801930639971398, "grad_norm": 25.630855074674812, "learning_rate": 5.968253968253968e-06, "loss": 3.6079, "step": 189 }, { "epoch": 0.18114646645215113, "grad_norm": 24.725720897613748, "learning_rate": 6e-06, "loss": 3.5829, "step": 190 }, { "epoch": 0.18209986890716243, "grad_norm": 25.148120642554844, "learning_rate": 6.031746031746032e-06, "loss": 3.6613, "step": 191 }, { "epoch": 0.18305327136217375, "grad_norm": 27.101280228684224, "learning_rate": 6.063492063492064e-06, "loss": 3.9001, "step": 192 }, { "epoch": 0.18400667381718508, "grad_norm": 22.26136539008637, "learning_rate": 6.095238095238096e-06, "loss": 3.9376, "step": 193 }, { "epoch": 0.1849600762721964, "grad_norm": 25.88812208590668, "learning_rate": 6.126984126984128e-06, "loss": 3.9001, "step": 194 }, { "epoch": 0.18591347872720773, "grad_norm": 21.917728585851403, "learning_rate": 6.15873015873016e-06, "loss": 3.9536, "step": 195 }, { "epoch": 0.18686688118221906, "grad_norm": 23.46387203827075, "learning_rate": 6.1904761904761914e-06, "loss": 3.5461, "step": 196 }, { "epoch": 0.18782028363723036, "grad_norm": 27.546145524068898, "learning_rate": 6.222222222222223e-06, "loss": 3.8412, "step": 197 }, { "epoch": 0.18877368609224168, "grad_norm": 27.013584203113847, "learning_rate": 6.253968253968254e-06, "loss": 3.9284, "step": 198 }, { "epoch": 0.189727088547253, "grad_norm": 25.528491929904565, "learning_rate": 6.285714285714286e-06, "loss": 3.7893, "step": 199 }, { "epoch": 0.19068049100226434, "grad_norm": 22.608270767908436, "learning_rate": 6.3174603174603175e-06, "loss": 3.7768, "step": 200 }, { "epoch": 0.19163389345727566, "grad_norm": 24.985405251686675, "learning_rate": 6.349206349206349e-06, "loss": 3.681, "step": 201 }, { "epoch": 0.192587295912287, "grad_norm": 21.83011260955719, "learning_rate": 6.380952380952381e-06, "loss": 3.6998, "step": 202 }, { "epoch": 0.1935406983672983, "grad_norm": 26.653405695768235, "learning_rate": 6.412698412698414e-06, "loss": 3.9499, "step": 203 }, { "epoch": 0.1944941008223096, "grad_norm": 23.034818797011607, "learning_rate": 6.444444444444445e-06, "loss": 3.894, "step": 204 }, { "epoch": 0.19544750327732094, "grad_norm": 23.87304143988143, "learning_rate": 6.476190476190477e-06, "loss": 3.7951, "step": 205 }, { "epoch": 0.19640090573233226, "grad_norm": 21.655688390847764, "learning_rate": 6.507936507936509e-06, "loss": 3.899, "step": 206 }, { "epoch": 0.1973543081873436, "grad_norm": 22.439056465121602, "learning_rate": 6.5396825396825405e-06, "loss": 3.6212, "step": 207 }, { "epoch": 0.19830771064235492, "grad_norm": 21.065928309958803, "learning_rate": 6.571428571428572e-06, "loss": 3.7417, "step": 208 }, { "epoch": 0.19926111309736622, "grad_norm": 23.822598654322057, "learning_rate": 6.603174603174603e-06, "loss": 3.558, "step": 209 }, { "epoch": 0.20021451555237754, "grad_norm": 26.826490914002083, "learning_rate": 6.634920634920635e-06, "loss": 3.8973, "step": 210 }, { "epoch": 0.20116791800738887, "grad_norm": 24.548747586757845, "learning_rate": 6.666666666666667e-06, "loss": 3.8343, "step": 211 }, { "epoch": 0.2021213204624002, "grad_norm": 30.382746988863445, "learning_rate": 6.698412698412698e-06, "loss": 3.9557, "step": 212 }, { "epoch": 0.20307472291741152, "grad_norm": 22.902144277813402, "learning_rate": 6.730158730158731e-06, "loss": 3.7334, "step": 213 }, { "epoch": 0.20402812537242285, "grad_norm": 26.546113976077162, "learning_rate": 6.761904761904763e-06, "loss": 3.8016, "step": 214 }, { "epoch": 0.20498152782743415, "grad_norm": 21.39486773236719, "learning_rate": 6.7936507936507944e-06, "loss": 3.2854, "step": 215 }, { "epoch": 0.20593493028244547, "grad_norm": 25.502815695239676, "learning_rate": 6.825396825396826e-06, "loss": 3.7211, "step": 216 }, { "epoch": 0.2068883327374568, "grad_norm": 25.782486096394685, "learning_rate": 6.857142857142858e-06, "loss": 3.9659, "step": 217 }, { "epoch": 0.20784173519246812, "grad_norm": 22.437036870307853, "learning_rate": 6.88888888888889e-06, "loss": 3.6977, "step": 218 }, { "epoch": 0.20879513764747945, "grad_norm": 27.177151466771782, "learning_rate": 6.920634920634921e-06, "loss": 3.5294, "step": 219 }, { "epoch": 0.20974854010249078, "grad_norm": 24.863207025850016, "learning_rate": 6.952380952380952e-06, "loss": 3.8593, "step": 220 }, { "epoch": 0.21070194255750208, "grad_norm": 22.651907017557086, "learning_rate": 6.984126984126984e-06, "loss": 3.6774, "step": 221 }, { "epoch": 0.2116553450125134, "grad_norm": 24.270849995718407, "learning_rate": 7.015873015873016e-06, "loss": 3.4248, "step": 222 }, { "epoch": 0.21260874746752473, "grad_norm": 25.328491522252392, "learning_rate": 7.047619047619048e-06, "loss": 3.8021, "step": 223 }, { "epoch": 0.21356214992253605, "grad_norm": 29.682348453220314, "learning_rate": 7.07936507936508e-06, "loss": 3.9469, "step": 224 }, { "epoch": 0.21451555237754738, "grad_norm": 19.336600239701795, "learning_rate": 7.111111111111112e-06, "loss": 3.9097, "step": 225 }, { "epoch": 0.2154689548325587, "grad_norm": 27.079481224771097, "learning_rate": 7.1428571428571436e-06, "loss": 3.8375, "step": 226 }, { "epoch": 0.21642235728757, "grad_norm": 24.744779623088466, "learning_rate": 7.174603174603175e-06, "loss": 3.6264, "step": 227 }, { "epoch": 0.21737575974258133, "grad_norm": 28.603717399383697, "learning_rate": 7.206349206349207e-06, "loss": 3.6651, "step": 228 }, { "epoch": 0.21832916219759266, "grad_norm": 32.87974885667489, "learning_rate": 7.238095238095239e-06, "loss": 3.9731, "step": 229 }, { "epoch": 0.21928256465260398, "grad_norm": 30.240605500124673, "learning_rate": 7.2698412698412705e-06, "loss": 3.7042, "step": 230 }, { "epoch": 0.2202359671076153, "grad_norm": 25.09881904051568, "learning_rate": 7.301587301587301e-06, "loss": 3.6841, "step": 231 }, { "epoch": 0.22118936956262664, "grad_norm": 24.801055369199425, "learning_rate": 7.333333333333333e-06, "loss": 3.6012, "step": 232 }, { "epoch": 0.22214277201763794, "grad_norm": 20.319333761139976, "learning_rate": 7.3650793650793666e-06, "loss": 3.7692, "step": 233 }, { "epoch": 0.22309617447264926, "grad_norm": 22.567642400410524, "learning_rate": 7.3968253968253975e-06, "loss": 3.6217, "step": 234 }, { "epoch": 0.2240495769276606, "grad_norm": 24.648779831971126, "learning_rate": 7.428571428571429e-06, "loss": 4.028, "step": 235 }, { "epoch": 0.2250029793826719, "grad_norm": 22.545685813718777, "learning_rate": 7.460317460317461e-06, "loss": 3.8512, "step": 236 }, { "epoch": 0.22595638183768324, "grad_norm": 20.28730962858718, "learning_rate": 7.492063492063493e-06, "loss": 3.7653, "step": 237 }, { "epoch": 0.22690978429269457, "grad_norm": 29.283191295989226, "learning_rate": 7.523809523809524e-06, "loss": 3.8973, "step": 238 }, { "epoch": 0.22786318674770586, "grad_norm": 25.545356270791594, "learning_rate": 7.555555555555556e-06, "loss": 3.6672, "step": 239 }, { "epoch": 0.2288165892027172, "grad_norm": 25.78540285279256, "learning_rate": 7.587301587301588e-06, "loss": 3.9468, "step": 240 }, { "epoch": 0.22976999165772852, "grad_norm": 22.784364923263105, "learning_rate": 7.61904761904762e-06, "loss": 3.7245, "step": 241 }, { "epoch": 0.23072339411273984, "grad_norm": 19.51643564057132, "learning_rate": 7.65079365079365e-06, "loss": 3.2738, "step": 242 }, { "epoch": 0.23167679656775117, "grad_norm": 24.409483249559187, "learning_rate": 7.682539682539684e-06, "loss": 3.9626, "step": 243 }, { "epoch": 0.2326301990227625, "grad_norm": 22.966922209006697, "learning_rate": 7.714285714285716e-06, "loss": 3.854, "step": 244 }, { "epoch": 0.2335836014777738, "grad_norm": 25.78784626735858, "learning_rate": 7.746031746031747e-06, "loss": 3.6962, "step": 245 }, { "epoch": 0.23453700393278512, "grad_norm": 21.29857617676169, "learning_rate": 7.77777777777778e-06, "loss": 3.8171, "step": 246 }, { "epoch": 0.23549040638779645, "grad_norm": 25.690789850416458, "learning_rate": 7.809523809523811e-06, "loss": 3.8258, "step": 247 }, { "epoch": 0.23644380884280777, "grad_norm": 23.398536577460053, "learning_rate": 7.841269841269843e-06, "loss": 3.8346, "step": 248 }, { "epoch": 0.2373972112978191, "grad_norm": 23.96643314461373, "learning_rate": 7.873015873015873e-06, "loss": 3.8207, "step": 249 }, { "epoch": 0.23835061375283043, "grad_norm": 21.384441413431944, "learning_rate": 7.904761904761904e-06, "loss": 3.6994, "step": 250 }, { "epoch": 0.23930401620784172, "grad_norm": 23.227836749973964, "learning_rate": 7.936507936507936e-06, "loss": 3.7109, "step": 251 }, { "epoch": 0.24025741866285305, "grad_norm": 24.02713006092369, "learning_rate": 7.968253968253968e-06, "loss": 4.0252, "step": 252 }, { "epoch": 0.24121082111786438, "grad_norm": 23.0726093974765, "learning_rate": 8.000000000000001e-06, "loss": 3.6775, "step": 253 }, { "epoch": 0.2421642235728757, "grad_norm": 25.18600083401705, "learning_rate": 8.031746031746033e-06, "loss": 3.9726, "step": 254 }, { "epoch": 0.24311762602788703, "grad_norm": 20.593689450351125, "learning_rate": 8.063492063492065e-06, "loss": 3.7605, "step": 255 }, { "epoch": 0.24407102848289836, "grad_norm": 19.391090282346237, "learning_rate": 8.095238095238097e-06, "loss": 3.709, "step": 256 }, { "epoch": 0.24502443093790965, "grad_norm": 20.634565912874507, "learning_rate": 8.126984126984128e-06, "loss": 3.7038, "step": 257 }, { "epoch": 0.24597783339292098, "grad_norm": 20.98184836010203, "learning_rate": 8.15873015873016e-06, "loss": 3.6461, "step": 258 }, { "epoch": 0.2469312358479323, "grad_norm": 26.79456385142145, "learning_rate": 8.190476190476192e-06, "loss": 3.7797, "step": 259 }, { "epoch": 0.24788463830294363, "grad_norm": 19.81848156742451, "learning_rate": 8.222222222222222e-06, "loss": 3.72, "step": 260 }, { "epoch": 0.24883804075795496, "grad_norm": 19.999558997714036, "learning_rate": 8.253968253968254e-06, "loss": 3.8226, "step": 261 }, { "epoch": 0.24979144321296629, "grad_norm": 21.739238998233095, "learning_rate": 8.285714285714287e-06, "loss": 3.9364, "step": 262 }, { "epoch": 0.2507448456679776, "grad_norm": 27.621306614866835, "learning_rate": 8.317460317460319e-06, "loss": 3.6949, "step": 263 }, { "epoch": 0.2516982481229889, "grad_norm": 24.7517669529679, "learning_rate": 8.34920634920635e-06, "loss": 3.8191, "step": 264 }, { "epoch": 0.25265165057800026, "grad_norm": 26.72053096763988, "learning_rate": 8.380952380952382e-06, "loss": 3.66, "step": 265 }, { "epoch": 0.25360505303301156, "grad_norm": 22.69853397827511, "learning_rate": 8.412698412698414e-06, "loss": 3.7697, "step": 266 }, { "epoch": 0.25455845548802286, "grad_norm": 22.811579086203068, "learning_rate": 8.444444444444446e-06, "loss": 3.6593, "step": 267 }, { "epoch": 0.2555118579430342, "grad_norm": 25.562069489997363, "learning_rate": 8.476190476190477e-06, "loss": 3.7462, "step": 268 }, { "epoch": 0.2564652603980455, "grad_norm": 26.21868584682217, "learning_rate": 8.507936507936509e-06, "loss": 3.6028, "step": 269 }, { "epoch": 0.25741866285305687, "grad_norm": 24.305382444832205, "learning_rate": 8.53968253968254e-06, "loss": 3.7018, "step": 270 }, { "epoch": 0.25837206530806817, "grad_norm": 24.375587189938678, "learning_rate": 8.571428571428571e-06, "loss": 3.7754, "step": 271 }, { "epoch": 0.25932546776307946, "grad_norm": 24.405975254235027, "learning_rate": 8.603174603174604e-06, "loss": 3.7412, "step": 272 }, { "epoch": 0.2602788702180908, "grad_norm": 25.04024786990487, "learning_rate": 8.634920634920636e-06, "loss": 3.9655, "step": 273 }, { "epoch": 0.2612322726731021, "grad_norm": 23.73649148708009, "learning_rate": 8.666666666666668e-06, "loss": 3.9587, "step": 274 }, { "epoch": 0.26218567512811347, "grad_norm": 20.40601614409869, "learning_rate": 8.6984126984127e-06, "loss": 4.0355, "step": 275 }, { "epoch": 0.26313907758312477, "grad_norm": 22.88785027504328, "learning_rate": 8.730158730158731e-06, "loss": 3.5993, "step": 276 }, { "epoch": 0.2640924800381361, "grad_norm": 23.624599440042484, "learning_rate": 8.761904761904763e-06, "loss": 3.9613, "step": 277 }, { "epoch": 0.2650458824931474, "grad_norm": 25.807970410793185, "learning_rate": 8.793650793650795e-06, "loss": 3.6293, "step": 278 }, { "epoch": 0.2659992849481587, "grad_norm": 23.39484760264322, "learning_rate": 8.825396825396827e-06, "loss": 3.6898, "step": 279 }, { "epoch": 0.2669526874031701, "grad_norm": 23.264379928731007, "learning_rate": 8.857142857142858e-06, "loss": 3.6899, "step": 280 }, { "epoch": 0.2679060898581814, "grad_norm": 27.643305743760905, "learning_rate": 8.888888888888888e-06, "loss": 3.8845, "step": 281 }, { "epoch": 0.2688594923131927, "grad_norm": 25.622173457126703, "learning_rate": 8.920634920634922e-06, "loss": 3.8825, "step": 282 }, { "epoch": 0.269812894768204, "grad_norm": 22.107346867913687, "learning_rate": 8.952380952380953e-06, "loss": 3.7763, "step": 283 }, { "epoch": 0.2707662972232153, "grad_norm": 23.793459663140975, "learning_rate": 8.984126984126985e-06, "loss": 3.9107, "step": 284 }, { "epoch": 0.2717196996782267, "grad_norm": 27.221577041185444, "learning_rate": 9.015873015873017e-06, "loss": 3.7565, "step": 285 }, { "epoch": 0.272673102133238, "grad_norm": 27.578476745518408, "learning_rate": 9.047619047619049e-06, "loss": 3.712, "step": 286 }, { "epoch": 0.27362650458824933, "grad_norm": 21.868245559521633, "learning_rate": 9.07936507936508e-06, "loss": 3.5351, "step": 287 }, { "epoch": 0.27457990704326063, "grad_norm": 24.48788472107463, "learning_rate": 9.111111111111112e-06, "loss": 3.6148, "step": 288 }, { "epoch": 0.275533309498272, "grad_norm": 28.78230649423868, "learning_rate": 9.142857142857144e-06, "loss": 3.5885, "step": 289 }, { "epoch": 0.2764867119532833, "grad_norm": 23.704080433319362, "learning_rate": 9.174603174603176e-06, "loss": 3.7325, "step": 290 }, { "epoch": 0.2774401144082946, "grad_norm": 21.757281042657038, "learning_rate": 9.206349206349207e-06, "loss": 3.7595, "step": 291 }, { "epoch": 0.27839351686330593, "grad_norm": 24.830628238767606, "learning_rate": 9.238095238095239e-06, "loss": 3.7755, "step": 292 }, { "epoch": 0.27934691931831723, "grad_norm": 23.609986292681825, "learning_rate": 9.26984126984127e-06, "loss": 3.8301, "step": 293 }, { "epoch": 0.2803003217733286, "grad_norm": 20.558921286557272, "learning_rate": 9.301587301587303e-06, "loss": 3.7241, "step": 294 }, { "epoch": 0.2812537242283399, "grad_norm": 21.66503942731972, "learning_rate": 9.333333333333334e-06, "loss": 3.4307, "step": 295 }, { "epoch": 0.2822071266833512, "grad_norm": 22.93620441567889, "learning_rate": 9.365079365079366e-06, "loss": 3.7835, "step": 296 }, { "epoch": 0.28316052913836254, "grad_norm": 26.51799736961046, "learning_rate": 9.396825396825398e-06, "loss": 3.6863, "step": 297 }, { "epoch": 0.28411393159337384, "grad_norm": 26.587180286009104, "learning_rate": 9.42857142857143e-06, "loss": 3.5973, "step": 298 }, { "epoch": 0.2850673340483852, "grad_norm": 22.4523872591535, "learning_rate": 9.460317460317461e-06, "loss": 3.5416, "step": 299 }, { "epoch": 0.2860207365033965, "grad_norm": 23.119365835290058, "learning_rate": 9.492063492063493e-06, "loss": 3.4916, "step": 300 }, { "epoch": 0.28697413895840784, "grad_norm": 21.87498779427271, "learning_rate": 9.523809523809525e-06, "loss": 3.6111, "step": 301 }, { "epoch": 0.28792754141341914, "grad_norm": 29.965198793858733, "learning_rate": 9.555555555555556e-06, "loss": 3.9269, "step": 302 }, { "epoch": 0.28888094386843044, "grad_norm": 21.227472962763766, "learning_rate": 9.587301587301588e-06, "loss": 3.6383, "step": 303 }, { "epoch": 0.2898343463234418, "grad_norm": 20.601621991903592, "learning_rate": 9.61904761904762e-06, "loss": 3.5899, "step": 304 }, { "epoch": 0.2907877487784531, "grad_norm": 21.17388052775387, "learning_rate": 9.650793650793652e-06, "loss": 3.7096, "step": 305 }, { "epoch": 0.29174115123346445, "grad_norm": 21.52853653349652, "learning_rate": 9.682539682539683e-06, "loss": 3.8431, "step": 306 }, { "epoch": 0.29269455368847574, "grad_norm": 22.833071696893047, "learning_rate": 9.714285714285715e-06, "loss": 3.5402, "step": 307 }, { "epoch": 0.29364795614348704, "grad_norm": 25.81949601237942, "learning_rate": 9.746031746031747e-06, "loss": 3.728, "step": 308 }, { "epoch": 0.2946013585984984, "grad_norm": 21.481589422478287, "learning_rate": 9.777777777777779e-06, "loss": 3.5263, "step": 309 }, { "epoch": 0.2955547610535097, "grad_norm": 20.695656961450926, "learning_rate": 9.80952380952381e-06, "loss": 3.9372, "step": 310 }, { "epoch": 0.29650816350852105, "grad_norm": 20.97026395353241, "learning_rate": 9.841269841269842e-06, "loss": 3.8446, "step": 311 }, { "epoch": 0.29746156596353235, "grad_norm": 19.68018801238521, "learning_rate": 9.873015873015874e-06, "loss": 3.7017, "step": 312 }, { "epoch": 0.2984149684185437, "grad_norm": 28.03947474640774, "learning_rate": 9.904761904761906e-06, "loss": 3.9323, "step": 313 }, { "epoch": 0.299368370873555, "grad_norm": 19.231497255268764, "learning_rate": 9.936507936507937e-06, "loss": 3.8433, "step": 314 }, { "epoch": 0.3003217733285663, "grad_norm": 21.98790199625271, "learning_rate": 9.968253968253969e-06, "loss": 3.7374, "step": 315 }, { "epoch": 0.30127517578357765, "grad_norm": 26.81715837668887, "learning_rate": 1e-05, "loss": 3.8882, "step": 316 }, { "epoch": 0.30222857823858895, "grad_norm": 22.653867033264437, "learning_rate": 9.999996923526267e-06, "loss": 3.6442, "step": 317 }, { "epoch": 0.3031819806936003, "grad_norm": 28.499504961545536, "learning_rate": 9.999987694108852e-06, "loss": 3.6337, "step": 318 }, { "epoch": 0.3041353831486116, "grad_norm": 20.802422341223306, "learning_rate": 9.999972311759115e-06, "loss": 3.8883, "step": 319 }, { "epoch": 0.3050887856036229, "grad_norm": 26.84714534588163, "learning_rate": 9.999950776495983e-06, "loss": 3.7072, "step": 320 }, { "epoch": 0.30604218805863426, "grad_norm": 21.027285297907365, "learning_rate": 9.999923088345957e-06, "loss": 3.4964, "step": 321 }, { "epoch": 0.30699559051364556, "grad_norm": 21.84989320761671, "learning_rate": 9.999889247343113e-06, "loss": 3.697, "step": 322 }, { "epoch": 0.3079489929686569, "grad_norm": 21.643776778755978, "learning_rate": 9.999849253529092e-06, "loss": 3.4716, "step": 323 }, { "epoch": 0.3089023954236682, "grad_norm": 23.697863603442613, "learning_rate": 9.99980310695311e-06, "loss": 3.7335, "step": 324 }, { "epoch": 0.30985579787867956, "grad_norm": 24.788163304746245, "learning_rate": 9.999750807671957e-06, "loss": 3.8124, "step": 325 }, { "epoch": 0.31080920033369086, "grad_norm": 25.67214697273634, "learning_rate": 9.99969235574999e-06, "loss": 3.8333, "step": 326 }, { "epoch": 0.31176260278870216, "grad_norm": 24.447919921873986, "learning_rate": 9.99962775125914e-06, "loss": 3.5926, "step": 327 }, { "epoch": 0.3127160052437135, "grad_norm": 18.431918370961576, "learning_rate": 9.99955699427891e-06, "loss": 3.6015, "step": 328 }, { "epoch": 0.3136694076987248, "grad_norm": 21.73552440806544, "learning_rate": 9.999480084896369e-06, "loss": 3.7343, "step": 329 }, { "epoch": 0.31462281015373617, "grad_norm": 20.556258476603517, "learning_rate": 9.999397023206165e-06, "loss": 3.8637, "step": 330 }, { "epoch": 0.31557621260874746, "grad_norm": 23.231430371453186, "learning_rate": 9.99930780931051e-06, "loss": 3.9729, "step": 331 }, { "epoch": 0.31652961506375876, "grad_norm": 26.204986790170345, "learning_rate": 9.999212443319191e-06, "loss": 3.9622, "step": 332 }, { "epoch": 0.3174830175187701, "grad_norm": 25.24824378828437, "learning_rate": 9.999110925349564e-06, "loss": 3.4758, "step": 333 }, { "epoch": 0.3184364199737814, "grad_norm": 19.855942721402137, "learning_rate": 9.999003255526555e-06, "loss": 3.6844, "step": 334 }, { "epoch": 0.31938982242879277, "grad_norm": 26.79326991772549, "learning_rate": 9.998889433982663e-06, "loss": 3.8408, "step": 335 }, { "epoch": 0.32034322488380407, "grad_norm": 26.57974889378851, "learning_rate": 9.998769460857955e-06, "loss": 3.8045, "step": 336 }, { "epoch": 0.3212966273388154, "grad_norm": 24.345116192853638, "learning_rate": 9.99864333630007e-06, "loss": 3.8326, "step": 337 }, { "epoch": 0.3222500297938267, "grad_norm": 30.80480108676015, "learning_rate": 9.998511060464211e-06, "loss": 3.4912, "step": 338 }, { "epoch": 0.323203432248838, "grad_norm": 18.2254545288648, "learning_rate": 9.99837263351316e-06, "loss": 3.8733, "step": 339 }, { "epoch": 0.3241568347038494, "grad_norm": 28.18939545864241, "learning_rate": 9.998228055617264e-06, "loss": 3.6994, "step": 340 }, { "epoch": 0.32511023715886067, "grad_norm": 23.185459893950714, "learning_rate": 9.998077326954437e-06, "loss": 3.6573, "step": 341 }, { "epoch": 0.326063639613872, "grad_norm": 19.756591744804236, "learning_rate": 9.997920447710162e-06, "loss": 3.4911, "step": 342 }, { "epoch": 0.3270170420688833, "grad_norm": 21.021683193572212, "learning_rate": 9.997757418077496e-06, "loss": 3.5346, "step": 343 }, { "epoch": 0.3279704445238946, "grad_norm": 23.387385807823854, "learning_rate": 9.99758823825706e-06, "loss": 3.6859, "step": 344 }, { "epoch": 0.328923846978906, "grad_norm": 22.946825980691486, "learning_rate": 9.997412908457049e-06, "loss": 3.7528, "step": 345 }, { "epoch": 0.3298772494339173, "grad_norm": 23.441980816877734, "learning_rate": 9.997231428893216e-06, "loss": 3.7507, "step": 346 }, { "epoch": 0.33083065188892863, "grad_norm": 31.690641379366422, "learning_rate": 9.997043799788892e-06, "loss": 3.861, "step": 347 }, { "epoch": 0.3317840543439399, "grad_norm": 18.470758688320622, "learning_rate": 9.996850021374969e-06, "loss": 3.5557, "step": 348 }, { "epoch": 0.3327374567989513, "grad_norm": 24.99633527186745, "learning_rate": 9.996650093889911e-06, "loss": 3.4819, "step": 349 }, { "epoch": 0.3336908592539626, "grad_norm": 20.933467950116132, "learning_rate": 9.996444017579742e-06, "loss": 3.6212, "step": 350 }, { "epoch": 0.3346442617089739, "grad_norm": 24.691390882028514, "learning_rate": 9.996231792698065e-06, "loss": 3.9207, "step": 351 }, { "epoch": 0.33559766416398523, "grad_norm": 22.709555478073753, "learning_rate": 9.996013419506035e-06, "loss": 4.0103, "step": 352 }, { "epoch": 0.33655106661899653, "grad_norm": 20.40080428389596, "learning_rate": 9.995788898272383e-06, "loss": 3.7619, "step": 353 }, { "epoch": 0.3375044690740079, "grad_norm": 22.947518538340642, "learning_rate": 9.9955582292734e-06, "loss": 3.7035, "step": 354 }, { "epoch": 0.3384578715290192, "grad_norm": 19.13229552830128, "learning_rate": 9.995321412792948e-06, "loss": 3.7617, "step": 355 }, { "epoch": 0.3394112739840305, "grad_norm": 23.438605462307986, "learning_rate": 9.99507844912245e-06, "loss": 3.5661, "step": 356 }, { "epoch": 0.34036467643904184, "grad_norm": 24.55204236003422, "learning_rate": 9.994829338560892e-06, "loss": 4.1256, "step": 357 }, { "epoch": 0.34131807889405313, "grad_norm": 21.01893770282939, "learning_rate": 9.994574081414831e-06, "loss": 3.5833, "step": 358 }, { "epoch": 0.3422714813490645, "grad_norm": 27.10988062636206, "learning_rate": 9.99431267799838e-06, "loss": 3.6618, "step": 359 }, { "epoch": 0.3432248838040758, "grad_norm": 25.260019171203233, "learning_rate": 9.994045128633221e-06, "loss": 3.6203, "step": 360 }, { "epoch": 0.34417828625908714, "grad_norm": 19.343079195232555, "learning_rate": 9.993771433648598e-06, "loss": 3.6475, "step": 361 }, { "epoch": 0.34513168871409844, "grad_norm": 25.674168318067135, "learning_rate": 9.993491593381315e-06, "loss": 3.6898, "step": 362 }, { "epoch": 0.34608509116910974, "grad_norm": 20.539130131801684, "learning_rate": 9.993205608175743e-06, "loss": 3.4697, "step": 363 }, { "epoch": 0.3470384936241211, "grad_norm": 23.051360219881428, "learning_rate": 9.99291347838381e-06, "loss": 4.0722, "step": 364 }, { "epoch": 0.3479918960791324, "grad_norm": 18.913013866372197, "learning_rate": 9.99261520436501e-06, "loss": 3.5901, "step": 365 }, { "epoch": 0.34894529853414374, "grad_norm": 24.26673929622656, "learning_rate": 9.992310786486395e-06, "loss": 3.7779, "step": 366 }, { "epoch": 0.34989870098915504, "grad_norm": 18.893547699969023, "learning_rate": 9.992000225122579e-06, "loss": 3.5522, "step": 367 }, { "epoch": 0.35085210344416634, "grad_norm": 21.49409358693466, "learning_rate": 9.991683520655735e-06, "loss": 3.5501, "step": 368 }, { "epoch": 0.3518055058991777, "grad_norm": 21.878480225854673, "learning_rate": 9.991360673475596e-06, "loss": 3.5917, "step": 369 }, { "epoch": 0.352758908354189, "grad_norm": 22.669844060797328, "learning_rate": 9.991031683979453e-06, "loss": 3.7868, "step": 370 }, { "epoch": 0.35371231080920035, "grad_norm": 24.7486168016118, "learning_rate": 9.99069655257216e-06, "loss": 3.9925, "step": 371 }, { "epoch": 0.35466571326421165, "grad_norm": 20.8854322666735, "learning_rate": 9.990355279666124e-06, "loss": 3.9822, "step": 372 }, { "epoch": 0.355619115719223, "grad_norm": 20.112251724233317, "learning_rate": 9.990007865681314e-06, "loss": 3.628, "step": 373 }, { "epoch": 0.3565725181742343, "grad_norm": 22.00481480135722, "learning_rate": 9.989654311045251e-06, "loss": 3.528, "step": 374 }, { "epoch": 0.3575259206292456, "grad_norm": 20.935209660259627, "learning_rate": 9.989294616193018e-06, "loss": 3.7792, "step": 375 }, { "epoch": 0.35847932308425695, "grad_norm": 26.05916402807932, "learning_rate": 9.988928781567251e-06, "loss": 3.7813, "step": 376 }, { "epoch": 0.35943272553926825, "grad_norm": 33.50274798709381, "learning_rate": 9.988556807618142e-06, "loss": 3.851, "step": 377 }, { "epoch": 0.3603861279942796, "grad_norm": 21.5933226423023, "learning_rate": 9.988178694803439e-06, "loss": 3.6525, "step": 378 }, { "epoch": 0.3613395304492909, "grad_norm": 18.907833415026108, "learning_rate": 9.987794443588442e-06, "loss": 3.6395, "step": 379 }, { "epoch": 0.36229293290430226, "grad_norm": 28.24215673513184, "learning_rate": 9.987404054446009e-06, "loss": 3.7905, "step": 380 }, { "epoch": 0.36324633535931355, "grad_norm": 28.91552575134746, "learning_rate": 9.987007527856545e-06, "loss": 4.0473, "step": 381 }, { "epoch": 0.36419973781432485, "grad_norm": 25.77992887300261, "learning_rate": 9.986604864308016e-06, "loss": 3.662, "step": 382 }, { "epoch": 0.3651531402693362, "grad_norm": 20.567337748030894, "learning_rate": 9.986196064295932e-06, "loss": 3.7542, "step": 383 }, { "epoch": 0.3661065427243475, "grad_norm": 28.286228244369624, "learning_rate": 9.98578112832336e-06, "loss": 3.7454, "step": 384 }, { "epoch": 0.36705994517935886, "grad_norm": 20.509733587084416, "learning_rate": 9.985360056900915e-06, "loss": 3.7214, "step": 385 }, { "epoch": 0.36801334763437016, "grad_norm": 21.301265075959208, "learning_rate": 9.984932850546764e-06, "loss": 3.922, "step": 386 }, { "epoch": 0.36896675008938146, "grad_norm": 29.796719294315622, "learning_rate": 9.98449950978662e-06, "loss": 3.7189, "step": 387 }, { "epoch": 0.3699201525443928, "grad_norm": 22.05004589629295, "learning_rate": 9.984060035153752e-06, "loss": 3.6733, "step": 388 }, { "epoch": 0.3708735549994041, "grad_norm": 24.42333474558909, "learning_rate": 9.983614427188968e-06, "loss": 3.9899, "step": 389 }, { "epoch": 0.37182695745441546, "grad_norm": 24.335582548672768, "learning_rate": 9.983162686440632e-06, "loss": 3.695, "step": 390 }, { "epoch": 0.37278035990942676, "grad_norm": 21.83008775570887, "learning_rate": 9.982704813464652e-06, "loss": 3.6435, "step": 391 }, { "epoch": 0.3737337623644381, "grad_norm": 22.220129133119375, "learning_rate": 9.982240808824477e-06, "loss": 3.798, "step": 392 }, { "epoch": 0.3746871648194494, "grad_norm": 18.40504100899135, "learning_rate": 9.98177067309111e-06, "loss": 3.7641, "step": 393 }, { "epoch": 0.3756405672744607, "grad_norm": 21.327814975992116, "learning_rate": 9.981294406843093e-06, "loss": 3.6929, "step": 394 }, { "epoch": 0.37659396972947207, "grad_norm": 25.479039808935934, "learning_rate": 9.980812010666519e-06, "loss": 3.6968, "step": 395 }, { "epoch": 0.37754737218448337, "grad_norm": 22.084168205351812, "learning_rate": 9.980323485155013e-06, "loss": 3.5752, "step": 396 }, { "epoch": 0.3785007746394947, "grad_norm": 21.929681063147477, "learning_rate": 9.979828830909755e-06, "loss": 3.6293, "step": 397 }, { "epoch": 0.379454177094506, "grad_norm": 24.158604892032763, "learning_rate": 9.979328048539456e-06, "loss": 3.8578, "step": 398 }, { "epoch": 0.3804075795495173, "grad_norm": 20.300976884695753, "learning_rate": 9.97882113866038e-06, "loss": 3.5758, "step": 399 }, { "epoch": 0.38136098200452867, "grad_norm": 26.79624234678178, "learning_rate": 9.978308101896318e-06, "loss": 3.7405, "step": 400 }, { "epoch": 0.38231438445953997, "grad_norm": 19.926122049713655, "learning_rate": 9.977788938878612e-06, "loss": 3.7099, "step": 401 }, { "epoch": 0.3832677869145513, "grad_norm": 24.154847133391133, "learning_rate": 9.977263650246139e-06, "loss": 3.6725, "step": 402 }, { "epoch": 0.3842211893695626, "grad_norm": 20.28608907034229, "learning_rate": 9.97673223664531e-06, "loss": 3.7286, "step": 403 }, { "epoch": 0.385174591824574, "grad_norm": 23.51774677454541, "learning_rate": 9.97619469873008e-06, "loss": 3.7326, "step": 404 }, { "epoch": 0.3861279942795853, "grad_norm": 22.377964447679705, "learning_rate": 9.975651037161937e-06, "loss": 3.7975, "step": 405 }, { "epoch": 0.3870813967345966, "grad_norm": 20.277347520861532, "learning_rate": 9.975101252609904e-06, "loss": 3.6492, "step": 406 }, { "epoch": 0.3880347991896079, "grad_norm": 26.767187858916582, "learning_rate": 9.974545345750542e-06, "loss": 3.8371, "step": 407 }, { "epoch": 0.3889882016446192, "grad_norm": 21.796497372218877, "learning_rate": 9.973983317267944e-06, "loss": 3.8673, "step": 408 }, { "epoch": 0.3899416040996306, "grad_norm": 23.220315838056063, "learning_rate": 9.973415167853735e-06, "loss": 3.6656, "step": 409 }, { "epoch": 0.3908950065546419, "grad_norm": 22.95451108876476, "learning_rate": 9.972840898207073e-06, "loss": 3.907, "step": 410 }, { "epoch": 0.3918484090096532, "grad_norm": 22.621397705734076, "learning_rate": 9.97226050903465e-06, "loss": 3.8977, "step": 411 }, { "epoch": 0.39280181146466453, "grad_norm": 19.917599282812052, "learning_rate": 9.971674001050687e-06, "loss": 3.6964, "step": 412 }, { "epoch": 0.39375521391967583, "grad_norm": 22.92390213197064, "learning_rate": 9.971081374976932e-06, "loss": 3.6343, "step": 413 }, { "epoch": 0.3947086163746872, "grad_norm": 23.39357122578866, "learning_rate": 9.970482631542668e-06, "loss": 3.6244, "step": 414 }, { "epoch": 0.3956620188296985, "grad_norm": 22.2577267949096, "learning_rate": 9.9698777714847e-06, "loss": 3.5952, "step": 415 }, { "epoch": 0.39661542128470983, "grad_norm": 25.237329843523757, "learning_rate": 9.969266795547364e-06, "loss": 4.0274, "step": 416 }, { "epoch": 0.39756882373972113, "grad_norm": 22.087750323810962, "learning_rate": 9.968649704482519e-06, "loss": 3.7513, "step": 417 }, { "epoch": 0.39852222619473243, "grad_norm": 26.15979758533802, "learning_rate": 9.96802649904955e-06, "loss": 3.5946, "step": 418 }, { "epoch": 0.3994756286497438, "grad_norm": 27.602507601828023, "learning_rate": 9.96739718001537e-06, "loss": 3.9068, "step": 419 }, { "epoch": 0.4004290311047551, "grad_norm": 23.40492219105174, "learning_rate": 9.96676174815441e-06, "loss": 3.705, "step": 420 }, { "epoch": 0.40138243355976644, "grad_norm": 20.16660268370362, "learning_rate": 9.966120204248626e-06, "loss": 3.4482, "step": 421 }, { "epoch": 0.40233583601477774, "grad_norm": 19.783911694708944, "learning_rate": 9.965472549087497e-06, "loss": 3.7036, "step": 422 }, { "epoch": 0.40328923846978904, "grad_norm": 23.467178518978425, "learning_rate": 9.964818783468018e-06, "loss": 3.6022, "step": 423 }, { "epoch": 0.4042426409248004, "grad_norm": 20.85204071738281, "learning_rate": 9.964158908194708e-06, "loss": 3.6782, "step": 424 }, { "epoch": 0.4051960433798117, "grad_norm": 23.797424524251166, "learning_rate": 9.963492924079603e-06, "loss": 3.893, "step": 425 }, { "epoch": 0.40614944583482304, "grad_norm": 26.279557039478508, "learning_rate": 9.962820831942254e-06, "loss": 3.699, "step": 426 }, { "epoch": 0.40710284828983434, "grad_norm": 25.910349656168407, "learning_rate": 9.962142632609734e-06, "loss": 3.4868, "step": 427 }, { "epoch": 0.4080562507448457, "grad_norm": 24.0708898597714, "learning_rate": 9.961458326916624e-06, "loss": 3.8478, "step": 428 }, { "epoch": 0.409009653199857, "grad_norm": 19.6896138090046, "learning_rate": 9.960767915705025e-06, "loss": 3.5669, "step": 429 }, { "epoch": 0.4099630556548683, "grad_norm": 19.296170990788163, "learning_rate": 9.960071399824549e-06, "loss": 3.7853, "step": 430 }, { "epoch": 0.41091645810987965, "grad_norm": 27.06718719347339, "learning_rate": 9.959368780132324e-06, "loss": 3.4934, "step": 431 }, { "epoch": 0.41186986056489094, "grad_norm": 23.84975200347006, "learning_rate": 9.958660057492982e-06, "loss": 4.003, "step": 432 }, { "epoch": 0.4128232630199023, "grad_norm": 25.11272668702917, "learning_rate": 9.957945232778674e-06, "loss": 3.7292, "step": 433 }, { "epoch": 0.4137766654749136, "grad_norm": 21.358838634102593, "learning_rate": 9.957224306869053e-06, "loss": 3.665, "step": 434 }, { "epoch": 0.4147300679299249, "grad_norm": 20.904332672362962, "learning_rate": 9.956497280651283e-06, "loss": 3.9544, "step": 435 }, { "epoch": 0.41568347038493625, "grad_norm": 23.53178156667075, "learning_rate": 9.955764155020037e-06, "loss": 3.6883, "step": 436 }, { "epoch": 0.41663687283994755, "grad_norm": 27.332696928268895, "learning_rate": 9.955024930877489e-06, "loss": 3.708, "step": 437 }, { "epoch": 0.4175902752949589, "grad_norm": 21.999999863763485, "learning_rate": 9.954279609133322e-06, "loss": 3.5586, "step": 438 }, { "epoch": 0.4185436777499702, "grad_norm": 21.26009355957675, "learning_rate": 9.95352819070472e-06, "loss": 3.7641, "step": 439 }, { "epoch": 0.41949708020498155, "grad_norm": 21.185091598709004, "learning_rate": 9.952770676516372e-06, "loss": 3.6766, "step": 440 }, { "epoch": 0.42045048265999285, "grad_norm": 27.433963388127342, "learning_rate": 9.952007067500467e-06, "loss": 3.683, "step": 441 }, { "epoch": 0.42140388511500415, "grad_norm": 20.995010146484773, "learning_rate": 9.951237364596692e-06, "loss": 3.9408, "step": 442 }, { "epoch": 0.4223572875700155, "grad_norm": 28.403395863562366, "learning_rate": 9.950461568752238e-06, "loss": 3.6357, "step": 443 }, { "epoch": 0.4233106900250268, "grad_norm": 21.4313427074426, "learning_rate": 9.94967968092179e-06, "loss": 3.9362, "step": 444 }, { "epoch": 0.42426409248003816, "grad_norm": 23.336777993606553, "learning_rate": 9.948891702067532e-06, "loss": 3.688, "step": 445 }, { "epoch": 0.42521749493504946, "grad_norm": 25.082244542851917, "learning_rate": 9.94809763315914e-06, "loss": 3.9122, "step": 446 }, { "epoch": 0.42617089739006075, "grad_norm": 22.029424535884758, "learning_rate": 9.947297475173788e-06, "loss": 3.7623, "step": 447 }, { "epoch": 0.4271242998450721, "grad_norm": 29.39877843896823, "learning_rate": 9.946491229096143e-06, "loss": 3.5193, "step": 448 }, { "epoch": 0.4280777023000834, "grad_norm": 19.500776873908453, "learning_rate": 9.945678895918363e-06, "loss": 3.617, "step": 449 }, { "epoch": 0.42903110475509476, "grad_norm": 25.841216378149593, "learning_rate": 9.944860476640096e-06, "loss": 3.917, "step": 450 }, { "epoch": 0.42998450721010606, "grad_norm": 24.845243903724246, "learning_rate": 9.94403597226848e-06, "loss": 3.7091, "step": 451 }, { "epoch": 0.4309379096651174, "grad_norm": 21.028319478549577, "learning_rate": 9.943205383818142e-06, "loss": 3.7767, "step": 452 }, { "epoch": 0.4318913121201287, "grad_norm": 22.339279748526433, "learning_rate": 9.942368712311194e-06, "loss": 3.7668, "step": 453 }, { "epoch": 0.43284471457514, "grad_norm": 22.757462343479993, "learning_rate": 9.941525958777237e-06, "loss": 3.8904, "step": 454 }, { "epoch": 0.43379811703015136, "grad_norm": 23.82843667345077, "learning_rate": 9.940677124253353e-06, "loss": 3.8399, "step": 455 }, { "epoch": 0.43475151948516266, "grad_norm": 20.697743950309015, "learning_rate": 9.93982220978411e-06, "loss": 3.9321, "step": 456 }, { "epoch": 0.435704921940174, "grad_norm": 24.460044914293334, "learning_rate": 9.938961216421557e-06, "loss": 3.6391, "step": 457 }, { "epoch": 0.4366583243951853, "grad_norm": 19.497760425395636, "learning_rate": 9.938094145225223e-06, "loss": 3.7083, "step": 458 }, { "epoch": 0.4376117268501966, "grad_norm": 16.870906672995407, "learning_rate": 9.937220997262116e-06, "loss": 3.9975, "step": 459 }, { "epoch": 0.43856512930520797, "grad_norm": 21.85144233482734, "learning_rate": 9.936341773606723e-06, "loss": 3.6864, "step": 460 }, { "epoch": 0.43951853176021927, "grad_norm": 18.991354772483863, "learning_rate": 9.935456475341008e-06, "loss": 3.5741, "step": 461 }, { "epoch": 0.4404719342152306, "grad_norm": 22.805010093624716, "learning_rate": 9.93456510355441e-06, "loss": 3.7004, "step": 462 }, { "epoch": 0.4414253366702419, "grad_norm": 19.211561618358292, "learning_rate": 9.93366765934384e-06, "loss": 3.839, "step": 463 }, { "epoch": 0.4423787391252533, "grad_norm": 17.65670428744854, "learning_rate": 9.932764143813686e-06, "loss": 3.6399, "step": 464 }, { "epoch": 0.44333214158026457, "grad_norm": 23.707248867407543, "learning_rate": 9.931854558075804e-06, "loss": 3.7768, "step": 465 }, { "epoch": 0.44428554403527587, "grad_norm": 21.04032471725381, "learning_rate": 9.930938903249517e-06, "loss": 3.8168, "step": 466 }, { "epoch": 0.4452389464902872, "grad_norm": 23.31497517987919, "learning_rate": 9.930017180461624e-06, "loss": 3.6433, "step": 467 }, { "epoch": 0.4461923489452985, "grad_norm": 21.613858132316864, "learning_rate": 9.929089390846389e-06, "loss": 3.8294, "step": 468 }, { "epoch": 0.4471457514003099, "grad_norm": 21.73324953517127, "learning_rate": 9.928155535545535e-06, "loss": 3.7869, "step": 469 }, { "epoch": 0.4480991538553212, "grad_norm": 17.7207401858102, "learning_rate": 9.927215615708258e-06, "loss": 3.7671, "step": 470 }, { "epoch": 0.4490525563103325, "grad_norm": 20.882146213677945, "learning_rate": 9.926269632491211e-06, "loss": 3.7474, "step": 471 }, { "epoch": 0.4500059587653438, "grad_norm": 20.52127752728802, "learning_rate": 9.925317587058516e-06, "loss": 3.7746, "step": 472 }, { "epoch": 0.4509593612203551, "grad_norm": 22.587647066912982, "learning_rate": 9.924359480581746e-06, "loss": 3.8897, "step": 473 }, { "epoch": 0.4519127636753665, "grad_norm": 22.880860776994922, "learning_rate": 9.923395314239937e-06, "loss": 3.7661, "step": 474 }, { "epoch": 0.4528661661303778, "grad_norm": 24.913424914966274, "learning_rate": 9.922425089219581e-06, "loss": 3.5596, "step": 475 }, { "epoch": 0.45381956858538913, "grad_norm": 20.00062998609751, "learning_rate": 9.92144880671463e-06, "loss": 3.7241, "step": 476 }, { "epoch": 0.45477297104040043, "grad_norm": 21.576399192931476, "learning_rate": 9.920466467926485e-06, "loss": 3.6207, "step": 477 }, { "epoch": 0.45572637349541173, "grad_norm": 21.753547140065635, "learning_rate": 9.919478074064002e-06, "loss": 3.9479, "step": 478 }, { "epoch": 0.4566797759504231, "grad_norm": 20.586203615407097, "learning_rate": 9.918483626343487e-06, "loss": 3.6118, "step": 479 }, { "epoch": 0.4576331784054344, "grad_norm": 19.66092222332852, "learning_rate": 9.9174831259887e-06, "loss": 3.4812, "step": 480 }, { "epoch": 0.45858658086044574, "grad_norm": 20.220240034753186, "learning_rate": 9.916476574230842e-06, "loss": 3.6758, "step": 481 }, { "epoch": 0.45953998331545703, "grad_norm": 22.260972705295757, "learning_rate": 9.91546397230857e-06, "loss": 3.934, "step": 482 }, { "epoch": 0.46049338577046833, "grad_norm": 21.717708816178355, "learning_rate": 9.914445321467976e-06, "loss": 3.4961, "step": 483 }, { "epoch": 0.4614467882254797, "grad_norm": 22.863740855455056, "learning_rate": 9.913420622962606e-06, "loss": 3.9005, "step": 484 }, { "epoch": 0.462400190680491, "grad_norm": 18.747129502069885, "learning_rate": 9.912389878053439e-06, "loss": 3.5449, "step": 485 }, { "epoch": 0.46335359313550234, "grad_norm": 19.301371994343448, "learning_rate": 9.911353088008901e-06, "loss": 3.6942, "step": 486 }, { "epoch": 0.46430699559051364, "grad_norm": 22.749654515260634, "learning_rate": 9.910310254104856e-06, "loss": 3.8765, "step": 487 }, { "epoch": 0.465260398045525, "grad_norm": 18.689445817129766, "learning_rate": 9.909261377624601e-06, "loss": 3.8212, "step": 488 }, { "epoch": 0.4662138005005363, "grad_norm": 17.949560508852077, "learning_rate": 9.908206459858875e-06, "loss": 3.5174, "step": 489 }, { "epoch": 0.4671672029555476, "grad_norm": 21.6034002913404, "learning_rate": 9.907145502105847e-06, "loss": 3.5386, "step": 490 }, { "epoch": 0.46812060541055894, "grad_norm": 26.564678367233682, "learning_rate": 9.906078505671126e-06, "loss": 3.7042, "step": 491 }, { "epoch": 0.46907400786557024, "grad_norm": 18.281686598199197, "learning_rate": 9.90500547186774e-06, "loss": 3.6408, "step": 492 }, { "epoch": 0.4700274103205816, "grad_norm": 24.168794419626643, "learning_rate": 9.903926402016153e-06, "loss": 3.7448, "step": 493 }, { "epoch": 0.4709808127755929, "grad_norm": 21.271743723106155, "learning_rate": 9.90284129744426e-06, "loss": 3.9969, "step": 494 }, { "epoch": 0.4719342152306042, "grad_norm": 23.692820146601257, "learning_rate": 9.90175015948738e-06, "loss": 3.9245, "step": 495 }, { "epoch": 0.47288761768561555, "grad_norm": 20.478134190475245, "learning_rate": 9.900652989488255e-06, "loss": 3.5351, "step": 496 }, { "epoch": 0.47384102014062685, "grad_norm": 21.92641826899236, "learning_rate": 9.89954978879705e-06, "loss": 3.5021, "step": 497 }, { "epoch": 0.4747944225956382, "grad_norm": 17.01857452163401, "learning_rate": 9.898440558771351e-06, "loss": 3.5226, "step": 498 }, { "epoch": 0.4757478250506495, "grad_norm": 17.501323408383087, "learning_rate": 9.897325300776168e-06, "loss": 3.4794, "step": 499 }, { "epoch": 0.47670122750566085, "grad_norm": 21.67887406223962, "learning_rate": 9.896204016183924e-06, "loss": 3.6378, "step": 500 }, { "epoch": 0.47765462996067215, "grad_norm": 21.6383126187888, "learning_rate": 9.89507670637446e-06, "loss": 3.4078, "step": 501 }, { "epoch": 0.47860803241568345, "grad_norm": 21.501535790499663, "learning_rate": 9.893943372735034e-06, "loss": 3.9334, "step": 502 }, { "epoch": 0.4795614348706948, "grad_norm": 19.149406531808157, "learning_rate": 9.892804016660308e-06, "loss": 3.6477, "step": 503 }, { "epoch": 0.4805148373257061, "grad_norm": 24.090012370703967, "learning_rate": 9.891658639552368e-06, "loss": 3.5183, "step": 504 }, { "epoch": 0.48146823978071746, "grad_norm": 22.820482591011622, "learning_rate": 9.890507242820702e-06, "loss": 4.0411, "step": 505 }, { "epoch": 0.48242164223572875, "grad_norm": 21.748774593769937, "learning_rate": 9.889349827882204e-06, "loss": 3.7529, "step": 506 }, { "epoch": 0.48337504469074005, "grad_norm": 20.700589475223037, "learning_rate": 9.888186396161178e-06, "loss": 3.7559, "step": 507 }, { "epoch": 0.4843284471457514, "grad_norm": 24.674241545063897, "learning_rate": 9.887016949089334e-06, "loss": 3.6933, "step": 508 }, { "epoch": 0.4852818496007627, "grad_norm": 18.479174093649252, "learning_rate": 9.885841488105777e-06, "loss": 3.9128, "step": 509 }, { "epoch": 0.48623525205577406, "grad_norm": 22.510918094937708, "learning_rate": 9.884660014657018e-06, "loss": 3.8064, "step": 510 }, { "epoch": 0.48718865451078536, "grad_norm": 19.60226278435774, "learning_rate": 9.883472530196968e-06, "loss": 3.6201, "step": 511 }, { "epoch": 0.4881420569657967, "grad_norm": 21.689246702593625, "learning_rate": 9.882279036186927e-06, "loss": 3.8281, "step": 512 }, { "epoch": 0.489095459420808, "grad_norm": 19.79504337399451, "learning_rate": 9.881079534095604e-06, "loss": 3.8338, "step": 513 }, { "epoch": 0.4900488618758193, "grad_norm": 22.154732008938083, "learning_rate": 9.879874025399088e-06, "loss": 3.6718, "step": 514 }, { "epoch": 0.49100226433083066, "grad_norm": 27.89576657992264, "learning_rate": 9.878662511580868e-06, "loss": 3.7502, "step": 515 }, { "epoch": 0.49195566678584196, "grad_norm": 20.488129831400013, "learning_rate": 9.87744499413182e-06, "loss": 3.6914, "step": 516 }, { "epoch": 0.4929090692408533, "grad_norm": 22.913303137887805, "learning_rate": 9.876221474550207e-06, "loss": 3.9242, "step": 517 }, { "epoch": 0.4938624716958646, "grad_norm": 19.005073265502364, "learning_rate": 9.874991954341681e-06, "loss": 3.7033, "step": 518 }, { "epoch": 0.4948158741508759, "grad_norm": 19.73163273513753, "learning_rate": 9.873756435019276e-06, "loss": 3.6113, "step": 519 }, { "epoch": 0.49576927660588727, "grad_norm": 20.067601694509325, "learning_rate": 9.872514918103407e-06, "loss": 3.4186, "step": 520 }, { "epoch": 0.49672267906089856, "grad_norm": 24.152017373014665, "learning_rate": 9.871267405121874e-06, "loss": 3.8649, "step": 521 }, { "epoch": 0.4976760815159099, "grad_norm": 20.738040558821528, "learning_rate": 9.870013897609853e-06, "loss": 3.8536, "step": 522 }, { "epoch": 0.4986294839709212, "grad_norm": 20.65057324623671, "learning_rate": 9.868754397109897e-06, "loss": 3.7033, "step": 523 }, { "epoch": 0.49958288642593257, "grad_norm": 23.875536693649597, "learning_rate": 9.867488905171934e-06, "loss": 3.9346, "step": 524 }, { "epoch": 0.5005362888809438, "grad_norm": 26.780095560637736, "learning_rate": 9.866217423353264e-06, "loss": 3.7448, "step": 525 }, { "epoch": 0.5014896913359552, "grad_norm": 19.449794988207028, "learning_rate": 9.864939953218562e-06, "loss": 3.6883, "step": 526 }, { "epoch": 0.5024430937909665, "grad_norm": 18.970472129382294, "learning_rate": 9.863656496339867e-06, "loss": 3.4693, "step": 527 }, { "epoch": 0.5033964962459778, "grad_norm": 22.53149596496983, "learning_rate": 9.86236705429659e-06, "loss": 3.6544, "step": 528 }, { "epoch": 0.5043498987009891, "grad_norm": 22.080100385407608, "learning_rate": 9.8610716286755e-06, "loss": 3.7927, "step": 529 }, { "epoch": 0.5053033011560005, "grad_norm": 21.25199910203447, "learning_rate": 9.859770221070739e-06, "loss": 3.7448, "step": 530 }, { "epoch": 0.5062567036110118, "grad_norm": 20.96894689343104, "learning_rate": 9.858462833083804e-06, "loss": 3.6107, "step": 531 }, { "epoch": 0.5072101060660231, "grad_norm": 22.15427175219691, "learning_rate": 9.85714946632355e-06, "loss": 3.7265, "step": 532 }, { "epoch": 0.5081635085210344, "grad_norm": 19.38202929338937, "learning_rate": 9.855830122406198e-06, "loss": 3.4208, "step": 533 }, { "epoch": 0.5091169109760457, "grad_norm": 24.017230961214263, "learning_rate": 9.854504802955314e-06, "loss": 3.5762, "step": 534 }, { "epoch": 0.5100703134310571, "grad_norm": 20.034556668149094, "learning_rate": 9.853173509601825e-06, "loss": 3.5565, "step": 535 }, { "epoch": 0.5110237158860684, "grad_norm": 23.97931362916978, "learning_rate": 9.851836243984005e-06, "loss": 3.6289, "step": 536 }, { "epoch": 0.5119771183410797, "grad_norm": 20.16590497169743, "learning_rate": 9.85049300774748e-06, "loss": 3.497, "step": 537 }, { "epoch": 0.512930520796091, "grad_norm": 29.63612456579063, "learning_rate": 9.84914380254522e-06, "loss": 3.7224, "step": 538 }, { "epoch": 0.5138839232511023, "grad_norm": 26.369768494326454, "learning_rate": 9.847788630037546e-06, "loss": 3.6074, "step": 539 }, { "epoch": 0.5148373257061137, "grad_norm": 17.583810637456732, "learning_rate": 9.846427491892117e-06, "loss": 3.658, "step": 540 }, { "epoch": 0.515790728161125, "grad_norm": 23.326235858603575, "learning_rate": 9.845060389783938e-06, "loss": 3.8201, "step": 541 }, { "epoch": 0.5167441306161363, "grad_norm": 22.70581580968102, "learning_rate": 9.843687325395347e-06, "loss": 3.8669, "step": 542 }, { "epoch": 0.5176975330711476, "grad_norm": 18.18077132595944, "learning_rate": 9.842308300416024e-06, "loss": 3.5563, "step": 543 }, { "epoch": 0.5186509355261589, "grad_norm": 21.08003883371943, "learning_rate": 9.840923316542984e-06, "loss": 3.8657, "step": 544 }, { "epoch": 0.5196043379811703, "grad_norm": 20.583022829089906, "learning_rate": 9.839532375480572e-06, "loss": 3.6966, "step": 545 }, { "epoch": 0.5205577404361816, "grad_norm": 21.713932385389782, "learning_rate": 9.838135478940464e-06, "loss": 3.5607, "step": 546 }, { "epoch": 0.5215111428911929, "grad_norm": 20.744762307744175, "learning_rate": 9.836732628641671e-06, "loss": 3.5523, "step": 547 }, { "epoch": 0.5224645453462042, "grad_norm": 17.732029311107617, "learning_rate": 9.835323826310522e-06, "loss": 3.698, "step": 548 }, { "epoch": 0.5234179478012155, "grad_norm": 20.826726755623106, "learning_rate": 9.833909073680674e-06, "loss": 3.58, "step": 549 }, { "epoch": 0.5243713502562269, "grad_norm": 20.212656629244293, "learning_rate": 9.83248837249311e-06, "loss": 3.7048, "step": 550 }, { "epoch": 0.5253247527112382, "grad_norm": 20.325584495140575, "learning_rate": 9.831061724496124e-06, "loss": 3.7106, "step": 551 }, { "epoch": 0.5262781551662495, "grad_norm": 19.48440559873833, "learning_rate": 9.829629131445342e-06, "loss": 3.5681, "step": 552 }, { "epoch": 0.5272315576212608, "grad_norm": 17.652715663374366, "learning_rate": 9.828190595103693e-06, "loss": 3.5506, "step": 553 }, { "epoch": 0.5281849600762722, "grad_norm": 19.928957114107362, "learning_rate": 9.826746117241424e-06, "loss": 3.5265, "step": 554 }, { "epoch": 0.5291383625312835, "grad_norm": 22.834679928864244, "learning_rate": 9.825295699636098e-06, "loss": 3.7772, "step": 555 }, { "epoch": 0.5300917649862948, "grad_norm": 18.634539632738708, "learning_rate": 9.823839344072582e-06, "loss": 3.5919, "step": 556 }, { "epoch": 0.5310451674413061, "grad_norm": 22.802956203475954, "learning_rate": 9.822377052343048e-06, "loss": 3.8807, "step": 557 }, { "epoch": 0.5319985698963174, "grad_norm": 23.447890908203075, "learning_rate": 9.820908826246982e-06, "loss": 3.7494, "step": 558 }, { "epoch": 0.5329519723513289, "grad_norm": 19.62613187694099, "learning_rate": 9.819434667591168e-06, "loss": 4.038, "step": 559 }, { "epoch": 0.5339053748063401, "grad_norm": 23.318946770573415, "learning_rate": 9.817954578189686e-06, "loss": 3.6463, "step": 560 }, { "epoch": 0.5348587772613514, "grad_norm": 18.834056787209438, "learning_rate": 9.81646855986392e-06, "loss": 3.6913, "step": 561 }, { "epoch": 0.5358121797163627, "grad_norm": 22.886717182568567, "learning_rate": 9.81497661444255e-06, "loss": 3.8203, "step": 562 }, { "epoch": 0.536765582171374, "grad_norm": 23.558250869375392, "learning_rate": 9.813478743761547e-06, "loss": 3.6129, "step": 563 }, { "epoch": 0.5377189846263855, "grad_norm": 20.437106318501243, "learning_rate": 9.811974949664176e-06, "loss": 3.6788, "step": 564 }, { "epoch": 0.5386723870813968, "grad_norm": 26.527108886440313, "learning_rate": 9.810465234000988e-06, "loss": 3.8986, "step": 565 }, { "epoch": 0.539625789536408, "grad_norm": 19.119890978007632, "learning_rate": 9.808949598629825e-06, "loss": 3.657, "step": 566 }, { "epoch": 0.5405791919914193, "grad_norm": 22.688088398938095, "learning_rate": 9.807428045415813e-06, "loss": 3.7723, "step": 567 }, { "epoch": 0.5415325944464306, "grad_norm": 27.871400720140166, "learning_rate": 9.805900576231358e-06, "loss": 3.909, "step": 568 }, { "epoch": 0.5424859969014421, "grad_norm": 22.11503580470286, "learning_rate": 9.804367192956147e-06, "loss": 3.6967, "step": 569 }, { "epoch": 0.5434393993564534, "grad_norm": 21.044732220713257, "learning_rate": 9.802827897477145e-06, "loss": 3.5583, "step": 570 }, { "epoch": 0.5443928018114647, "grad_norm": 17.286282808287424, "learning_rate": 9.801282691688597e-06, "loss": 3.4467, "step": 571 }, { "epoch": 0.545346204266476, "grad_norm": 22.597744856614682, "learning_rate": 9.79973157749201e-06, "loss": 3.7038, "step": 572 }, { "epoch": 0.5462996067214874, "grad_norm": 18.4516710820031, "learning_rate": 9.798174556796175e-06, "loss": 3.8799, "step": 573 }, { "epoch": 0.5472530091764987, "grad_norm": 24.5245579804903, "learning_rate": 9.796611631517144e-06, "loss": 3.6981, "step": 574 }, { "epoch": 0.54820641163151, "grad_norm": 24.651065791876697, "learning_rate": 9.795042803578233e-06, "loss": 3.8807, "step": 575 }, { "epoch": 0.5491598140865213, "grad_norm": 14.823683750916235, "learning_rate": 9.793468074910028e-06, "loss": 3.5702, "step": 576 }, { "epoch": 0.5501132165415326, "grad_norm": 20.682637849768398, "learning_rate": 9.791887447450375e-06, "loss": 3.5956, "step": 577 }, { "epoch": 0.551066618996544, "grad_norm": 25.951672022571344, "learning_rate": 9.790300923144374e-06, "loss": 3.7326, "step": 578 }, { "epoch": 0.5520200214515553, "grad_norm": 18.226711064073665, "learning_rate": 9.788708503944387e-06, "loss": 3.7201, "step": 579 }, { "epoch": 0.5529734239065666, "grad_norm": 23.088771925053877, "learning_rate": 9.787110191810027e-06, "loss": 3.6497, "step": 580 }, { "epoch": 0.5539268263615779, "grad_norm": 18.44154887278252, "learning_rate": 9.785505988708164e-06, "loss": 3.5736, "step": 581 }, { "epoch": 0.5548802288165892, "grad_norm": 23.920149168839053, "learning_rate": 9.783895896612909e-06, "loss": 3.9025, "step": 582 }, { "epoch": 0.5558336312716006, "grad_norm": 22.50093426555597, "learning_rate": 9.782279917505626e-06, "loss": 3.8219, "step": 583 }, { "epoch": 0.5567870337266119, "grad_norm": 22.187495539892524, "learning_rate": 9.780658053374923e-06, "loss": 3.7387, "step": 584 }, { "epoch": 0.5577404361816232, "grad_norm": 17.89403186626563, "learning_rate": 9.779030306216648e-06, "loss": 3.5512, "step": 585 }, { "epoch": 0.5586938386366345, "grad_norm": 21.23657459577573, "learning_rate": 9.77739667803389e-06, "loss": 3.5558, "step": 586 }, { "epoch": 0.5596472410916458, "grad_norm": 19.234347087200806, "learning_rate": 9.775757170836975e-06, "loss": 3.7948, "step": 587 }, { "epoch": 0.5606006435466572, "grad_norm": 16.79214035309919, "learning_rate": 9.77411178664346e-06, "loss": 3.4618, "step": 588 }, { "epoch": 0.5615540460016685, "grad_norm": 16.483637260588612, "learning_rate": 9.772460527478142e-06, "loss": 3.3762, "step": 589 }, { "epoch": 0.5625074484566798, "grad_norm": 21.29685479809222, "learning_rate": 9.770803395373041e-06, "loss": 3.7169, "step": 590 }, { "epoch": 0.5634608509116911, "grad_norm": 19.228737744222432, "learning_rate": 9.769140392367407e-06, "loss": 3.6304, "step": 591 }, { "epoch": 0.5644142533667024, "grad_norm": 19.136573893206975, "learning_rate": 9.767471520507713e-06, "loss": 3.755, "step": 592 }, { "epoch": 0.5653676558217138, "grad_norm": 19.661929814361855, "learning_rate": 9.765796781847656e-06, "loss": 3.8426, "step": 593 }, { "epoch": 0.5663210582767251, "grad_norm": 26.862202505259862, "learning_rate": 9.76411617844815e-06, "loss": 3.6334, "step": 594 }, { "epoch": 0.5672744607317364, "grad_norm": 19.74942572397459, "learning_rate": 9.762429712377332e-06, "loss": 3.811, "step": 595 }, { "epoch": 0.5682278631867477, "grad_norm": 26.505335623566815, "learning_rate": 9.760737385710546e-06, "loss": 3.7637, "step": 596 }, { "epoch": 0.5691812656417591, "grad_norm": 20.535398999432875, "learning_rate": 9.759039200530354e-06, "loss": 3.7223, "step": 597 }, { "epoch": 0.5701346680967704, "grad_norm": 18.010016703337993, "learning_rate": 9.757335158926521e-06, "loss": 3.7523, "step": 598 }, { "epoch": 0.5710880705517817, "grad_norm": 24.061665506307655, "learning_rate": 9.755625262996028e-06, "loss": 3.8184, "step": 599 }, { "epoch": 0.572041473006793, "grad_norm": 20.662930534880314, "learning_rate": 9.753909514843047e-06, "loss": 3.6991, "step": 600 }, { "epoch": 0.5729948754618043, "grad_norm": 22.59915730204225, "learning_rate": 9.752187916578968e-06, "loss": 3.8152, "step": 601 }, { "epoch": 0.5739482779168157, "grad_norm": 22.876741550555916, "learning_rate": 9.750460470322367e-06, "loss": 3.7121, "step": 602 }, { "epoch": 0.574901680371827, "grad_norm": 17.833467205294987, "learning_rate": 9.748727178199024e-06, "loss": 3.5757, "step": 603 }, { "epoch": 0.5758550828268383, "grad_norm": 19.523246680147277, "learning_rate": 9.746988042341907e-06, "loss": 3.5941, "step": 604 }, { "epoch": 0.5768084852818496, "grad_norm": 24.202009626179205, "learning_rate": 9.745243064891181e-06, "loss": 3.6389, "step": 605 }, { "epoch": 0.5777618877368609, "grad_norm": 24.468302573014594, "learning_rate": 9.743492247994195e-06, "loss": 3.6117, "step": 606 }, { "epoch": 0.5787152901918723, "grad_norm": 21.367412184065202, "learning_rate": 9.741735593805488e-06, "loss": 3.4569, "step": 607 }, { "epoch": 0.5796686926468836, "grad_norm": 20.399358206171843, "learning_rate": 9.739973104486777e-06, "loss": 3.6236, "step": 608 }, { "epoch": 0.5806220951018949, "grad_norm": 23.163741046015346, "learning_rate": 9.738204782206967e-06, "loss": 3.8825, "step": 609 }, { "epoch": 0.5815754975569062, "grad_norm": 18.85420127240173, "learning_rate": 9.73643062914213e-06, "loss": 3.5877, "step": 610 }, { "epoch": 0.5825289000119175, "grad_norm": 24.752019197803662, "learning_rate": 9.73465064747553e-06, "loss": 3.604, "step": 611 }, { "epoch": 0.5834823024669289, "grad_norm": 20.400985316773642, "learning_rate": 9.732864839397585e-06, "loss": 3.5695, "step": 612 }, { "epoch": 0.5844357049219402, "grad_norm": 21.63526897360644, "learning_rate": 9.731073207105897e-06, "loss": 3.5949, "step": 613 }, { "epoch": 0.5853891073769515, "grad_norm": 20.82984988677345, "learning_rate": 9.729275752805226e-06, "loss": 3.6081, "step": 614 }, { "epoch": 0.5863425098319628, "grad_norm": 19.726007021340696, "learning_rate": 9.727472478707504e-06, "loss": 3.4799, "step": 615 }, { "epoch": 0.5872959122869741, "grad_norm": 27.428068220015394, "learning_rate": 9.725663387031818e-06, "loss": 3.6904, "step": 616 }, { "epoch": 0.5882493147419855, "grad_norm": 23.072944356882243, "learning_rate": 9.72384848000442e-06, "loss": 3.7638, "step": 617 }, { "epoch": 0.5892027171969968, "grad_norm": 22.707135221768738, "learning_rate": 9.722027759858715e-06, "loss": 3.6139, "step": 618 }, { "epoch": 0.5901561196520081, "grad_norm": 23.271749280429393, "learning_rate": 9.720201228835258e-06, "loss": 3.7451, "step": 619 }, { "epoch": 0.5911095221070194, "grad_norm": 19.819878942247716, "learning_rate": 9.718368889181763e-06, "loss": 3.7066, "step": 620 }, { "epoch": 0.5920629245620308, "grad_norm": 20.68383168423227, "learning_rate": 9.71653074315309e-06, "loss": 3.6514, "step": 621 }, { "epoch": 0.5930163270170421, "grad_norm": 21.161284665401297, "learning_rate": 9.714686793011236e-06, "loss": 3.5594, "step": 622 }, { "epoch": 0.5939697294720534, "grad_norm": 19.989734499614094, "learning_rate": 9.712837041025352e-06, "loss": 3.48, "step": 623 }, { "epoch": 0.5949231319270647, "grad_norm": 19.365117390377225, "learning_rate": 9.710981489471721e-06, "loss": 3.7226, "step": 624 }, { "epoch": 0.595876534382076, "grad_norm": 18.652427344137482, "learning_rate": 9.709120140633766e-06, "loss": 3.6195, "step": 625 }, { "epoch": 0.5968299368370874, "grad_norm": 27.060298467758635, "learning_rate": 9.707252996802041e-06, "loss": 3.5446, "step": 626 }, { "epoch": 0.5977833392920987, "grad_norm": 18.65321164314461, "learning_rate": 9.705380060274237e-06, "loss": 3.7946, "step": 627 }, { "epoch": 0.59873674174711, "grad_norm": 21.727791360587442, "learning_rate": 9.703501333355167e-06, "loss": 3.7835, "step": 628 }, { "epoch": 0.5996901442021213, "grad_norm": 24.403412165547554, "learning_rate": 9.701616818356775e-06, "loss": 3.8585, "step": 629 }, { "epoch": 0.6006435466571326, "grad_norm": 18.163843179047802, "learning_rate": 9.699726517598125e-06, "loss": 3.5015, "step": 630 }, { "epoch": 0.601596949112144, "grad_norm": 19.891219737205258, "learning_rate": 9.6978304334054e-06, "loss": 3.5384, "step": 631 }, { "epoch": 0.6025503515671553, "grad_norm": 22.84089037413649, "learning_rate": 9.6959285681119e-06, "loss": 3.6063, "step": 632 }, { "epoch": 0.6035037540221666, "grad_norm": 23.280236357811482, "learning_rate": 9.694020924058045e-06, "loss": 3.6926, "step": 633 }, { "epoch": 0.6044571564771779, "grad_norm": 20.78646316485179, "learning_rate": 9.69210750359136e-06, "loss": 3.5049, "step": 634 }, { "epoch": 0.6054105589321892, "grad_norm": 24.23435137701321, "learning_rate": 9.690188309066478e-06, "loss": 3.78, "step": 635 }, { "epoch": 0.6063639613872006, "grad_norm": 20.183228059754374, "learning_rate": 9.68826334284514e-06, "loss": 3.6528, "step": 636 }, { "epoch": 0.6073173638422119, "grad_norm": 21.05142655300872, "learning_rate": 9.686332607296192e-06, "loss": 3.6024, "step": 637 }, { "epoch": 0.6082707662972232, "grad_norm": 24.19383921526575, "learning_rate": 9.684396104795573e-06, "loss": 3.874, "step": 638 }, { "epoch": 0.6092241687522345, "grad_norm": 25.20893002362635, "learning_rate": 9.682453837726324e-06, "loss": 3.7618, "step": 639 }, { "epoch": 0.6101775712072458, "grad_norm": 22.1858755334919, "learning_rate": 9.680505808478583e-06, "loss": 3.8419, "step": 640 }, { "epoch": 0.6111309736622572, "grad_norm": 24.222631959912363, "learning_rate": 9.678552019449567e-06, "loss": 3.828, "step": 641 }, { "epoch": 0.6120843761172685, "grad_norm": 18.379749024446827, "learning_rate": 9.67659247304359e-06, "loss": 3.7115, "step": 642 }, { "epoch": 0.6130377785722798, "grad_norm": 26.539712074737817, "learning_rate": 9.674627171672055e-06, "loss": 3.8905, "step": 643 }, { "epoch": 0.6139911810272911, "grad_norm": 19.082434287939027, "learning_rate": 9.672656117753435e-06, "loss": 3.8144, "step": 644 }, { "epoch": 0.6149445834823025, "grad_norm": 20.91858399398256, "learning_rate": 9.670679313713292e-06, "loss": 3.8152, "step": 645 }, { "epoch": 0.6158979859373138, "grad_norm": 18.478114695655968, "learning_rate": 9.668696761984255e-06, "loss": 3.5668, "step": 646 }, { "epoch": 0.6168513883923251, "grad_norm": 19.613538023085255, "learning_rate": 9.666708465006038e-06, "loss": 3.8092, "step": 647 }, { "epoch": 0.6178047908473364, "grad_norm": 18.741994356146964, "learning_rate": 9.664714425225414e-06, "loss": 3.8201, "step": 648 }, { "epoch": 0.6187581933023477, "grad_norm": 19.607743183951957, "learning_rate": 9.66271464509623e-06, "loss": 3.7204, "step": 649 }, { "epoch": 0.6197115957573591, "grad_norm": 18.551855370043047, "learning_rate": 9.660709127079391e-06, "loss": 3.5559, "step": 650 }, { "epoch": 0.6206649982123704, "grad_norm": 21.816584310234852, "learning_rate": 9.65869787364287e-06, "loss": 3.4561, "step": 651 }, { "epoch": 0.6216184006673817, "grad_norm": 20.686707696320692, "learning_rate": 9.656680887261693e-06, "loss": 3.7697, "step": 652 }, { "epoch": 0.622571803122393, "grad_norm": 20.249848136756725, "learning_rate": 9.654658170417941e-06, "loss": 3.5749, "step": 653 }, { "epoch": 0.6235252055774043, "grad_norm": 23.58747833646611, "learning_rate": 9.652629725600751e-06, "loss": 3.573, "step": 654 }, { "epoch": 0.6244786080324157, "grad_norm": 16.930444505263946, "learning_rate": 9.650595555306303e-06, "loss": 3.9304, "step": 655 }, { "epoch": 0.625432010487427, "grad_norm": 18.377455834731407, "learning_rate": 9.648555662037826e-06, "loss": 3.6634, "step": 656 }, { "epoch": 0.6263854129424383, "grad_norm": 19.009435008900805, "learning_rate": 9.646510048305593e-06, "loss": 3.5771, "step": 657 }, { "epoch": 0.6273388153974496, "grad_norm": 18.650898641604105, "learning_rate": 9.644458716626911e-06, "loss": 3.6231, "step": 658 }, { "epoch": 0.6282922178524609, "grad_norm": 20.615334028005172, "learning_rate": 9.642401669526133e-06, "loss": 3.584, "step": 659 }, { "epoch": 0.6292456203074723, "grad_norm": 20.3241844169688, "learning_rate": 9.640338909534636e-06, "loss": 3.7067, "step": 660 }, { "epoch": 0.6301990227624836, "grad_norm": 22.52696539026348, "learning_rate": 9.63827043919083e-06, "loss": 3.7885, "step": 661 }, { "epoch": 0.6311524252174949, "grad_norm": 17.20811574264691, "learning_rate": 9.636196261040155e-06, "loss": 3.6167, "step": 662 }, { "epoch": 0.6321058276725062, "grad_norm": 19.503293382499418, "learning_rate": 9.634116377635073e-06, "loss": 3.7099, "step": 663 }, { "epoch": 0.6330592301275175, "grad_norm": 21.058625782103448, "learning_rate": 9.632030791535063e-06, "loss": 3.7454, "step": 664 }, { "epoch": 0.6340126325825289, "grad_norm": 20.64622536336895, "learning_rate": 9.62993950530663e-06, "loss": 3.6656, "step": 665 }, { "epoch": 0.6349660350375402, "grad_norm": 22.734340104294198, "learning_rate": 9.627842521523286e-06, "loss": 3.6987, "step": 666 }, { "epoch": 0.6359194374925515, "grad_norm": 21.103417478070998, "learning_rate": 9.625739842765558e-06, "loss": 3.4594, "step": 667 }, { "epoch": 0.6368728399475628, "grad_norm": 22.348458058218583, "learning_rate": 9.62363147162098e-06, "loss": 3.7511, "step": 668 }, { "epoch": 0.6378262424025742, "grad_norm": 19.566746067920327, "learning_rate": 9.621517410684091e-06, "loss": 3.5671, "step": 669 }, { "epoch": 0.6387796448575855, "grad_norm": 20.32526230831538, "learning_rate": 9.619397662556434e-06, "loss": 3.7097, "step": 670 }, { "epoch": 0.6397330473125968, "grad_norm": 22.92070665591996, "learning_rate": 9.617272229846548e-06, "loss": 3.9397, "step": 671 }, { "epoch": 0.6406864497676081, "grad_norm": 19.809354605518987, "learning_rate": 9.615141115169968e-06, "loss": 3.4925, "step": 672 }, { "epoch": 0.6416398522226194, "grad_norm": 21.136311716636015, "learning_rate": 9.61300432114922e-06, "loss": 3.8265, "step": 673 }, { "epoch": 0.6425932546776308, "grad_norm": 20.62650989101661, "learning_rate": 9.610861850413823e-06, "loss": 3.7376, "step": 674 }, { "epoch": 0.6435466571326421, "grad_norm": 28.041220667565888, "learning_rate": 9.608713705600277e-06, "loss": 3.7836, "step": 675 }, { "epoch": 0.6445000595876534, "grad_norm": 18.670052623381736, "learning_rate": 9.606559889352065e-06, "loss": 3.6447, "step": 676 }, { "epoch": 0.6454534620426647, "grad_norm": 20.23840938209968, "learning_rate": 9.604400404319655e-06, "loss": 3.7822, "step": 677 }, { "epoch": 0.646406864497676, "grad_norm": 21.855312897410872, "learning_rate": 9.602235253160484e-06, "loss": 3.8659, "step": 678 }, { "epoch": 0.6473602669526874, "grad_norm": 21.26641319740921, "learning_rate": 9.600064438538963e-06, "loss": 3.6536, "step": 679 }, { "epoch": 0.6483136694076987, "grad_norm": 21.76426151253235, "learning_rate": 9.597887963126476e-06, "loss": 3.8644, "step": 680 }, { "epoch": 0.64926707186271, "grad_norm": 21.421497189712962, "learning_rate": 9.595705829601369e-06, "loss": 3.7837, "step": 681 }, { "epoch": 0.6502204743177213, "grad_norm": 18.89928494171812, "learning_rate": 9.593518040648953e-06, "loss": 3.6508, "step": 682 }, { "epoch": 0.6511738767727326, "grad_norm": 19.890349813794526, "learning_rate": 9.5913245989615e-06, "loss": 3.7314, "step": 683 }, { "epoch": 0.652127279227744, "grad_norm": 19.05003885990834, "learning_rate": 9.589125507238234e-06, "loss": 3.594, "step": 684 }, { "epoch": 0.6530806816827553, "grad_norm": 19.229355807904287, "learning_rate": 9.586920768185335e-06, "loss": 3.7522, "step": 685 }, { "epoch": 0.6540340841377666, "grad_norm": 20.355499852665147, "learning_rate": 9.584710384515931e-06, "loss": 3.6044, "step": 686 }, { "epoch": 0.654987486592778, "grad_norm": 24.08839207674641, "learning_rate": 9.5824943589501e-06, "loss": 3.7015, "step": 687 }, { "epoch": 0.6559408890477892, "grad_norm": 23.131205167360452, "learning_rate": 9.580272694214855e-06, "loss": 3.4363, "step": 688 }, { "epoch": 0.6568942915028007, "grad_norm": 20.60029697606701, "learning_rate": 9.578045393044157e-06, "loss": 3.6026, "step": 689 }, { "epoch": 0.657847693957812, "grad_norm": 18.26924944868734, "learning_rate": 9.575812458178897e-06, "loss": 3.516, "step": 690 }, { "epoch": 0.6588010964128233, "grad_norm": 20.01465660153153, "learning_rate": 9.573573892366903e-06, "loss": 3.9667, "step": 691 }, { "epoch": 0.6597544988678345, "grad_norm": 20.019373545929987, "learning_rate": 9.571329698362931e-06, "loss": 3.7286, "step": 692 }, { "epoch": 0.660707901322846, "grad_norm": 22.06604645406874, "learning_rate": 9.56907987892866e-06, "loss": 4.0036, "step": 693 }, { "epoch": 0.6616613037778573, "grad_norm": 20.778347406097378, "learning_rate": 9.566824436832696e-06, "loss": 3.5915, "step": 694 }, { "epoch": 0.6626147062328686, "grad_norm": 20.704912541338892, "learning_rate": 9.56456337485056e-06, "loss": 3.5859, "step": 695 }, { "epoch": 0.6635681086878799, "grad_norm": 16.559527146532346, "learning_rate": 9.562296695764695e-06, "loss": 3.5461, "step": 696 }, { "epoch": 0.6645215111428912, "grad_norm": 18.577949754772643, "learning_rate": 9.56002440236445e-06, "loss": 3.9377, "step": 697 }, { "epoch": 0.6654749135979026, "grad_norm": 18.358973031792505, "learning_rate": 9.557746497446086e-06, "loss": 3.7591, "step": 698 }, { "epoch": 0.6664283160529139, "grad_norm": 19.031224323754127, "learning_rate": 9.555462983812769e-06, "loss": 3.5001, "step": 699 }, { "epoch": 0.6673817185079252, "grad_norm": 18.228079243708386, "learning_rate": 9.553173864274567e-06, "loss": 3.7274, "step": 700 }, { "epoch": 0.6683351209629365, "grad_norm": 22.215641770895736, "learning_rate": 9.550879141648445e-06, "loss": 4.0366, "step": 701 }, { "epoch": 0.6692885234179478, "grad_norm": 19.585564261167974, "learning_rate": 9.548578818758266e-06, "loss": 3.6245, "step": 702 }, { "epoch": 0.6702419258729592, "grad_norm": 24.365000456557162, "learning_rate": 9.546272898434783e-06, "loss": 3.6171, "step": 703 }, { "epoch": 0.6711953283279705, "grad_norm": 17.223834666769164, "learning_rate": 9.543961383515638e-06, "loss": 3.6585, "step": 704 }, { "epoch": 0.6721487307829818, "grad_norm": 25.776770947574892, "learning_rate": 9.541644276845354e-06, "loss": 3.6884, "step": 705 }, { "epoch": 0.6731021332379931, "grad_norm": 24.002455237600646, "learning_rate": 9.539321581275343e-06, "loss": 3.7257, "step": 706 }, { "epoch": 0.6740555356930044, "grad_norm": 18.936208118314465, "learning_rate": 9.536993299663885e-06, "loss": 3.6101, "step": 707 }, { "epoch": 0.6750089381480158, "grad_norm": 19.365300109990926, "learning_rate": 9.53465943487614e-06, "loss": 3.8618, "step": 708 }, { "epoch": 0.6759623406030271, "grad_norm": 18.871139620715525, "learning_rate": 9.53231998978414e-06, "loss": 3.7846, "step": 709 }, { "epoch": 0.6769157430580384, "grad_norm": 21.165422632654376, "learning_rate": 9.52997496726678e-06, "loss": 3.5044, "step": 710 }, { "epoch": 0.6778691455130497, "grad_norm": 16.822501928757806, "learning_rate": 9.52762437020982e-06, "loss": 3.6925, "step": 711 }, { "epoch": 0.678822547968061, "grad_norm": 20.66951967311793, "learning_rate": 9.52526820150588e-06, "loss": 3.7513, "step": 712 }, { "epoch": 0.6797759504230724, "grad_norm": 21.7454878364122, "learning_rate": 9.522906464054435e-06, "loss": 3.7098, "step": 713 }, { "epoch": 0.6807293528780837, "grad_norm": 24.230077668954667, "learning_rate": 9.520539160761818e-06, "loss": 3.5958, "step": 714 }, { "epoch": 0.681682755333095, "grad_norm": 28.45958381373582, "learning_rate": 9.518166294541205e-06, "loss": 3.4064, "step": 715 }, { "epoch": 0.6826361577881063, "grad_norm": 20.858175472710275, "learning_rate": 9.51578786831262e-06, "loss": 3.6929, "step": 716 }, { "epoch": 0.6835895602431177, "grad_norm": 21.49855401408576, "learning_rate": 9.513403885002928e-06, "loss": 3.6839, "step": 717 }, { "epoch": 0.684542962698129, "grad_norm": 22.01268190257118, "learning_rate": 9.511014347545839e-06, "loss": 3.8249, "step": 718 }, { "epoch": 0.6854963651531403, "grad_norm": 17.252415469926323, "learning_rate": 9.508619258881888e-06, "loss": 3.8537, "step": 719 }, { "epoch": 0.6864497676081516, "grad_norm": 17.2132168213156, "learning_rate": 9.506218621958448e-06, "loss": 3.4969, "step": 720 }, { "epoch": 0.6874031700631629, "grad_norm": 20.519755523375853, "learning_rate": 9.503812439729714e-06, "loss": 3.7068, "step": 721 }, { "epoch": 0.6883565725181743, "grad_norm": 22.165719601009755, "learning_rate": 9.501400715156714e-06, "loss": 3.6985, "step": 722 }, { "epoch": 0.6893099749731856, "grad_norm": 23.181560297672053, "learning_rate": 9.498983451207289e-06, "loss": 3.6968, "step": 723 }, { "epoch": 0.6902633774281969, "grad_norm": 18.09064373596276, "learning_rate": 9.496560650856097e-06, "loss": 3.8815, "step": 724 }, { "epoch": 0.6912167798832082, "grad_norm": 17.267188186350168, "learning_rate": 9.494132317084612e-06, "loss": 3.6695, "step": 725 }, { "epoch": 0.6921701823382195, "grad_norm": 20.837621664305065, "learning_rate": 9.491698452881116e-06, "loss": 3.6708, "step": 726 }, { "epoch": 0.6931235847932309, "grad_norm": 19.64269935169027, "learning_rate": 9.489259061240696e-06, "loss": 3.6419, "step": 727 }, { "epoch": 0.6940769872482422, "grad_norm": 27.148022024287783, "learning_rate": 9.486814145165242e-06, "loss": 3.6963, "step": 728 }, { "epoch": 0.6950303897032535, "grad_norm": 19.70599338015986, "learning_rate": 9.484363707663443e-06, "loss": 3.7389, "step": 729 }, { "epoch": 0.6959837921582648, "grad_norm": 18.363431631792245, "learning_rate": 9.48190775175078e-06, "loss": 3.5963, "step": 730 }, { "epoch": 0.6969371946132761, "grad_norm": 17.844197752058147, "learning_rate": 9.479446280449528e-06, "loss": 3.8274, "step": 731 }, { "epoch": 0.6978905970682875, "grad_norm": 24.625391126557343, "learning_rate": 9.476979296788746e-06, "loss": 3.8068, "step": 732 }, { "epoch": 0.6988439995232988, "grad_norm": 18.14347930519435, "learning_rate": 9.47450680380428e-06, "loss": 3.4668, "step": 733 }, { "epoch": 0.6997974019783101, "grad_norm": 18.53122770370977, "learning_rate": 9.472028804538753e-06, "loss": 3.6973, "step": 734 }, { "epoch": 0.7007508044333214, "grad_norm": 21.206912974604144, "learning_rate": 9.469545302041566e-06, "loss": 3.4894, "step": 735 }, { "epoch": 0.7017042068883327, "grad_norm": 20.17903407207095, "learning_rate": 9.467056299368888e-06, "loss": 3.3542, "step": 736 }, { "epoch": 0.7026576093433441, "grad_norm": 17.9674310395467, "learning_rate": 9.464561799583665e-06, "loss": 3.6327, "step": 737 }, { "epoch": 0.7036110117983554, "grad_norm": 20.936148863171447, "learning_rate": 9.462061805755596e-06, "loss": 3.6793, "step": 738 }, { "epoch": 0.7045644142533667, "grad_norm": 18.429240320228963, "learning_rate": 9.459556320961151e-06, "loss": 3.6845, "step": 739 }, { "epoch": 0.705517816708378, "grad_norm": 19.08792748868632, "learning_rate": 9.457045348283552e-06, "loss": 3.5611, "step": 740 }, { "epoch": 0.7064712191633894, "grad_norm": 20.08640971066131, "learning_rate": 9.454528890812776e-06, "loss": 3.5307, "step": 741 }, { "epoch": 0.7074246216184007, "grad_norm": 20.505622288159657, "learning_rate": 9.45200695164555e-06, "loss": 3.6161, "step": 742 }, { "epoch": 0.708378024073412, "grad_norm": 20.847548858406746, "learning_rate": 9.449479533885343e-06, "loss": 3.7438, "step": 743 }, { "epoch": 0.7093314265284233, "grad_norm": 19.683364918376707, "learning_rate": 9.446946640642372e-06, "loss": 3.9113, "step": 744 }, { "epoch": 0.7102848289834346, "grad_norm": 18.71610055188088, "learning_rate": 9.444408275033588e-06, "loss": 3.6763, "step": 745 }, { "epoch": 0.711238231438446, "grad_norm": 18.045896377830932, "learning_rate": 9.441864440182674e-06, "loss": 3.6881, "step": 746 }, { "epoch": 0.7121916338934573, "grad_norm": 21.265780058733068, "learning_rate": 9.43931513922005e-06, "loss": 3.8869, "step": 747 }, { "epoch": 0.7131450363484686, "grad_norm": 21.16567625045913, "learning_rate": 9.436760375282858e-06, "loss": 3.6461, "step": 748 }, { "epoch": 0.7140984388034799, "grad_norm": 19.147540986215798, "learning_rate": 9.434200151514965e-06, "loss": 3.6281, "step": 749 }, { "epoch": 0.7150518412584912, "grad_norm": 20.68977648697296, "learning_rate": 9.431634471066953e-06, "loss": 3.9236, "step": 750 }, { "epoch": 0.7160052437135026, "grad_norm": 19.1795146527202, "learning_rate": 9.42906333709612e-06, "loss": 3.6103, "step": 751 }, { "epoch": 0.7169586461685139, "grad_norm": 22.992018812499957, "learning_rate": 9.426486752766481e-06, "loss": 3.5799, "step": 752 }, { "epoch": 0.7179120486235252, "grad_norm": 26.96638795877227, "learning_rate": 9.423904721248752e-06, "loss": 3.6099, "step": 753 }, { "epoch": 0.7188654510785365, "grad_norm": 16.44807611738353, "learning_rate": 9.421317245720352e-06, "loss": 3.7608, "step": 754 }, { "epoch": 0.7198188535335478, "grad_norm": 16.25110117876056, "learning_rate": 9.418724329365405e-06, "loss": 3.6593, "step": 755 }, { "epoch": 0.7207722559885592, "grad_norm": 18.163977193914057, "learning_rate": 9.416125975374722e-06, "loss": 3.8098, "step": 756 }, { "epoch": 0.7217256584435705, "grad_norm": 17.661895630689724, "learning_rate": 9.413522186945812e-06, "loss": 3.7988, "step": 757 }, { "epoch": 0.7226790608985818, "grad_norm": 21.468695163405133, "learning_rate": 9.41091296728287e-06, "loss": 3.718, "step": 758 }, { "epoch": 0.7236324633535931, "grad_norm": 18.498339153647127, "learning_rate": 9.408298319596775e-06, "loss": 3.6349, "step": 759 }, { "epoch": 0.7245858658086045, "grad_norm": 18.274277881793196, "learning_rate": 9.405678247105083e-06, "loss": 3.7082, "step": 760 }, { "epoch": 0.7255392682636158, "grad_norm": 21.55643436471065, "learning_rate": 9.403052753032031e-06, "loss": 3.7165, "step": 761 }, { "epoch": 0.7264926707186271, "grad_norm": 19.432579932067085, "learning_rate": 9.40042184060852e-06, "loss": 3.6488, "step": 762 }, { "epoch": 0.7274460731736384, "grad_norm": 22.193957314952293, "learning_rate": 9.397785513072128e-06, "loss": 3.4842, "step": 763 }, { "epoch": 0.7283994756286497, "grad_norm": 19.725157353987544, "learning_rate": 9.395143773667089e-06, "loss": 3.6607, "step": 764 }, { "epoch": 0.7293528780836611, "grad_norm": 20.640973463262576, "learning_rate": 9.3924966256443e-06, "loss": 3.6387, "step": 765 }, { "epoch": 0.7303062805386724, "grad_norm": 17.941816220540925, "learning_rate": 9.389844072261312e-06, "loss": 3.5804, "step": 766 }, { "epoch": 0.7312596829936837, "grad_norm": 20.787232706377324, "learning_rate": 9.38718611678233e-06, "loss": 3.929, "step": 767 }, { "epoch": 0.732213085448695, "grad_norm": 20.83257096851149, "learning_rate": 9.38452276247821e-06, "loss": 3.7154, "step": 768 }, { "epoch": 0.7331664879037063, "grad_norm": 20.59631129013889, "learning_rate": 9.381854012626444e-06, "loss": 3.8433, "step": 769 }, { "epoch": 0.7341198903587177, "grad_norm": 21.928112412829435, "learning_rate": 9.37917987051117e-06, "loss": 3.9252, "step": 770 }, { "epoch": 0.735073292813729, "grad_norm": 19.634095995187717, "learning_rate": 9.376500339423155e-06, "loss": 3.6393, "step": 771 }, { "epoch": 0.7360266952687403, "grad_norm": 19.746497257664743, "learning_rate": 9.373815422659806e-06, "loss": 3.7373, "step": 772 }, { "epoch": 0.7369800977237516, "grad_norm": 19.13565174122058, "learning_rate": 9.371125123525153e-06, "loss": 3.8144, "step": 773 }, { "epoch": 0.7379335001787629, "grad_norm": 18.87139457745093, "learning_rate": 9.368429445329848e-06, "loss": 3.9137, "step": 774 }, { "epoch": 0.7388869026337743, "grad_norm": 20.184140255773578, "learning_rate": 9.365728391391164e-06, "loss": 3.5493, "step": 775 }, { "epoch": 0.7398403050887856, "grad_norm": 19.510004396878593, "learning_rate": 9.363021965032993e-06, "loss": 3.7985, "step": 776 }, { "epoch": 0.7407937075437969, "grad_norm": 20.726697088342362, "learning_rate": 9.360310169585828e-06, "loss": 3.6484, "step": 777 }, { "epoch": 0.7417471099988082, "grad_norm": 20.55938718168173, "learning_rate": 9.357593008386786e-06, "loss": 3.6801, "step": 778 }, { "epoch": 0.7427005124538195, "grad_norm": 21.4585514230187, "learning_rate": 9.354870484779569e-06, "loss": 3.6149, "step": 779 }, { "epoch": 0.7436539149088309, "grad_norm": 18.682408551961295, "learning_rate": 9.352142602114487e-06, "loss": 3.5327, "step": 780 }, { "epoch": 0.7446073173638422, "grad_norm": 17.921647468959428, "learning_rate": 9.349409363748445e-06, "loss": 3.7582, "step": 781 }, { "epoch": 0.7455607198188535, "grad_norm": 18.117167418057033, "learning_rate": 9.346670773044939e-06, "loss": 3.6906, "step": 782 }, { "epoch": 0.7465141222738648, "grad_norm": 18.36409282490528, "learning_rate": 9.343926833374048e-06, "loss": 3.7457, "step": 783 }, { "epoch": 0.7474675247288762, "grad_norm": 19.705711385770837, "learning_rate": 9.341177548112437e-06, "loss": 3.7325, "step": 784 }, { "epoch": 0.7484209271838875, "grad_norm": 16.018973496773814, "learning_rate": 9.338422920643345e-06, "loss": 3.6859, "step": 785 }, { "epoch": 0.7493743296388988, "grad_norm": 27.0323731950641, "learning_rate": 9.335662954356591e-06, "loss": 3.6282, "step": 786 }, { "epoch": 0.7503277320939101, "grad_norm": 20.68806712211132, "learning_rate": 9.332897652648556e-06, "loss": 3.9263, "step": 787 }, { "epoch": 0.7512811345489214, "grad_norm": 24.93793583745072, "learning_rate": 9.330127018922195e-06, "loss": 3.7687, "step": 788 }, { "epoch": 0.7522345370039328, "grad_norm": 21.906512239103833, "learning_rate": 9.327351056587018e-06, "loss": 3.8643, "step": 789 }, { "epoch": 0.7531879394589441, "grad_norm": 20.69728361639571, "learning_rate": 9.324569769059098e-06, "loss": 3.7677, "step": 790 }, { "epoch": 0.7541413419139554, "grad_norm": 25.653947742518593, "learning_rate": 9.321783159761057e-06, "loss": 3.6784, "step": 791 }, { "epoch": 0.7550947443689667, "grad_norm": 17.10223783896589, "learning_rate": 9.318991232122065e-06, "loss": 3.6255, "step": 792 }, { "epoch": 0.756048146823978, "grad_norm": 17.649961793968913, "learning_rate": 9.316193989577845e-06, "loss": 3.4723, "step": 793 }, { "epoch": 0.7570015492789894, "grad_norm": 18.57244928311512, "learning_rate": 9.313391435570645e-06, "loss": 3.652, "step": 794 }, { "epoch": 0.7579549517340007, "grad_norm": 17.344232333027193, "learning_rate": 9.310583573549265e-06, "loss": 3.7092, "step": 795 }, { "epoch": 0.758908354189012, "grad_norm": 18.94377056516029, "learning_rate": 9.307770406969032e-06, "loss": 3.7182, "step": 796 }, { "epoch": 0.7598617566440233, "grad_norm": 19.703100666436754, "learning_rate": 9.304951939291794e-06, "loss": 3.7154, "step": 797 }, { "epoch": 0.7608151590990346, "grad_norm": 18.617127434407745, "learning_rate": 9.30212817398593e-06, "loss": 3.6527, "step": 798 }, { "epoch": 0.761768561554046, "grad_norm": 20.068515638202967, "learning_rate": 9.299299114526335e-06, "loss": 3.8514, "step": 799 }, { "epoch": 0.7627219640090573, "grad_norm": 16.270066366693513, "learning_rate": 9.296464764394422e-06, "loss": 3.3792, "step": 800 }, { "epoch": 0.7636753664640686, "grad_norm": 16.867319402486906, "learning_rate": 9.293625127078112e-06, "loss": 3.6871, "step": 801 }, { "epoch": 0.7646287689190799, "grad_norm": 28.32738937531431, "learning_rate": 9.290780206071831e-06, "loss": 3.7857, "step": 802 }, { "epoch": 0.7655821713740912, "grad_norm": 23.278201590345795, "learning_rate": 9.28793000487651e-06, "loss": 3.8684, "step": 803 }, { "epoch": 0.7665355738291026, "grad_norm": 21.00261675206655, "learning_rate": 9.285074526999577e-06, "loss": 3.6881, "step": 804 }, { "epoch": 0.7674889762841139, "grad_norm": 17.580782501612884, "learning_rate": 9.282213775954951e-06, "loss": 3.761, "step": 805 }, { "epoch": 0.7684423787391252, "grad_norm": 19.563866900119418, "learning_rate": 9.279347755263046e-06, "loss": 3.6123, "step": 806 }, { "epoch": 0.7693957811941365, "grad_norm": 20.41001266709308, "learning_rate": 9.276476468450754e-06, "loss": 3.6722, "step": 807 }, { "epoch": 0.770349183649148, "grad_norm": 21.63621546928935, "learning_rate": 9.273599919051452e-06, "loss": 3.6484, "step": 808 }, { "epoch": 0.7713025861041592, "grad_norm": 20.11536943524368, "learning_rate": 9.27071811060499e-06, "loss": 3.6375, "step": 809 }, { "epoch": 0.7722559885591705, "grad_norm": 21.239669722462263, "learning_rate": 9.267831046657693e-06, "loss": 3.6743, "step": 810 }, { "epoch": 0.7732093910141818, "grad_norm": 19.101626903927976, "learning_rate": 9.26493873076235e-06, "loss": 3.6978, "step": 811 }, { "epoch": 0.7741627934691931, "grad_norm": 21.404247167103236, "learning_rate": 9.262041166478215e-06, "loss": 3.9582, "step": 812 }, { "epoch": 0.7751161959242046, "grad_norm": 17.858512989400538, "learning_rate": 9.259138357370998e-06, "loss": 3.5537, "step": 813 }, { "epoch": 0.7760695983792159, "grad_norm": 19.461707117310045, "learning_rate": 9.25623030701287e-06, "loss": 3.951, "step": 814 }, { "epoch": 0.7770230008342272, "grad_norm": 19.264748708251272, "learning_rate": 9.253317018982444e-06, "loss": 3.6148, "step": 815 }, { "epoch": 0.7779764032892384, "grad_norm": 18.59873025433661, "learning_rate": 9.250398496864782e-06, "loss": 3.7723, "step": 816 }, { "epoch": 0.7789298057442497, "grad_norm": 21.844105952026503, "learning_rate": 9.247474744251387e-06, "loss": 3.6615, "step": 817 }, { "epoch": 0.7798832081992612, "grad_norm": 19.38172288807973, "learning_rate": 9.244545764740199e-06, "loss": 3.4786, "step": 818 }, { "epoch": 0.7808366106542725, "grad_norm": 15.871018100931154, "learning_rate": 9.241611561935589e-06, "loss": 3.7503, "step": 819 }, { "epoch": 0.7817900131092838, "grad_norm": 21.307845145202705, "learning_rate": 9.238672139448354e-06, "loss": 3.7427, "step": 820 }, { "epoch": 0.782743415564295, "grad_norm": 20.200919605667305, "learning_rate": 9.235727500895721e-06, "loss": 3.6132, "step": 821 }, { "epoch": 0.7836968180193064, "grad_norm": 21.39396047658353, "learning_rate": 9.232777649901327e-06, "loss": 3.7095, "step": 822 }, { "epoch": 0.7846502204743178, "grad_norm": 16.141032634513888, "learning_rate": 9.22982259009523e-06, "loss": 3.6662, "step": 823 }, { "epoch": 0.7856036229293291, "grad_norm": 20.690308594036807, "learning_rate": 9.226862325113894e-06, "loss": 3.6948, "step": 824 }, { "epoch": 0.7865570253843404, "grad_norm": 20.580936769623378, "learning_rate": 9.223896858600192e-06, "loss": 3.6344, "step": 825 }, { "epoch": 0.7875104278393517, "grad_norm": 23.27336220764639, "learning_rate": 9.220926194203394e-06, "loss": 3.6191, "step": 826 }, { "epoch": 0.788463830294363, "grad_norm": 19.71121204825298, "learning_rate": 9.217950335579169e-06, "loss": 3.9559, "step": 827 }, { "epoch": 0.7894172327493744, "grad_norm": 19.67807996058843, "learning_rate": 9.214969286389577e-06, "loss": 3.5608, "step": 828 }, { "epoch": 0.7903706352043857, "grad_norm": 20.44982213291286, "learning_rate": 9.211983050303067e-06, "loss": 3.7392, "step": 829 }, { "epoch": 0.791324037659397, "grad_norm": 17.34304601787974, "learning_rate": 9.208991630994466e-06, "loss": 3.783, "step": 830 }, { "epoch": 0.7922774401144083, "grad_norm": 19.052092863299986, "learning_rate": 9.20599503214499e-06, "loss": 3.9628, "step": 831 }, { "epoch": 0.7932308425694197, "grad_norm": 18.335484765464646, "learning_rate": 9.202993257442216e-06, "loss": 3.5759, "step": 832 }, { "epoch": 0.794184245024431, "grad_norm": 22.867412599476896, "learning_rate": 9.1999863105801e-06, "loss": 3.7729, "step": 833 }, { "epoch": 0.7951376474794423, "grad_norm": 18.61485250895624, "learning_rate": 9.196974195258957e-06, "loss": 3.6906, "step": 834 }, { "epoch": 0.7960910499344536, "grad_norm": 17.086617371596514, "learning_rate": 9.193956915185466e-06, "loss": 3.6429, "step": 835 }, { "epoch": 0.7970444523894649, "grad_norm": 18.1730324345673, "learning_rate": 9.190934474072658e-06, "loss": 3.516, "step": 836 }, { "epoch": 0.7979978548444763, "grad_norm": 17.225036510909483, "learning_rate": 9.18790687563992e-06, "loss": 3.7818, "step": 837 }, { "epoch": 0.7989512572994876, "grad_norm": 20.910134149861555, "learning_rate": 9.184874123612983e-06, "loss": 3.2522, "step": 838 }, { "epoch": 0.7999046597544989, "grad_norm": 27.49357300862323, "learning_rate": 9.181836221723916e-06, "loss": 3.7567, "step": 839 }, { "epoch": 0.8008580622095102, "grad_norm": 20.859777004204712, "learning_rate": 9.178793173711133e-06, "loss": 3.5255, "step": 840 }, { "epoch": 0.8018114646645215, "grad_norm": 17.88637704538415, "learning_rate": 9.175744983319374e-06, "loss": 3.6831, "step": 841 }, { "epoch": 0.8027648671195329, "grad_norm": 19.321897269827105, "learning_rate": 9.172691654299712e-06, "loss": 3.7317, "step": 842 }, { "epoch": 0.8037182695745442, "grad_norm": 25.84996468611731, "learning_rate": 9.169633190409542e-06, "loss": 3.4048, "step": 843 }, { "epoch": 0.8046716720295555, "grad_norm": 20.22607005854188, "learning_rate": 9.166569595412576e-06, "loss": 3.6014, "step": 844 }, { "epoch": 0.8056250744845668, "grad_norm": 20.06411589841278, "learning_rate": 9.163500873078842e-06, "loss": 3.6457, "step": 845 }, { "epoch": 0.8065784769395781, "grad_norm": 16.127280919829985, "learning_rate": 9.160427027184677e-06, "loss": 3.8747, "step": 846 }, { "epoch": 0.8075318793945895, "grad_norm": 21.591437637711838, "learning_rate": 9.157348061512728e-06, "loss": 3.7583, "step": 847 }, { "epoch": 0.8084852818496008, "grad_norm": 18.304095725045485, "learning_rate": 9.154263979851932e-06, "loss": 3.5297, "step": 848 }, { "epoch": 0.8094386843046121, "grad_norm": 16.841495027255135, "learning_rate": 9.151174785997527e-06, "loss": 3.4157, "step": 849 }, { "epoch": 0.8103920867596234, "grad_norm": 21.255759145776402, "learning_rate": 9.14808048375105e-06, "loss": 3.8601, "step": 850 }, { "epoch": 0.8113454892146347, "grad_norm": 18.329744927110355, "learning_rate": 9.144981076920308e-06, "loss": 3.5818, "step": 851 }, { "epoch": 0.8122988916696461, "grad_norm": 20.384310345603943, "learning_rate": 9.141876569319405e-06, "loss": 3.4998, "step": 852 }, { "epoch": 0.8132522941246574, "grad_norm": 17.38023391129369, "learning_rate": 9.138766964768711e-06, "loss": 3.6461, "step": 853 }, { "epoch": 0.8142056965796687, "grad_norm": 17.532203727232996, "learning_rate": 9.135652267094877e-06, "loss": 3.6754, "step": 854 }, { "epoch": 0.81515909903468, "grad_norm": 24.81889937031987, "learning_rate": 9.132532480130813e-06, "loss": 3.5618, "step": 855 }, { "epoch": 0.8161125014896914, "grad_norm": 18.93479043657124, "learning_rate": 9.129407607715697e-06, "loss": 3.8351, "step": 856 }, { "epoch": 0.8170659039447027, "grad_norm": 18.495180507417132, "learning_rate": 9.126277653694966e-06, "loss": 3.8603, "step": 857 }, { "epoch": 0.818019306399714, "grad_norm": 21.691372961700775, "learning_rate": 9.123142621920308e-06, "loss": 3.4601, "step": 858 }, { "epoch": 0.8189727088547253, "grad_norm": 18.25549165925455, "learning_rate": 9.12000251624966e-06, "loss": 3.6172, "step": 859 }, { "epoch": 0.8199261113097366, "grad_norm": 19.588708242000653, "learning_rate": 9.116857340547203e-06, "loss": 3.5112, "step": 860 }, { "epoch": 0.820879513764748, "grad_norm": 18.972375729688313, "learning_rate": 9.113707098683358e-06, "loss": 3.7133, "step": 861 }, { "epoch": 0.8218329162197593, "grad_norm": 19.92727995953765, "learning_rate": 9.110551794534777e-06, "loss": 3.7265, "step": 862 }, { "epoch": 0.8227863186747706, "grad_norm": 19.14245945043222, "learning_rate": 9.107391431984346e-06, "loss": 3.7458, "step": 863 }, { "epoch": 0.8237397211297819, "grad_norm": 23.48379035023014, "learning_rate": 9.104226014921171e-06, "loss": 3.629, "step": 864 }, { "epoch": 0.8246931235847932, "grad_norm": 18.306599031841184, "learning_rate": 9.101055547240588e-06, "loss": 3.6758, "step": 865 }, { "epoch": 0.8256465260398046, "grad_norm": 17.390126544190778, "learning_rate": 9.097880032844133e-06, "loss": 3.9327, "step": 866 }, { "epoch": 0.8265999284948159, "grad_norm": 16.91420050467237, "learning_rate": 9.094699475639566e-06, "loss": 3.3931, "step": 867 }, { "epoch": 0.8275533309498272, "grad_norm": 18.62211073385219, "learning_rate": 9.091513879540845e-06, "loss": 3.8815, "step": 868 }, { "epoch": 0.8285067334048385, "grad_norm": 16.957705082171042, "learning_rate": 9.088323248468133e-06, "loss": 3.4733, "step": 869 }, { "epoch": 0.8294601358598498, "grad_norm": 24.990327640827246, "learning_rate": 9.085127586347786e-06, "loss": 3.8081, "step": 870 }, { "epoch": 0.8304135383148612, "grad_norm": 23.10048940979399, "learning_rate": 9.081926897112352e-06, "loss": 3.7937, "step": 871 }, { "epoch": 0.8313669407698725, "grad_norm": 23.192375755054698, "learning_rate": 9.078721184700565e-06, "loss": 3.4718, "step": 872 }, { "epoch": 0.8323203432248838, "grad_norm": 18.774906012497656, "learning_rate": 9.075510453057341e-06, "loss": 3.3801, "step": 873 }, { "epoch": 0.8332737456798951, "grad_norm": 25.102789864717007, "learning_rate": 9.072294706133775e-06, "loss": 3.5229, "step": 874 }, { "epoch": 0.8342271481349064, "grad_norm": 18.7384971207806, "learning_rate": 9.06907394788713e-06, "loss": 3.7157, "step": 875 }, { "epoch": 0.8351805505899178, "grad_norm": 21.84505327538972, "learning_rate": 9.065848182280835e-06, "loss": 3.7272, "step": 876 }, { "epoch": 0.8361339530449291, "grad_norm": 15.62584622183954, "learning_rate": 9.062617413284485e-06, "loss": 3.6383, "step": 877 }, { "epoch": 0.8370873554999404, "grad_norm": 21.397526272987246, "learning_rate": 9.059381644873833e-06, "loss": 3.9366, "step": 878 }, { "epoch": 0.8380407579549517, "grad_norm": 21.970713616607654, "learning_rate": 9.056140881030777e-06, "loss": 3.8779, "step": 879 }, { "epoch": 0.8389941604099631, "grad_norm": 21.42553138719002, "learning_rate": 9.05289512574337e-06, "loss": 3.6095, "step": 880 }, { "epoch": 0.8399475628649744, "grad_norm": 17.696524021313888, "learning_rate": 9.049644383005804e-06, "loss": 3.621, "step": 881 }, { "epoch": 0.8409009653199857, "grad_norm": 15.110946447415424, "learning_rate": 9.046388656818406e-06, "loss": 3.4429, "step": 882 }, { "epoch": 0.841854367774997, "grad_norm": 15.580551503757194, "learning_rate": 9.043127951187644e-06, "loss": 3.5141, "step": 883 }, { "epoch": 0.8428077702300083, "grad_norm": 16.756922362456773, "learning_rate": 9.039862270126102e-06, "loss": 3.6628, "step": 884 }, { "epoch": 0.8437611726850197, "grad_norm": 15.14747983401258, "learning_rate": 9.036591617652498e-06, "loss": 3.5985, "step": 885 }, { "epoch": 0.844714575140031, "grad_norm": 17.08375156695288, "learning_rate": 9.033315997791659e-06, "loss": 3.3104, "step": 886 }, { "epoch": 0.8456679775950423, "grad_norm": 18.4560992104435, "learning_rate": 9.030035414574529e-06, "loss": 3.4751, "step": 887 }, { "epoch": 0.8466213800500536, "grad_norm": 17.910665337433727, "learning_rate": 9.026749872038161e-06, "loss": 3.8139, "step": 888 }, { "epoch": 0.8475747825050649, "grad_norm": 21.384016660969664, "learning_rate": 9.02345937422571e-06, "loss": 3.8083, "step": 889 }, { "epoch": 0.8485281849600763, "grad_norm": 20.218932601782168, "learning_rate": 9.020163925186423e-06, "loss": 3.5463, "step": 890 }, { "epoch": 0.8494815874150876, "grad_norm": 21.307782503087065, "learning_rate": 9.016863528975647e-06, "loss": 3.7422, "step": 891 }, { "epoch": 0.8504349898700989, "grad_norm": 14.853402285351354, "learning_rate": 9.013558189654819e-06, "loss": 3.4842, "step": 892 }, { "epoch": 0.8513883923251102, "grad_norm": 20.592003141666297, "learning_rate": 9.01024791129145e-06, "loss": 3.7965, "step": 893 }, { "epoch": 0.8523417947801215, "grad_norm": 21.272339439055884, "learning_rate": 9.006932697959136e-06, "loss": 3.811, "step": 894 }, { "epoch": 0.8532951972351329, "grad_norm": 18.12610077968521, "learning_rate": 9.003612553737544e-06, "loss": 3.576, "step": 895 }, { "epoch": 0.8542485996901442, "grad_norm": 17.895188591092086, "learning_rate": 9.000287482712407e-06, "loss": 3.5145, "step": 896 }, { "epoch": 0.8552020021451555, "grad_norm": 17.57771636126358, "learning_rate": 8.996957488975523e-06, "loss": 3.5767, "step": 897 }, { "epoch": 0.8561554046001668, "grad_norm": 20.42214181776099, "learning_rate": 8.993622576624748e-06, "loss": 3.7169, "step": 898 }, { "epoch": 0.8571088070551781, "grad_norm": 20.361276106029784, "learning_rate": 8.99028274976399e-06, "loss": 3.5764, "step": 899 }, { "epoch": 0.8580622095101895, "grad_norm": 25.565375721222907, "learning_rate": 8.986938012503203e-06, "loss": 3.6594, "step": 900 }, { "epoch": 0.8590156119652008, "grad_norm": 20.903732528550115, "learning_rate": 8.983588368958387e-06, "loss": 3.8108, "step": 901 }, { "epoch": 0.8599690144202121, "grad_norm": 19.936098444737635, "learning_rate": 8.98023382325158e-06, "loss": 3.5588, "step": 902 }, { "epoch": 0.8609224168752234, "grad_norm": 19.82451634182694, "learning_rate": 8.976874379510848e-06, "loss": 3.514, "step": 903 }, { "epoch": 0.8618758193302348, "grad_norm": 20.62451215241615, "learning_rate": 8.973510041870287e-06, "loss": 3.8869, "step": 904 }, { "epoch": 0.8628292217852461, "grad_norm": 17.708395611653614, "learning_rate": 8.97014081447002e-06, "loss": 3.782, "step": 905 }, { "epoch": 0.8637826242402574, "grad_norm": 25.222466872819467, "learning_rate": 8.966766701456177e-06, "loss": 3.8438, "step": 906 }, { "epoch": 0.8647360266952687, "grad_norm": 25.298209501038084, "learning_rate": 8.963387706980909e-06, "loss": 3.6744, "step": 907 }, { "epoch": 0.86568942915028, "grad_norm": 18.855866577895025, "learning_rate": 8.960003835202369e-06, "loss": 3.6438, "step": 908 }, { "epoch": 0.8666428316052914, "grad_norm": 19.769853053592456, "learning_rate": 8.956615090284718e-06, "loss": 3.6989, "step": 909 }, { "epoch": 0.8675962340603027, "grad_norm": 20.78701285291881, "learning_rate": 8.953221476398106e-06, "loss": 3.7572, "step": 910 }, { "epoch": 0.868549636515314, "grad_norm": 18.840925048902545, "learning_rate": 8.949822997718681e-06, "loss": 3.6916, "step": 911 }, { "epoch": 0.8695030389703253, "grad_norm": 19.831469706953786, "learning_rate": 8.946419658428573e-06, "loss": 3.7431, "step": 912 }, { "epoch": 0.8704564414253366, "grad_norm": 17.054642815610336, "learning_rate": 8.943011462715899e-06, "loss": 3.4807, "step": 913 }, { "epoch": 0.871409843880348, "grad_norm": 20.04654667391656, "learning_rate": 8.939598414774746e-06, "loss": 3.8886, "step": 914 }, { "epoch": 0.8723632463353593, "grad_norm": 17.144271361565956, "learning_rate": 8.936180518805176e-06, "loss": 3.4119, "step": 915 }, { "epoch": 0.8733166487903706, "grad_norm": 20.93946583606002, "learning_rate": 8.932757779013214e-06, "loss": 3.7485, "step": 916 }, { "epoch": 0.8742700512453819, "grad_norm": 20.12999513134241, "learning_rate": 8.929330199610849e-06, "loss": 3.3793, "step": 917 }, { "epoch": 0.8752234537003932, "grad_norm": 17.546938252593023, "learning_rate": 8.925897784816025e-06, "loss": 3.6357, "step": 918 }, { "epoch": 0.8761768561554046, "grad_norm": 18.562299445130556, "learning_rate": 8.922460538852636e-06, "loss": 3.5974, "step": 919 }, { "epoch": 0.8771302586104159, "grad_norm": 21.494264602412894, "learning_rate": 8.919018465950517e-06, "loss": 3.4119, "step": 920 }, { "epoch": 0.8780836610654272, "grad_norm": 20.55278891304494, "learning_rate": 8.915571570345451e-06, "loss": 3.6142, "step": 921 }, { "epoch": 0.8790370635204385, "grad_norm": 19.33350792984897, "learning_rate": 8.912119856279151e-06, "loss": 3.8107, "step": 922 }, { "epoch": 0.8799904659754498, "grad_norm": 17.626998411557523, "learning_rate": 8.908663327999259e-06, "loss": 3.5774, "step": 923 }, { "epoch": 0.8809438684304612, "grad_norm": 21.449441903918064, "learning_rate": 8.90520198975934e-06, "loss": 3.8879, "step": 924 }, { "epoch": 0.8818972708854725, "grad_norm": 19.946924146365436, "learning_rate": 8.901735845818885e-06, "loss": 3.3077, "step": 925 }, { "epoch": 0.8828506733404838, "grad_norm": 17.754713494063974, "learning_rate": 8.898264900443291e-06, "loss": 3.5193, "step": 926 }, { "epoch": 0.8838040757954951, "grad_norm": 20.374970027658645, "learning_rate": 8.89478915790387e-06, "loss": 3.7456, "step": 927 }, { "epoch": 0.8847574782505065, "grad_norm": 19.085288978552402, "learning_rate": 8.89130862247783e-06, "loss": 3.6686, "step": 928 }, { "epoch": 0.8857108807055178, "grad_norm": 20.69133014247932, "learning_rate": 8.887823298448286e-06, "loss": 3.5233, "step": 929 }, { "epoch": 0.8866642831605291, "grad_norm": 21.96686887393393, "learning_rate": 8.884333190104237e-06, "loss": 3.7198, "step": 930 }, { "epoch": 0.8876176856155404, "grad_norm": 19.76683945666241, "learning_rate": 8.880838301740575e-06, "loss": 3.6551, "step": 931 }, { "epoch": 0.8885710880705517, "grad_norm": 20.755236826551258, "learning_rate": 8.877338637658074e-06, "loss": 3.7101, "step": 932 }, { "epoch": 0.8895244905255631, "grad_norm": 19.72944113913355, "learning_rate": 8.873834202163386e-06, "loss": 3.7599, "step": 933 }, { "epoch": 0.8904778929805744, "grad_norm": 22.320200769846455, "learning_rate": 8.870324999569025e-06, "loss": 3.6616, "step": 934 }, { "epoch": 0.8914312954355857, "grad_norm": 20.461367643712205, "learning_rate": 8.866811034193386e-06, "loss": 3.7872, "step": 935 }, { "epoch": 0.892384697890597, "grad_norm": 19.32970167788679, "learning_rate": 8.863292310360716e-06, "loss": 3.5581, "step": 936 }, { "epoch": 0.8933381003456083, "grad_norm": 18.294730876728273, "learning_rate": 8.859768832401117e-06, "loss": 3.7361, "step": 937 }, { "epoch": 0.8942915028006198, "grad_norm": 17.824949314211594, "learning_rate": 8.856240604650547e-06, "loss": 3.5735, "step": 938 }, { "epoch": 0.895244905255631, "grad_norm": 22.288807929641262, "learning_rate": 8.852707631450807e-06, "loss": 3.8671, "step": 939 }, { "epoch": 0.8961983077106424, "grad_norm": 20.21154331561062, "learning_rate": 8.849169917149532e-06, "loss": 3.559, "step": 940 }, { "epoch": 0.8971517101656536, "grad_norm": 18.58017288631965, "learning_rate": 8.845627466100199e-06, "loss": 3.6887, "step": 941 }, { "epoch": 0.898105112620665, "grad_norm": 21.106611878243918, "learning_rate": 8.842080282662111e-06, "loss": 3.7499, "step": 942 }, { "epoch": 0.8990585150756764, "grad_norm": 18.533937594260124, "learning_rate": 8.838528371200395e-06, "loss": 3.4706, "step": 943 }, { "epoch": 0.9000119175306877, "grad_norm": 19.529438482191683, "learning_rate": 8.834971736085995e-06, "loss": 3.7092, "step": 944 }, { "epoch": 0.900965319985699, "grad_norm": 17.192103009946592, "learning_rate": 8.831410381695669e-06, "loss": 3.9297, "step": 945 }, { "epoch": 0.9019187224407103, "grad_norm": 19.942216954995946, "learning_rate": 8.827844312411984e-06, "loss": 3.6172, "step": 946 }, { "epoch": 0.9028721248957217, "grad_norm": 18.964491420112143, "learning_rate": 8.824273532623305e-06, "loss": 3.6857, "step": 947 }, { "epoch": 0.903825527350733, "grad_norm": 19.39437154148688, "learning_rate": 8.820698046723796e-06, "loss": 3.6747, "step": 948 }, { "epoch": 0.9047789298057443, "grad_norm": 16.795114931027907, "learning_rate": 8.817117859113413e-06, "loss": 3.7027, "step": 949 }, { "epoch": 0.9057323322607556, "grad_norm": 23.720473767646006, "learning_rate": 8.813532974197897e-06, "loss": 3.765, "step": 950 }, { "epoch": 0.9066857347157669, "grad_norm": 21.229029565472164, "learning_rate": 8.809943396388774e-06, "loss": 3.6105, "step": 951 }, { "epoch": 0.9076391371707783, "grad_norm": 18.958695115743637, "learning_rate": 8.806349130103334e-06, "loss": 3.4158, "step": 952 }, { "epoch": 0.9085925396257896, "grad_norm": 18.924918174366262, "learning_rate": 8.802750179764647e-06, "loss": 3.3701, "step": 953 }, { "epoch": 0.9095459420808009, "grad_norm": 17.953486109274678, "learning_rate": 8.799146549801543e-06, "loss": 3.6863, "step": 954 }, { "epoch": 0.9104993445358122, "grad_norm": 23.556878826070232, "learning_rate": 8.79553824464861e-06, "loss": 3.7458, "step": 955 }, { "epoch": 0.9114527469908235, "grad_norm": 20.116177361702935, "learning_rate": 8.791925268746193e-06, "loss": 3.7151, "step": 956 }, { "epoch": 0.9124061494458349, "grad_norm": 17.830458176543026, "learning_rate": 8.78830762654038e-06, "loss": 3.7047, "step": 957 }, { "epoch": 0.9133595519008462, "grad_norm": 23.984891337270245, "learning_rate": 8.784685322483003e-06, "loss": 3.8279, "step": 958 }, { "epoch": 0.9143129543558575, "grad_norm": 18.039973029222814, "learning_rate": 8.781058361031634e-06, "loss": 3.6834, "step": 959 }, { "epoch": 0.9152663568108688, "grad_norm": 16.5729750383268, "learning_rate": 8.777426746649571e-06, "loss": 3.6143, "step": 960 }, { "epoch": 0.9162197592658801, "grad_norm": 22.19549976704451, "learning_rate": 8.773790483805843e-06, "loss": 3.6394, "step": 961 }, { "epoch": 0.9171731617208915, "grad_norm": 20.033611495104065, "learning_rate": 8.770149576975193e-06, "loss": 3.8049, "step": 962 }, { "epoch": 0.9181265641759028, "grad_norm": 18.678891761754763, "learning_rate": 8.766504030638087e-06, "loss": 3.8454, "step": 963 }, { "epoch": 0.9190799666309141, "grad_norm": 20.1308796771559, "learning_rate": 8.762853849280692e-06, "loss": 3.5212, "step": 964 }, { "epoch": 0.9200333690859254, "grad_norm": 24.384666848897997, "learning_rate": 8.759199037394888e-06, "loss": 3.8191, "step": 965 }, { "epoch": 0.9209867715409367, "grad_norm": 20.258346364851302, "learning_rate": 8.755539599478244e-06, "loss": 3.6554, "step": 966 }, { "epoch": 0.9219401739959481, "grad_norm": 19.611557435163274, "learning_rate": 8.751875540034026e-06, "loss": 3.6066, "step": 967 }, { "epoch": 0.9228935764509594, "grad_norm": 23.620770099147684, "learning_rate": 8.748206863571188e-06, "loss": 3.7239, "step": 968 }, { "epoch": 0.9238469789059707, "grad_norm": 26.387777353887813, "learning_rate": 8.744533574604366e-06, "loss": 3.8567, "step": 969 }, { "epoch": 0.924800381360982, "grad_norm": 22.032456564218077, "learning_rate": 8.740855677653868e-06, "loss": 3.6308, "step": 970 }, { "epoch": 0.9257537838159934, "grad_norm": 18.01985020820172, "learning_rate": 8.737173177245677e-06, "loss": 3.4782, "step": 971 }, { "epoch": 0.9267071862710047, "grad_norm": 18.11859796779658, "learning_rate": 8.73348607791144e-06, "loss": 3.6122, "step": 972 }, { "epoch": 0.927660588726016, "grad_norm": 19.087721286593602, "learning_rate": 8.729794384188462e-06, "loss": 3.5161, "step": 973 }, { "epoch": 0.9286139911810273, "grad_norm": 20.49218256907962, "learning_rate": 8.7260981006197e-06, "loss": 3.544, "step": 974 }, { "epoch": 0.9295673936360386, "grad_norm": 17.56617585184201, "learning_rate": 8.722397231753766e-06, "loss": 3.4666, "step": 975 }, { "epoch": 0.93052079609105, "grad_norm": 19.764551186618498, "learning_rate": 8.718691782144908e-06, "loss": 3.6722, "step": 976 }, { "epoch": 0.9314741985460613, "grad_norm": 21.67331896078188, "learning_rate": 8.714981756353014e-06, "loss": 3.7865, "step": 977 }, { "epoch": 0.9324276010010726, "grad_norm": 21.07031903640165, "learning_rate": 8.711267158943602e-06, "loss": 3.815, "step": 978 }, { "epoch": 0.9333810034560839, "grad_norm": 17.920267251181368, "learning_rate": 8.707547994487818e-06, "loss": 3.6379, "step": 979 }, { "epoch": 0.9343344059110952, "grad_norm": 17.90392175948924, "learning_rate": 8.703824267562424e-06, "loss": 3.5538, "step": 980 }, { "epoch": 0.9352878083661066, "grad_norm": 20.930960486464688, "learning_rate": 8.700095982749804e-06, "loss": 3.6125, "step": 981 }, { "epoch": 0.9362412108211179, "grad_norm": 18.12456435632318, "learning_rate": 8.69636314463794e-06, "loss": 3.638, "step": 982 }, { "epoch": 0.9371946132761292, "grad_norm": 25.562630540798175, "learning_rate": 8.692625757820429e-06, "loss": 3.8365, "step": 983 }, { "epoch": 0.9381480157311405, "grad_norm": 20.66942575189988, "learning_rate": 8.688883826896458e-06, "loss": 3.5041, "step": 984 }, { "epoch": 0.9391014181861518, "grad_norm": 17.31125263563036, "learning_rate": 8.685137356470803e-06, "loss": 3.6745, "step": 985 }, { "epoch": 0.9400548206411632, "grad_norm": 20.253726501277384, "learning_rate": 8.681386351153839e-06, "loss": 3.7334, "step": 986 }, { "epoch": 0.9410082230961745, "grad_norm": 23.887277494024787, "learning_rate": 8.67763081556151e-06, "loss": 3.6174, "step": 987 }, { "epoch": 0.9419616255511858, "grad_norm": 23.095259308136956, "learning_rate": 8.673870754315336e-06, "loss": 3.5274, "step": 988 }, { "epoch": 0.9429150280061971, "grad_norm": 18.233687421712432, "learning_rate": 8.670106172042415e-06, "loss": 3.6983, "step": 989 }, { "epoch": 0.9438684304612084, "grad_norm": 16.911985565552985, "learning_rate": 8.666337073375398e-06, "loss": 3.8184, "step": 990 }, { "epoch": 0.9448218329162198, "grad_norm": 18.733257436945514, "learning_rate": 8.6625634629525e-06, "loss": 3.6425, "step": 991 }, { "epoch": 0.9457752353712311, "grad_norm": 19.661809132941507, "learning_rate": 8.658785345417484e-06, "loss": 3.7597, "step": 992 }, { "epoch": 0.9467286378262424, "grad_norm": 21.427327006785724, "learning_rate": 8.655002725419663e-06, "loss": 3.8327, "step": 993 }, { "epoch": 0.9476820402812537, "grad_norm": 19.030098689595334, "learning_rate": 8.651215607613891e-06, "loss": 3.6147, "step": 994 }, { "epoch": 0.9486354427362651, "grad_norm": 21.34256773050582, "learning_rate": 8.647423996660556e-06, "loss": 3.668, "step": 995 }, { "epoch": 0.9495888451912764, "grad_norm": 18.72160577365826, "learning_rate": 8.64362789722557e-06, "loss": 3.6841, "step": 996 }, { "epoch": 0.9505422476462877, "grad_norm": 25.582679673453995, "learning_rate": 8.639827313980377e-06, "loss": 3.4846, "step": 997 }, { "epoch": 0.951495650101299, "grad_norm": 19.337028625372522, "learning_rate": 8.636022251601935e-06, "loss": 3.6144, "step": 998 }, { "epoch": 0.9524490525563103, "grad_norm": 18.797426745424776, "learning_rate": 8.63221271477271e-06, "loss": 3.8351, "step": 999 }, { "epoch": 0.9534024550113217, "grad_norm": 18.0693312248646, "learning_rate": 8.62839870818068e-06, "loss": 3.7506, "step": 1000 }, { "epoch": 0.954355857466333, "grad_norm": 19.931857415947523, "learning_rate": 8.624580236519325e-06, "loss": 3.6913, "step": 1001 }, { "epoch": 0.9553092599213443, "grad_norm": 20.613536131728605, "learning_rate": 8.620757304487612e-06, "loss": 3.6418, "step": 1002 }, { "epoch": 0.9562626623763556, "grad_norm": 17.642286551732212, "learning_rate": 8.616929916790002e-06, "loss": 3.6952, "step": 1003 }, { "epoch": 0.9572160648313669, "grad_norm": 18.5869215379394, "learning_rate": 8.613098078136436e-06, "loss": 3.735, "step": 1004 }, { "epoch": 0.9581694672863783, "grad_norm": 18.900633122736462, "learning_rate": 8.60926179324234e-06, "loss": 3.8392, "step": 1005 }, { "epoch": 0.9591228697413896, "grad_norm": 18.350270460924044, "learning_rate": 8.605421066828599e-06, "loss": 3.5951, "step": 1006 }, { "epoch": 0.9600762721964009, "grad_norm": 15.58092847930561, "learning_rate": 8.601575903621576e-06, "loss": 3.6846, "step": 1007 }, { "epoch": 0.9610296746514122, "grad_norm": 26.123629225888315, "learning_rate": 8.597726308353085e-06, "loss": 3.7103, "step": 1008 }, { "epoch": 0.9619830771064235, "grad_norm": 20.98502541539002, "learning_rate": 8.593872285760401e-06, "loss": 3.7758, "step": 1009 }, { "epoch": 0.9629364795614349, "grad_norm": 17.431377443913906, "learning_rate": 8.59001384058624e-06, "loss": 3.895, "step": 1010 }, { "epoch": 0.9638898820164462, "grad_norm": 17.445271952264054, "learning_rate": 8.586150977578765e-06, "loss": 3.5972, "step": 1011 }, { "epoch": 0.9648432844714575, "grad_norm": 19.10164958584953, "learning_rate": 8.582283701491576e-06, "loss": 3.5281, "step": 1012 }, { "epoch": 0.9657966869264688, "grad_norm": 17.144786257157556, "learning_rate": 8.578412017083701e-06, "loss": 3.7712, "step": 1013 }, { "epoch": 0.9667500893814801, "grad_norm": 19.278234060811414, "learning_rate": 8.574535929119598e-06, "loss": 3.6645, "step": 1014 }, { "epoch": 0.9677034918364915, "grad_norm": 19.19484500147491, "learning_rate": 8.570655442369134e-06, "loss": 3.7363, "step": 1015 }, { "epoch": 0.9686568942915028, "grad_norm": 19.057382200357566, "learning_rate": 8.566770561607598e-06, "loss": 3.7884, "step": 1016 }, { "epoch": 0.9696102967465141, "grad_norm": 21.11125828626934, "learning_rate": 8.562881291615685e-06, "loss": 3.6414, "step": 1017 }, { "epoch": 0.9705636992015254, "grad_norm": 18.463647892745485, "learning_rate": 8.558987637179488e-06, "loss": 3.7156, "step": 1018 }, { "epoch": 0.9715171016565368, "grad_norm": 17.262402870212775, "learning_rate": 8.555089603090496e-06, "loss": 3.6325, "step": 1019 }, { "epoch": 0.9724705041115481, "grad_norm": 19.304574887450972, "learning_rate": 8.551187194145591e-06, "loss": 3.6694, "step": 1020 }, { "epoch": 0.9734239065665594, "grad_norm": 20.274086359882197, "learning_rate": 8.547280415147038e-06, "loss": 3.5451, "step": 1021 }, { "epoch": 0.9743773090215707, "grad_norm": 21.65496390966592, "learning_rate": 8.543369270902474e-06, "loss": 3.6665, "step": 1022 }, { "epoch": 0.975330711476582, "grad_norm": 16.52491835097125, "learning_rate": 8.539453766224915e-06, "loss": 3.7257, "step": 1023 }, { "epoch": 0.9762841139315934, "grad_norm": 18.52328432370933, "learning_rate": 8.535533905932739e-06, "loss": 3.9279, "step": 1024 }, { "epoch": 0.9772375163866047, "grad_norm": 20.688807061145084, "learning_rate": 8.531609694849683e-06, "loss": 3.6398, "step": 1025 }, { "epoch": 0.978190918841616, "grad_norm": 20.270117623791293, "learning_rate": 8.527681137804843e-06, "loss": 3.6387, "step": 1026 }, { "epoch": 0.9791443212966273, "grad_norm": 20.639801960198373, "learning_rate": 8.52374823963266e-06, "loss": 4.0619, "step": 1027 }, { "epoch": 0.9800977237516386, "grad_norm": 17.8805969367878, "learning_rate": 8.519811005172916e-06, "loss": 3.8255, "step": 1028 }, { "epoch": 0.98105112620665, "grad_norm": 18.823504093547815, "learning_rate": 8.51586943927073e-06, "loss": 3.6382, "step": 1029 }, { "epoch": 0.9820045286616613, "grad_norm": 19.084929418344117, "learning_rate": 8.51192354677655e-06, "loss": 3.676, "step": 1030 }, { "epoch": 0.9829579311166726, "grad_norm": 17.223560347711267, "learning_rate": 8.507973332546153e-06, "loss": 3.4917, "step": 1031 }, { "epoch": 0.9839113335716839, "grad_norm": 19.54761920405057, "learning_rate": 8.50401880144063e-06, "loss": 3.7394, "step": 1032 }, { "epoch": 0.9848647360266952, "grad_norm": 25.759979021911285, "learning_rate": 8.500059958326385e-06, "loss": 3.7637, "step": 1033 }, { "epoch": 0.9858181384817066, "grad_norm": 18.587241616944937, "learning_rate": 8.49609680807513e-06, "loss": 3.701, "step": 1034 }, { "epoch": 0.9867715409367179, "grad_norm": 17.472887614237933, "learning_rate": 8.492129355563874e-06, "loss": 3.6457, "step": 1035 }, { "epoch": 0.9877249433917292, "grad_norm": 19.57681166168716, "learning_rate": 8.488157605674924e-06, "loss": 3.8844, "step": 1036 }, { "epoch": 0.9886783458467405, "grad_norm": 14.816385679357204, "learning_rate": 8.484181563295874e-06, "loss": 3.2408, "step": 1037 }, { "epoch": 0.9896317483017518, "grad_norm": 16.90906857223358, "learning_rate": 8.480201233319598e-06, "loss": 3.6393, "step": 1038 }, { "epoch": 0.9905851507567632, "grad_norm": 18.344857094203686, "learning_rate": 8.476216620644252e-06, "loss": 3.5517, "step": 1039 }, { "epoch": 0.9915385532117745, "grad_norm": 28.23874094169735, "learning_rate": 8.472227730173252e-06, "loss": 3.828, "step": 1040 }, { "epoch": 0.9924919556667858, "grad_norm": 16.663981942509405, "learning_rate": 8.46823456681529e-06, "loss": 3.5243, "step": 1041 }, { "epoch": 0.9934453581217971, "grad_norm": 24.067104042664898, "learning_rate": 8.46423713548431e-06, "loss": 3.738, "step": 1042 }, { "epoch": 0.9943987605768085, "grad_norm": 17.13215397488891, "learning_rate": 8.460235441099509e-06, "loss": 3.7161, "step": 1043 }, { "epoch": 0.9953521630318198, "grad_norm": 17.416151240057, "learning_rate": 8.456229488585328e-06, "loss": 3.6706, "step": 1044 }, { "epoch": 0.9963055654868311, "grad_norm": 18.565565178413507, "learning_rate": 8.452219282871452e-06, "loss": 3.5367, "step": 1045 }, { "epoch": 0.9972589679418424, "grad_norm": 18.727698542322138, "learning_rate": 8.448204828892799e-06, "loss": 3.784, "step": 1046 }, { "epoch": 0.9982123703968537, "grad_norm": 14.384949494486222, "learning_rate": 8.44418613158951e-06, "loss": 3.6971, "step": 1047 }, { "epoch": 0.9991657728518651, "grad_norm": 16.060832946424547, "learning_rate": 8.440163195906959e-06, "loss": 3.7596, "step": 1048 }, { "epoch": 1.0, "grad_norm": 16.060832946424547, "learning_rate": 8.436136026795718e-06, "loss": 3.7239, "step": 1049 }, { "epoch": 1.0009534024550113, "grad_norm": 18.22179789730615, "learning_rate": 8.432104629211589e-06, "loss": 3.0804, "step": 1050 }, { "epoch": 1.0019068049100226, "grad_norm": 19.439940414940658, "learning_rate": 8.42806900811556e-06, "loss": 3.0095, "step": 1051 }, { "epoch": 1.002860207365034, "grad_norm": 19.688214450172996, "learning_rate": 8.424029168473829e-06, "loss": 3.2907, "step": 1052 }, { "epoch": 1.0038136098200452, "grad_norm": 17.49322475806205, "learning_rate": 8.419985115257777e-06, "loss": 3.2804, "step": 1053 }, { "epoch": 1.0047670122750567, "grad_norm": 15.414509437971413, "learning_rate": 8.415936853443976e-06, "loss": 3.2681, "step": 1054 }, { "epoch": 1.005720414730068, "grad_norm": 22.329186317583424, "learning_rate": 8.411884388014171e-06, "loss": 3.2395, "step": 1055 }, { "epoch": 1.0066738171850793, "grad_norm": 20.119660151445103, "learning_rate": 8.407827723955287e-06, "loss": 3.4185, "step": 1056 }, { "epoch": 1.0076272196400906, "grad_norm": 16.49758162219479, "learning_rate": 8.40376686625941e-06, "loss": 3.4752, "step": 1057 }, { "epoch": 1.008580622095102, "grad_norm": 16.843357367110357, "learning_rate": 8.399701819923791e-06, "loss": 3.0832, "step": 1058 }, { "epoch": 1.0095340245501132, "grad_norm": 17.39306767481526, "learning_rate": 8.39563258995083e-06, "loss": 3.2722, "step": 1059 }, { "epoch": 1.0104874270051245, "grad_norm": 18.740433277849746, "learning_rate": 8.391559181348081e-06, "loss": 3.2877, "step": 1060 }, { "epoch": 1.0114408294601358, "grad_norm": 17.295412727074396, "learning_rate": 8.387481599128237e-06, "loss": 3.0154, "step": 1061 }, { "epoch": 1.012394231915147, "grad_norm": 18.53980839945021, "learning_rate": 8.383399848309128e-06, "loss": 3.058, "step": 1062 }, { "epoch": 1.0133476343701584, "grad_norm": 18.421809835383396, "learning_rate": 8.379313933913715e-06, "loss": 3.3334, "step": 1063 }, { "epoch": 1.01430103682517, "grad_norm": 18.07036648657408, "learning_rate": 8.375223860970078e-06, "loss": 3.1606, "step": 1064 }, { "epoch": 1.0152544392801812, "grad_norm": 19.59576884973211, "learning_rate": 8.371129634511423e-06, "loss": 3.2684, "step": 1065 }, { "epoch": 1.0162078417351925, "grad_norm": 20.31546063326952, "learning_rate": 8.367031259576057e-06, "loss": 3.3106, "step": 1066 }, { "epoch": 1.0171612441902038, "grad_norm": 23.590117946139028, "learning_rate": 8.3629287412074e-06, "loss": 3.3808, "step": 1067 }, { "epoch": 1.0181146466452151, "grad_norm": 18.286147309063125, "learning_rate": 8.358822084453964e-06, "loss": 3.2253, "step": 1068 }, { "epoch": 1.0190680491002264, "grad_norm": 24.154431205828267, "learning_rate": 8.354711294369363e-06, "loss": 3.0219, "step": 1069 }, { "epoch": 1.0200214515552377, "grad_norm": 19.26384695275529, "learning_rate": 8.35059637601229e-06, "loss": 3.1146, "step": 1070 }, { "epoch": 1.020974854010249, "grad_norm": 20.1914447075837, "learning_rate": 8.346477334446522e-06, "loss": 3.0302, "step": 1071 }, { "epoch": 1.0219282564652603, "grad_norm": 18.833197162650645, "learning_rate": 8.342354174740904e-06, "loss": 3.3289, "step": 1072 }, { "epoch": 1.0228816589202716, "grad_norm": 17.886608378413964, "learning_rate": 8.338226901969355e-06, "loss": 3.1086, "step": 1073 }, { "epoch": 1.0238350613752831, "grad_norm": 19.398163832241387, "learning_rate": 8.334095521210855e-06, "loss": 3.2295, "step": 1074 }, { "epoch": 1.0247884638302944, "grad_norm": 19.567088726752818, "learning_rate": 8.329960037549433e-06, "loss": 3.0205, "step": 1075 }, { "epoch": 1.0257418662853057, "grad_norm": 16.88539655526215, "learning_rate": 8.325820456074181e-06, "loss": 2.8962, "step": 1076 }, { "epoch": 1.026695268740317, "grad_norm": 17.291909937676326, "learning_rate": 8.321676781879215e-06, "loss": 3.0265, "step": 1077 }, { "epoch": 1.0276486711953283, "grad_norm": 19.710784191794247, "learning_rate": 8.317529020063704e-06, "loss": 3.1158, "step": 1078 }, { "epoch": 1.0286020736503396, "grad_norm": 19.349907244865797, "learning_rate": 8.313377175731835e-06, "loss": 3.0528, "step": 1079 }, { "epoch": 1.029555476105351, "grad_norm": 22.33109079627624, "learning_rate": 8.309221253992825e-06, "loss": 3.1295, "step": 1080 }, { "epoch": 1.0305088785603622, "grad_norm": 20.34505661749425, "learning_rate": 8.30506125996091e-06, "loss": 2.9751, "step": 1081 }, { "epoch": 1.0314622810153735, "grad_norm": 18.071016893539525, "learning_rate": 8.300897198755333e-06, "loss": 2.9169, "step": 1082 }, { "epoch": 1.032415683470385, "grad_norm": 15.582360772584865, "learning_rate": 8.296729075500345e-06, "loss": 2.9667, "step": 1083 }, { "epoch": 1.0333690859253963, "grad_norm": 22.76599475729743, "learning_rate": 8.292556895325195e-06, "loss": 3.0293, "step": 1084 }, { "epoch": 1.0343224883804076, "grad_norm": 18.178325638649316, "learning_rate": 8.288380663364122e-06, "loss": 3.1362, "step": 1085 }, { "epoch": 1.035275890835419, "grad_norm": 24.561007823180812, "learning_rate": 8.284200384756354e-06, "loss": 3.2759, "step": 1086 }, { "epoch": 1.0362292932904302, "grad_norm": 19.521112030168293, "learning_rate": 8.280016064646099e-06, "loss": 3.1598, "step": 1087 }, { "epoch": 1.0371826957454415, "grad_norm": 20.409562655593124, "learning_rate": 8.275827708182536e-06, "loss": 3.131, "step": 1088 }, { "epoch": 1.0381360982004528, "grad_norm": 19.982311894219354, "learning_rate": 8.271635320519815e-06, "loss": 3.1446, "step": 1089 }, { "epoch": 1.0390895006554641, "grad_norm": 19.28066821799892, "learning_rate": 8.26743890681704e-06, "loss": 3.0644, "step": 1090 }, { "epoch": 1.0400429031104754, "grad_norm": 20.949351814351704, "learning_rate": 8.263238472238278e-06, "loss": 3.0954, "step": 1091 }, { "epoch": 1.0409963055654867, "grad_norm": 20.531749658698253, "learning_rate": 8.259034021952537e-06, "loss": 3.1093, "step": 1092 }, { "epoch": 1.0419497080204982, "grad_norm": 19.88864937096013, "learning_rate": 8.25482556113377e-06, "loss": 3.2334, "step": 1093 }, { "epoch": 1.0429031104755095, "grad_norm": 19.968178431640183, "learning_rate": 8.250613094960865e-06, "loss": 2.9953, "step": 1094 }, { "epoch": 1.0438565129305208, "grad_norm": 17.76354070397676, "learning_rate": 8.246396628617637e-06, "loss": 3.2141, "step": 1095 }, { "epoch": 1.0448099153855321, "grad_norm": 17.77597143265063, "learning_rate": 8.242176167292827e-06, "loss": 2.8239, "step": 1096 }, { "epoch": 1.0457633178405434, "grad_norm": 17.424116844734687, "learning_rate": 8.23795171618009e-06, "loss": 3.0997, "step": 1097 }, { "epoch": 1.0467167202955547, "grad_norm": 20.361608200788428, "learning_rate": 8.233723280477991e-06, "loss": 3.1326, "step": 1098 }, { "epoch": 1.047670122750566, "grad_norm": 21.350593460024015, "learning_rate": 8.22949086539e-06, "loss": 3.1039, "step": 1099 }, { "epoch": 1.0486235252055773, "grad_norm": 18.67276258705635, "learning_rate": 8.225254476124479e-06, "loss": 3.0874, "step": 1100 }, { "epoch": 1.0495769276605886, "grad_norm": 20.272110935266447, "learning_rate": 8.221014117894685e-06, "loss": 2.9375, "step": 1101 }, { "epoch": 1.0505303301156002, "grad_norm": 18.514076823276554, "learning_rate": 8.216769795918764e-06, "loss": 3.0979, "step": 1102 }, { "epoch": 1.0514837325706115, "grad_norm": 18.153681528659753, "learning_rate": 8.212521515419727e-06, "loss": 3.2429, "step": 1103 }, { "epoch": 1.0524371350256227, "grad_norm": 17.901924255494585, "learning_rate": 8.208269281625466e-06, "loss": 3.1816, "step": 1104 }, { "epoch": 1.053390537480634, "grad_norm": 21.726531997407466, "learning_rate": 8.204013099768733e-06, "loss": 3.066, "step": 1105 }, { "epoch": 1.0543439399356453, "grad_norm": 23.765271532856726, "learning_rate": 8.199752975087145e-06, "loss": 3.3638, "step": 1106 }, { "epoch": 1.0552973423906566, "grad_norm": 21.077088675189106, "learning_rate": 8.195488912823164e-06, "loss": 3.5593, "step": 1107 }, { "epoch": 1.056250744845668, "grad_norm": 22.868355027671793, "learning_rate": 8.191220918224102e-06, "loss": 3.4213, "step": 1108 }, { "epoch": 1.0572041473006792, "grad_norm": 22.346796634926225, "learning_rate": 8.186948996542105e-06, "loss": 3.2187, "step": 1109 }, { "epoch": 1.0581575497556905, "grad_norm": 20.579476609300063, "learning_rate": 8.182673153034157e-06, "loss": 3.2326, "step": 1110 }, { "epoch": 1.0591109522107018, "grad_norm": 18.05839830669214, "learning_rate": 8.178393392962068e-06, "loss": 3.035, "step": 1111 }, { "epoch": 1.0600643546657134, "grad_norm": 21.589660267066087, "learning_rate": 8.174109721592463e-06, "loss": 3.1206, "step": 1112 }, { "epoch": 1.0610177571207247, "grad_norm": 18.512881504354237, "learning_rate": 8.169822144196781e-06, "loss": 3.099, "step": 1113 }, { "epoch": 1.061971159575736, "grad_norm": 19.654210999457447, "learning_rate": 8.165530666051275e-06, "loss": 3.0882, "step": 1114 }, { "epoch": 1.0629245620307473, "grad_norm": 21.197294653796035, "learning_rate": 8.161235292436992e-06, "loss": 3.137, "step": 1115 }, { "epoch": 1.0638779644857586, "grad_norm": 18.46109814874656, "learning_rate": 8.156936028639768e-06, "loss": 3.3824, "step": 1116 }, { "epoch": 1.0648313669407699, "grad_norm": 20.313533983171755, "learning_rate": 8.152632879950238e-06, "loss": 3.0938, "step": 1117 }, { "epoch": 1.0657847693957812, "grad_norm": 16.850887627024427, "learning_rate": 8.14832585166381e-06, "loss": 3.313, "step": 1118 }, { "epoch": 1.0667381718507924, "grad_norm": 17.440458983421333, "learning_rate": 8.144014949080668e-06, "loss": 3.1289, "step": 1119 }, { "epoch": 1.0676915743058037, "grad_norm": 15.791043979584234, "learning_rate": 8.13970017750576e-06, "loss": 3.1111, "step": 1120 }, { "epoch": 1.0686449767608153, "grad_norm": 18.351758616228544, "learning_rate": 8.135381542248802e-06, "loss": 2.7426, "step": 1121 }, { "epoch": 1.0695983792158266, "grad_norm": 20.518597383109512, "learning_rate": 8.131059048624262e-06, "loss": 2.9792, "step": 1122 }, { "epoch": 1.0705517816708379, "grad_norm": 21.1960907216779, "learning_rate": 8.126732701951352e-06, "loss": 3.1316, "step": 1123 }, { "epoch": 1.0715051841258492, "grad_norm": 17.77277837963605, "learning_rate": 8.12240250755403e-06, "loss": 3.0024, "step": 1124 }, { "epoch": 1.0724585865808605, "grad_norm": 17.07807970544841, "learning_rate": 8.118068470760988e-06, "loss": 3.2107, "step": 1125 }, { "epoch": 1.0734119890358718, "grad_norm": 18.827134047333534, "learning_rate": 8.11373059690565e-06, "loss": 3.1314, "step": 1126 }, { "epoch": 1.074365391490883, "grad_norm": 17.334526654389954, "learning_rate": 8.10938889132615e-06, "loss": 3.0842, "step": 1127 }, { "epoch": 1.0753187939458944, "grad_norm": 20.212908048505025, "learning_rate": 8.10504335936535e-06, "loss": 3.1041, "step": 1128 }, { "epoch": 1.0762721964009057, "grad_norm": 21.25938268803354, "learning_rate": 8.100694006370816e-06, "loss": 3.4349, "step": 1129 }, { "epoch": 1.077225598855917, "grad_norm": 23.398043065743188, "learning_rate": 8.096340837694816e-06, "loss": 3.3599, "step": 1130 }, { "epoch": 1.0781790013109285, "grad_norm": 17.66114850412804, "learning_rate": 8.091983858694314e-06, "loss": 3.1181, "step": 1131 }, { "epoch": 1.0791324037659398, "grad_norm": 19.103508607034613, "learning_rate": 8.08762307473096e-06, "loss": 3.1444, "step": 1132 }, { "epoch": 1.080085806220951, "grad_norm": 18.9186747384087, "learning_rate": 8.08325849117109e-06, "loss": 3.0256, "step": 1133 }, { "epoch": 1.0810392086759624, "grad_norm": 18.82746385213971, "learning_rate": 8.078890113385718e-06, "loss": 3.132, "step": 1134 }, { "epoch": 1.0819926111309737, "grad_norm": 19.781787049223595, "learning_rate": 8.074517946750521e-06, "loss": 3.0467, "step": 1135 }, { "epoch": 1.082946013585985, "grad_norm": 21.789995195323467, "learning_rate": 8.07014199664584e-06, "loss": 3.1212, "step": 1136 }, { "epoch": 1.0838994160409963, "grad_norm": 18.882729958636645, "learning_rate": 8.065762268456677e-06, "loss": 3.3536, "step": 1137 }, { "epoch": 1.0848528184960076, "grad_norm": 20.52236132295884, "learning_rate": 8.061378767572673e-06, "loss": 3.235, "step": 1138 }, { "epoch": 1.0858062209510189, "grad_norm": 20.306839867352345, "learning_rate": 8.056991499388126e-06, "loss": 3.1033, "step": 1139 }, { "epoch": 1.0867596234060302, "grad_norm": 17.835336814745396, "learning_rate": 8.052600469301958e-06, "loss": 3.0892, "step": 1140 }, { "epoch": 1.0877130258610417, "grad_norm": 15.152825682884508, "learning_rate": 8.048205682717724e-06, "loss": 3.1615, "step": 1141 }, { "epoch": 1.088666428316053, "grad_norm": 21.02764497374464, "learning_rate": 8.043807145043604e-06, "loss": 3.0414, "step": 1142 }, { "epoch": 1.0896198307710643, "grad_norm": 16.479718857243572, "learning_rate": 8.039404861692391e-06, "loss": 3.251, "step": 1143 }, { "epoch": 1.0905732332260756, "grad_norm": 21.60005138956664, "learning_rate": 8.03499883808149e-06, "loss": 3.4491, "step": 1144 }, { "epoch": 1.0915266356810869, "grad_norm": 22.317637275185266, "learning_rate": 8.030589079632905e-06, "loss": 3.0812, "step": 1145 }, { "epoch": 1.0924800381360982, "grad_norm": 18.62870563170603, "learning_rate": 8.026175591773239e-06, "loss": 3.1591, "step": 1146 }, { "epoch": 1.0934334405911095, "grad_norm": 18.93907676567074, "learning_rate": 8.021758379933688e-06, "loss": 3.3435, "step": 1147 }, { "epoch": 1.0943868430461208, "grad_norm": 26.689422539590232, "learning_rate": 8.01733744955002e-06, "loss": 3.0941, "step": 1148 }, { "epoch": 1.095340245501132, "grad_norm": 21.144673770558413, "learning_rate": 8.012912806062589e-06, "loss": 3.1461, "step": 1149 }, { "epoch": 1.0962936479561436, "grad_norm": 20.21225670096521, "learning_rate": 8.008484454916316e-06, "loss": 3.1516, "step": 1150 }, { "epoch": 1.097247050411155, "grad_norm": 17.63700007632881, "learning_rate": 8.00405240156068e-06, "loss": 3.3312, "step": 1151 }, { "epoch": 1.0982004528661662, "grad_norm": 21.34336059878273, "learning_rate": 7.999616651449722e-06, "loss": 2.967, "step": 1152 }, { "epoch": 1.0991538553211775, "grad_norm": 19.937778630636508, "learning_rate": 7.99517721004203e-06, "loss": 3.2128, "step": 1153 }, { "epoch": 1.1001072577761888, "grad_norm": 20.26333341118303, "learning_rate": 7.990734082800731e-06, "loss": 2.9921, "step": 1154 }, { "epoch": 1.1010606602312, "grad_norm": 20.12928437731517, "learning_rate": 7.986287275193491e-06, "loss": 2.8982, "step": 1155 }, { "epoch": 1.1020140626862114, "grad_norm": 17.422402278920536, "learning_rate": 7.981836792692508e-06, "loss": 2.9746, "step": 1156 }, { "epoch": 1.1029674651412227, "grad_norm": 16.448183856347395, "learning_rate": 7.977382640774495e-06, "loss": 2.8267, "step": 1157 }, { "epoch": 1.103920867596234, "grad_norm": 21.403320198929553, "learning_rate": 7.97292482492069e-06, "loss": 3.3782, "step": 1158 }, { "epoch": 1.1048742700512455, "grad_norm": 21.603428947162634, "learning_rate": 7.968463350616826e-06, "loss": 3.1298, "step": 1159 }, { "epoch": 1.1058276725062568, "grad_norm": 21.4735738514797, "learning_rate": 7.963998223353154e-06, "loss": 3.1218, "step": 1160 }, { "epoch": 1.106781074961268, "grad_norm": 19.167397751023902, "learning_rate": 7.95952944862441e-06, "loss": 3.1012, "step": 1161 }, { "epoch": 1.1077344774162794, "grad_norm": 20.415910028182562, "learning_rate": 7.95505703192982e-06, "loss": 3.1668, "step": 1162 }, { "epoch": 1.1086878798712907, "grad_norm": 16.483608522200186, "learning_rate": 7.950580978773096e-06, "loss": 3.2523, "step": 1163 }, { "epoch": 1.109641282326302, "grad_norm": 22.55788646047019, "learning_rate": 7.946101294662418e-06, "loss": 3.08, "step": 1164 }, { "epoch": 1.1105946847813133, "grad_norm": 16.945839162739176, "learning_rate": 7.941617985110443e-06, "loss": 3.3092, "step": 1165 }, { "epoch": 1.1115480872363246, "grad_norm": 20.976447369044916, "learning_rate": 7.93713105563428e-06, "loss": 3.4475, "step": 1166 }, { "epoch": 1.1125014896913359, "grad_norm": 22.277719934698922, "learning_rate": 7.9326405117555e-06, "loss": 3.3889, "step": 1167 }, { "epoch": 1.1134548921463472, "grad_norm": 22.614406353863938, "learning_rate": 7.928146359000117e-06, "loss": 2.9586, "step": 1168 }, { "epoch": 1.1144082946013585, "grad_norm": 18.77676436461188, "learning_rate": 7.92364860289859e-06, "loss": 3.1556, "step": 1169 }, { "epoch": 1.11536169705637, "grad_norm": 19.441015052954196, "learning_rate": 7.919147248985811e-06, "loss": 3.3349, "step": 1170 }, { "epoch": 1.1163150995113813, "grad_norm": 19.5467834993246, "learning_rate": 7.914642302801097e-06, "loss": 3.1923, "step": 1171 }, { "epoch": 1.1172685019663926, "grad_norm": 22.158869596704754, "learning_rate": 7.91013376988819e-06, "loss": 2.9784, "step": 1172 }, { "epoch": 1.118221904421404, "grad_norm": 18.434907001572746, "learning_rate": 7.905621655795239e-06, "loss": 3.1225, "step": 1173 }, { "epoch": 1.1191753068764152, "grad_norm": 18.36564561708116, "learning_rate": 7.901105966074807e-06, "loss": 3.188, "step": 1174 }, { "epoch": 1.1201287093314265, "grad_norm": 17.729796775882928, "learning_rate": 7.896586706283856e-06, "loss": 3.0197, "step": 1175 }, { "epoch": 1.1210821117864378, "grad_norm": 17.779934750191288, "learning_rate": 7.892063881983736e-06, "loss": 3.256, "step": 1176 }, { "epoch": 1.122035514241449, "grad_norm": 19.702937668799326, "learning_rate": 7.887537498740187e-06, "loss": 2.9063, "step": 1177 }, { "epoch": 1.1229889166964604, "grad_norm": 15.257137233794793, "learning_rate": 7.883007562123332e-06, "loss": 3.1469, "step": 1178 }, { "epoch": 1.123942319151472, "grad_norm": 17.806804302594518, "learning_rate": 7.878474077707663e-06, "loss": 3.2928, "step": 1179 }, { "epoch": 1.1248957216064832, "grad_norm": 19.545022013960722, "learning_rate": 7.873937051072037e-06, "loss": 3.1371, "step": 1180 }, { "epoch": 1.1258491240614945, "grad_norm": 18.166705168917975, "learning_rate": 7.869396487799668e-06, "loss": 3.1961, "step": 1181 }, { "epoch": 1.1268025265165058, "grad_norm": 19.477111832157718, "learning_rate": 7.864852393478131e-06, "loss": 3.0989, "step": 1182 }, { "epoch": 1.127755928971517, "grad_norm": 18.669403900046575, "learning_rate": 7.86030477369934e-06, "loss": 3.2309, "step": 1183 }, { "epoch": 1.1287093314265284, "grad_norm": 21.986453695798932, "learning_rate": 7.855753634059543e-06, "loss": 3.1396, "step": 1184 }, { "epoch": 1.1296627338815397, "grad_norm": 22.085621631520134, "learning_rate": 7.85119898015933e-06, "loss": 3.3027, "step": 1185 }, { "epoch": 1.130616136336551, "grad_norm": 18.45999707249247, "learning_rate": 7.846640817603607e-06, "loss": 3.2042, "step": 1186 }, { "epoch": 1.1315695387915623, "grad_norm": 22.913454777905308, "learning_rate": 7.842079152001607e-06, "loss": 3.2983, "step": 1187 }, { "epoch": 1.1325229412465738, "grad_norm": 18.867542717504115, "learning_rate": 7.83751398896686e-06, "loss": 3.141, "step": 1188 }, { "epoch": 1.1334763437015851, "grad_norm": 17.72592118621337, "learning_rate": 7.83294533411721e-06, "loss": 3.0548, "step": 1189 }, { "epoch": 1.1344297461565964, "grad_norm": 16.633182404938626, "learning_rate": 7.828373193074798e-06, "loss": 3.0761, "step": 1190 }, { "epoch": 1.1353831486116077, "grad_norm": 17.274650984056155, "learning_rate": 7.823797571466051e-06, "loss": 3.0613, "step": 1191 }, { "epoch": 1.136336551066619, "grad_norm": 20.124794122195944, "learning_rate": 7.81921847492168e-06, "loss": 3.0775, "step": 1192 }, { "epoch": 1.1372899535216303, "grad_norm": 18.807156868497703, "learning_rate": 7.814635909076676e-06, "loss": 3.1908, "step": 1193 }, { "epoch": 1.1382433559766416, "grad_norm": 16.33262718599133, "learning_rate": 7.810049879570294e-06, "loss": 3.0327, "step": 1194 }, { "epoch": 1.139196758431653, "grad_norm": 16.390637428744515, "learning_rate": 7.805460392046054e-06, "loss": 2.9326, "step": 1195 }, { "epoch": 1.1401501608866642, "grad_norm": 18.546042424220207, "learning_rate": 7.80086745215173e-06, "loss": 3.305, "step": 1196 }, { "epoch": 1.1411035633416755, "grad_norm": 27.431757307714076, "learning_rate": 7.79627106553935e-06, "loss": 2.9927, "step": 1197 }, { "epoch": 1.1420569657966868, "grad_norm": 20.147542957371, "learning_rate": 7.791671237865176e-06, "loss": 3.2855, "step": 1198 }, { "epoch": 1.1430103682516983, "grad_norm": 18.176991182825464, "learning_rate": 7.787067974789705e-06, "loss": 3.0823, "step": 1199 }, { "epoch": 1.1439637707067096, "grad_norm": 21.512160405551867, "learning_rate": 7.782461281977668e-06, "loss": 3.3573, "step": 1200 }, { "epoch": 1.144917173161721, "grad_norm": 22.620678241635108, "learning_rate": 7.777851165098012e-06, "loss": 3.3062, "step": 1201 }, { "epoch": 1.1458705756167322, "grad_norm": 20.76902866117653, "learning_rate": 7.773237629823897e-06, "loss": 3.0617, "step": 1202 }, { "epoch": 1.1468239780717435, "grad_norm": 22.404792222856884, "learning_rate": 7.768620681832695e-06, "loss": 3.289, "step": 1203 }, { "epoch": 1.1477773805267548, "grad_norm": 18.1733965863717, "learning_rate": 7.764000326805967e-06, "loss": 3.1073, "step": 1204 }, { "epoch": 1.1487307829817661, "grad_norm": 19.877341462543725, "learning_rate": 7.75937657042948e-06, "loss": 3.0428, "step": 1205 }, { "epoch": 1.1496841854367774, "grad_norm": 22.42968755946053, "learning_rate": 7.754749418393176e-06, "loss": 3.0785, "step": 1206 }, { "epoch": 1.1506375878917887, "grad_norm": 18.5483416536666, "learning_rate": 7.750118876391182e-06, "loss": 3.2956, "step": 1207 }, { "epoch": 1.1515909903468002, "grad_norm": 18.94455572698192, "learning_rate": 7.74548495012179e-06, "loss": 2.9145, "step": 1208 }, { "epoch": 1.1525443928018115, "grad_norm": 18.63655947821557, "learning_rate": 7.740847645287467e-06, "loss": 3.0207, "step": 1209 }, { "epoch": 1.1534977952568228, "grad_norm": 20.7333763237411, "learning_rate": 7.736206967594828e-06, "loss": 3.0489, "step": 1210 }, { "epoch": 1.1544511977118341, "grad_norm": 20.431034983301725, "learning_rate": 7.731562922754643e-06, "loss": 3.5104, "step": 1211 }, { "epoch": 1.1554046001668454, "grad_norm": 23.36890360376239, "learning_rate": 7.726915516481824e-06, "loss": 3.0657, "step": 1212 }, { "epoch": 1.1563580026218567, "grad_norm": 17.050995948445614, "learning_rate": 7.722264754495422e-06, "loss": 2.768, "step": 1213 }, { "epoch": 1.157311405076868, "grad_norm": 17.30744449657713, "learning_rate": 7.717610642518615e-06, "loss": 3.1827, "step": 1214 }, { "epoch": 1.1582648075318793, "grad_norm": 22.153343312991318, "learning_rate": 7.712953186278703e-06, "loss": 3.2428, "step": 1215 }, { "epoch": 1.1592182099868906, "grad_norm": 18.510797393367014, "learning_rate": 7.708292391507105e-06, "loss": 2.9657, "step": 1216 }, { "epoch": 1.1601716124419021, "grad_norm": 16.764780023461327, "learning_rate": 7.703628263939346e-06, "loss": 3.0835, "step": 1217 }, { "epoch": 1.1611250148969134, "grad_norm": 17.090439347430596, "learning_rate": 7.69896080931505e-06, "loss": 2.8602, "step": 1218 }, { "epoch": 1.1620784173519247, "grad_norm": 20.842967332936198, "learning_rate": 7.69429003337794e-06, "loss": 3.1466, "step": 1219 }, { "epoch": 1.163031819806936, "grad_norm": 17.548031723372365, "learning_rate": 7.68961594187582e-06, "loss": 3.355, "step": 1220 }, { "epoch": 1.1639852222619473, "grad_norm": 18.853123280450465, "learning_rate": 7.684938540560583e-06, "loss": 3.2166, "step": 1221 }, { "epoch": 1.1649386247169586, "grad_norm": 22.018788721004512, "learning_rate": 7.680257835188187e-06, "loss": 3.0946, "step": 1222 }, { "epoch": 1.16589202717197, "grad_norm": 20.009490828872416, "learning_rate": 7.67557383151866e-06, "loss": 3.1974, "step": 1223 }, { "epoch": 1.1668454296269812, "grad_norm": 27.083581040999693, "learning_rate": 7.670886535316086e-06, "loss": 2.9664, "step": 1224 }, { "epoch": 1.1677988320819925, "grad_norm": 18.09569166866641, "learning_rate": 7.666195952348607e-06, "loss": 2.9946, "step": 1225 }, { "epoch": 1.168752234537004, "grad_norm": 20.94529756135149, "learning_rate": 7.661502088388398e-06, "loss": 3.2764, "step": 1226 }, { "epoch": 1.1697056369920151, "grad_norm": 19.69547769536894, "learning_rate": 7.656804949211684e-06, "loss": 3.1257, "step": 1227 }, { "epoch": 1.1706590394470267, "grad_norm": 19.05486938579244, "learning_rate": 7.652104540598712e-06, "loss": 3.0633, "step": 1228 }, { "epoch": 1.171612441902038, "grad_norm": 19.833531402798368, "learning_rate": 7.64740086833376e-06, "loss": 3.0576, "step": 1229 }, { "epoch": 1.1725658443570492, "grad_norm": 20.81170484236003, "learning_rate": 7.642693938205112e-06, "loss": 3.0283, "step": 1230 }, { "epoch": 1.1735192468120605, "grad_norm": 19.601269479547614, "learning_rate": 7.637983756005072e-06, "loss": 3.2134, "step": 1231 }, { "epoch": 1.1744726492670718, "grad_norm": 16.586269101672826, "learning_rate": 7.633270327529936e-06, "loss": 2.8596, "step": 1232 }, { "epoch": 1.1754260517220831, "grad_norm": 18.870779151709016, "learning_rate": 7.628553658580003e-06, "loss": 3.4508, "step": 1233 }, { "epoch": 1.1763794541770944, "grad_norm": 19.683156388402757, "learning_rate": 7.623833754959552e-06, "loss": 3.3103, "step": 1234 }, { "epoch": 1.1773328566321057, "grad_norm": 18.68189141174557, "learning_rate": 7.619110622476853e-06, "loss": 3.0815, "step": 1235 }, { "epoch": 1.178286259087117, "grad_norm": 19.793278935235342, "learning_rate": 7.614384266944139e-06, "loss": 3.118, "step": 1236 }, { "epoch": 1.1792396615421286, "grad_norm": 17.5202166711142, "learning_rate": 7.609654694177613e-06, "loss": 3.1107, "step": 1237 }, { "epoch": 1.1801930639971399, "grad_norm": 17.721188576829743, "learning_rate": 7.60492190999744e-06, "loss": 3.0916, "step": 1238 }, { "epoch": 1.1811464664521512, "grad_norm": 21.008730054409575, "learning_rate": 7.6001859202277335e-06, "loss": 3.0378, "step": 1239 }, { "epoch": 1.1820998689071625, "grad_norm": 20.99922094975697, "learning_rate": 7.595446730696554e-06, "loss": 3.179, "step": 1240 }, { "epoch": 1.1830532713621738, "grad_norm": 18.370342182402517, "learning_rate": 7.590704347235898e-06, "loss": 3.0559, "step": 1241 }, { "epoch": 1.184006673817185, "grad_norm": 20.887288695700718, "learning_rate": 7.585958775681688e-06, "loss": 3.2041, "step": 1242 }, { "epoch": 1.1849600762721963, "grad_norm": 18.29317019746262, "learning_rate": 7.581210021873779e-06, "loss": 3.1772, "step": 1243 }, { "epoch": 1.1859134787272076, "grad_norm": 18.364328819511663, "learning_rate": 7.5764580916559405e-06, "loss": 3.3836, "step": 1244 }, { "epoch": 1.186866881182219, "grad_norm": 18.701360810258535, "learning_rate": 7.57170299087584e-06, "loss": 3.0364, "step": 1245 }, { "epoch": 1.1878202836372305, "grad_norm": 16.770217066585033, "learning_rate": 7.566944725385061e-06, "loss": 3.0557, "step": 1246 }, { "epoch": 1.1887736860922418, "grad_norm": 19.204039555036903, "learning_rate": 7.562183301039073e-06, "loss": 3.2256, "step": 1247 }, { "epoch": 1.189727088547253, "grad_norm": 23.783416788346962, "learning_rate": 7.5574187236972344e-06, "loss": 3.4852, "step": 1248 }, { "epoch": 1.1906804910022644, "grad_norm": 23.629665552098796, "learning_rate": 7.5526509992227836e-06, "loss": 3.2439, "step": 1249 }, { "epoch": 1.1916338934572757, "grad_norm": 19.52474745715692, "learning_rate": 7.547880133482834e-06, "loss": 3.0732, "step": 1250 }, { "epoch": 1.192587295912287, "grad_norm": 20.065462164156656, "learning_rate": 7.54310613234836e-06, "loss": 3.1965, "step": 1251 }, { "epoch": 1.1935406983672983, "grad_norm": 16.63313411412898, "learning_rate": 7.5383290016942e-06, "loss": 3.2913, "step": 1252 }, { "epoch": 1.1944941008223096, "grad_norm": 18.515321606650232, "learning_rate": 7.53354874739904e-06, "loss": 3.0151, "step": 1253 }, { "epoch": 1.1954475032773209, "grad_norm": 19.194223600395784, "learning_rate": 7.528765375345411e-06, "loss": 3.0686, "step": 1254 }, { "epoch": 1.1964009057323324, "grad_norm": 18.404057333536446, "learning_rate": 7.523978891419679e-06, "loss": 3.0806, "step": 1255 }, { "epoch": 1.1973543081873437, "grad_norm": 20.30501974135666, "learning_rate": 7.519189301512042e-06, "loss": 3.1407, "step": 1256 }, { "epoch": 1.198307710642355, "grad_norm": 22.646744052966266, "learning_rate": 7.514396611516519e-06, "loss": 3.1731, "step": 1257 }, { "epoch": 1.1992611130973663, "grad_norm": 21.05440508248916, "learning_rate": 7.509600827330943e-06, "loss": 3.1504, "step": 1258 }, { "epoch": 1.2002145155523776, "grad_norm": 22.94445021475327, "learning_rate": 7.504801954856957e-06, "loss": 3.2767, "step": 1259 }, { "epoch": 1.2011679180073889, "grad_norm": 22.506022886897135, "learning_rate": 7.500000000000001e-06, "loss": 3.0305, "step": 1260 }, { "epoch": 1.2021213204624002, "grad_norm": 19.552096596285587, "learning_rate": 7.495194968669311e-06, "loss": 3.1947, "step": 1261 }, { "epoch": 1.2030747229174115, "grad_norm": 16.481851746212325, "learning_rate": 7.4903868667779116e-06, "loss": 3.1773, "step": 1262 }, { "epoch": 1.2040281253724228, "grad_norm": 19.671309311087636, "learning_rate": 7.485575700242597e-06, "loss": 3.1222, "step": 1263 }, { "epoch": 1.204981527827434, "grad_norm": 19.911889435693844, "learning_rate": 7.480761474983943e-06, "loss": 2.9538, "step": 1264 }, { "epoch": 1.2059349302824454, "grad_norm": 18.9119774279386, "learning_rate": 7.47594419692628e-06, "loss": 3.0944, "step": 1265 }, { "epoch": 1.2068883327374569, "grad_norm": 19.803422210807504, "learning_rate": 7.471123871997703e-06, "loss": 3.0768, "step": 1266 }, { "epoch": 1.2078417351924682, "grad_norm": 21.44010921927493, "learning_rate": 7.466300506130053e-06, "loss": 2.9963, "step": 1267 }, { "epoch": 1.2087951376474795, "grad_norm": 18.927086497359845, "learning_rate": 7.461474105258911e-06, "loss": 2.9808, "step": 1268 }, { "epoch": 1.2097485401024908, "grad_norm": 19.399584090253377, "learning_rate": 7.456644675323597e-06, "loss": 3.4861, "step": 1269 }, { "epoch": 1.210701942557502, "grad_norm": 22.831478496061848, "learning_rate": 7.4518122222671585e-06, "loss": 3.5247, "step": 1270 }, { "epoch": 1.2116553450125134, "grad_norm": 21.304097548010265, "learning_rate": 7.446976752036358e-06, "loss": 3.0697, "step": 1271 }, { "epoch": 1.2126087474675247, "grad_norm": 17.727314285419762, "learning_rate": 7.442138270581676e-06, "loss": 2.8174, "step": 1272 }, { "epoch": 1.213562149922536, "grad_norm": 16.981554258711785, "learning_rate": 7.437296783857297e-06, "loss": 3.0242, "step": 1273 }, { "epoch": 1.2145155523775473, "grad_norm": 17.42930288104114, "learning_rate": 7.432452297821103e-06, "loss": 3.4167, "step": 1274 }, { "epoch": 1.2154689548325588, "grad_norm": 19.205320117738363, "learning_rate": 7.4276048184346695e-06, "loss": 3.0459, "step": 1275 }, { "epoch": 1.21642235728757, "grad_norm": 16.183008777116562, "learning_rate": 7.422754351663252e-06, "loss": 3.0449, "step": 1276 }, { "epoch": 1.2173757597425814, "grad_norm": 18.802323850899835, "learning_rate": 7.417900903475783e-06, "loss": 3.1793, "step": 1277 }, { "epoch": 1.2183291621975927, "grad_norm": 18.13352473416534, "learning_rate": 7.413044479844867e-06, "loss": 3.1058, "step": 1278 }, { "epoch": 1.219282564652604, "grad_norm": 16.10435094642933, "learning_rate": 7.408185086746767e-06, "loss": 3.2091, "step": 1279 }, { "epoch": 1.2202359671076153, "grad_norm": 18.790869815555805, "learning_rate": 7.403322730161402e-06, "loss": 3.253, "step": 1280 }, { "epoch": 1.2211893695626266, "grad_norm": 19.035486719915713, "learning_rate": 7.398457416072334e-06, "loss": 3.0328, "step": 1281 }, { "epoch": 1.2221427720176379, "grad_norm": 17.480102345666385, "learning_rate": 7.3935891504667715e-06, "loss": 2.9932, "step": 1282 }, { "epoch": 1.2230961744726492, "grad_norm": 21.20835528705613, "learning_rate": 7.388717939335548e-06, "loss": 3.1415, "step": 1283 }, { "epoch": 1.2240495769276607, "grad_norm": 16.355994306502364, "learning_rate": 7.3838437886731264e-06, "loss": 2.8831, "step": 1284 }, { "epoch": 1.225002979382672, "grad_norm": 20.895522098366506, "learning_rate": 7.378966704477585e-06, "loss": 3.1783, "step": 1285 }, { "epoch": 1.2259563818376833, "grad_norm": 19.083832122011113, "learning_rate": 7.374086692750611e-06, "loss": 3.1196, "step": 1286 }, { "epoch": 1.2269097842926946, "grad_norm": 19.976261398282933, "learning_rate": 7.369203759497497e-06, "loss": 2.9486, "step": 1287 }, { "epoch": 1.227863186747706, "grad_norm": 19.940284060118312, "learning_rate": 7.364317910727128e-06, "loss": 2.9, "step": 1288 }, { "epoch": 1.2288165892027172, "grad_norm": 19.85187025857987, "learning_rate": 7.359429152451978e-06, "loss": 2.9984, "step": 1289 }, { "epoch": 1.2297699916577285, "grad_norm": 19.51246834754821, "learning_rate": 7.354537490688105e-06, "loss": 3.1204, "step": 1290 }, { "epoch": 1.2307233941127398, "grad_norm": 19.573674460354596, "learning_rate": 7.349642931455132e-06, "loss": 3.0738, "step": 1291 }, { "epoch": 1.231676796567751, "grad_norm": 19.792766127551303, "learning_rate": 7.3447454807762565e-06, "loss": 3.2961, "step": 1292 }, { "epoch": 1.2326301990227626, "grad_norm": 18.38946768494984, "learning_rate": 7.339845144678227e-06, "loss": 3.2982, "step": 1293 }, { "epoch": 1.2335836014777737, "grad_norm": 22.949215040552946, "learning_rate": 7.3349419291913445e-06, "loss": 3.065, "step": 1294 }, { "epoch": 1.2345370039327852, "grad_norm": 21.190120565571434, "learning_rate": 7.33003584034946e-06, "loss": 3.2714, "step": 1295 }, { "epoch": 1.2354904063877965, "grad_norm": 19.511461151479384, "learning_rate": 7.325126884189948e-06, "loss": 3.2719, "step": 1296 }, { "epoch": 1.2364438088428078, "grad_norm": 19.53705739317875, "learning_rate": 7.320215066753723e-06, "loss": 3.3774, "step": 1297 }, { "epoch": 1.237397211297819, "grad_norm": 20.364929940400835, "learning_rate": 7.3153003940852145e-06, "loss": 3.0939, "step": 1298 }, { "epoch": 1.2383506137528304, "grad_norm": 23.206526378783114, "learning_rate": 7.310382872232367e-06, "loss": 3.1109, "step": 1299 }, { "epoch": 1.2393040162078417, "grad_norm": 20.744203979138025, "learning_rate": 7.30546250724663e-06, "loss": 3.144, "step": 1300 }, { "epoch": 1.240257418662853, "grad_norm": 17.308547506783462, "learning_rate": 7.300539305182955e-06, "loss": 3.1984, "step": 1301 }, { "epoch": 1.2412108211178643, "grad_norm": 18.227898355499484, "learning_rate": 7.295613272099782e-06, "loss": 3.1079, "step": 1302 }, { "epoch": 1.2421642235728756, "grad_norm": 20.224916341703942, "learning_rate": 7.290684414059035e-06, "loss": 3.1146, "step": 1303 }, { "epoch": 1.2431176260278871, "grad_norm": 21.54140721659713, "learning_rate": 7.285752737126117e-06, "loss": 3.0423, "step": 1304 }, { "epoch": 1.2440710284828984, "grad_norm": 21.758979135626593, "learning_rate": 7.2808182473698955e-06, "loss": 3.026, "step": 1305 }, { "epoch": 1.2450244309379097, "grad_norm": 18.186422923166823, "learning_rate": 7.275880950862701e-06, "loss": 3.0805, "step": 1306 }, { "epoch": 1.245977833392921, "grad_norm": 18.486411867352185, "learning_rate": 7.270940853680322e-06, "loss": 2.9961, "step": 1307 }, { "epoch": 1.2469312358479323, "grad_norm": 21.486130126589583, "learning_rate": 7.265997961901987e-06, "loss": 3.2833, "step": 1308 }, { "epoch": 1.2478846383029436, "grad_norm": 18.153284599546794, "learning_rate": 7.2610522816103675e-06, "loss": 3.1448, "step": 1309 }, { "epoch": 1.248838040757955, "grad_norm": 17.650453969663317, "learning_rate": 7.256103818891569e-06, "loss": 3.2826, "step": 1310 }, { "epoch": 1.2497914432129662, "grad_norm": 21.409876869134997, "learning_rate": 7.251152579835114e-06, "loss": 3.1761, "step": 1311 }, { "epoch": 1.2507448456679775, "grad_norm": 20.85227633257782, "learning_rate": 7.246198570533944e-06, "loss": 2.9933, "step": 1312 }, { "epoch": 1.251698248122989, "grad_norm": 18.73886171180629, "learning_rate": 7.2412417970844154e-06, "loss": 3.2226, "step": 1313 }, { "epoch": 1.2526516505780003, "grad_norm": 21.83350402729095, "learning_rate": 7.236282265586279e-06, "loss": 3.0703, "step": 1314 }, { "epoch": 1.2536050530330116, "grad_norm": 19.973691929806478, "learning_rate": 7.2313199821426806e-06, "loss": 3.1978, "step": 1315 }, { "epoch": 1.254558455488023, "grad_norm": 20.25722830033505, "learning_rate": 7.226354952860157e-06, "loss": 3.0709, "step": 1316 }, { "epoch": 1.2555118579430342, "grad_norm": 16.082574156479275, "learning_rate": 7.22138718384862e-06, "loss": 2.9923, "step": 1317 }, { "epoch": 1.2564652603980455, "grad_norm": 23.31306944868432, "learning_rate": 7.216416681221354e-06, "loss": 3.1132, "step": 1318 }, { "epoch": 1.2574186628530568, "grad_norm": 18.842999111893754, "learning_rate": 7.211443451095007e-06, "loss": 3.3305, "step": 1319 }, { "epoch": 1.258372065308068, "grad_norm": 21.021821595484596, "learning_rate": 7.206467499589584e-06, "loss": 3.2218, "step": 1320 }, { "epoch": 1.2593254677630794, "grad_norm": 22.548669509782936, "learning_rate": 7.201488832828439e-06, "loss": 3.2228, "step": 1321 }, { "epoch": 1.260278870218091, "grad_norm": 19.868518105453354, "learning_rate": 7.196507456938264e-06, "loss": 3.1877, "step": 1322 }, { "epoch": 1.261232272673102, "grad_norm": 22.93157146021304, "learning_rate": 7.191523378049092e-06, "loss": 3.3608, "step": 1323 }, { "epoch": 1.2621856751281135, "grad_norm": 23.45717775583014, "learning_rate": 7.186536602294278e-06, "loss": 3.1102, "step": 1324 }, { "epoch": 1.2631390775831248, "grad_norm": 19.368120376687827, "learning_rate": 7.181547135810491e-06, "loss": 3.2147, "step": 1325 }, { "epoch": 1.2640924800381361, "grad_norm": 18.61750849314434, "learning_rate": 7.1765549847377236e-06, "loss": 3.2801, "step": 1326 }, { "epoch": 1.2650458824931474, "grad_norm": 19.648037659498577, "learning_rate": 7.171560155219257e-06, "loss": 2.9008, "step": 1327 }, { "epoch": 1.2659992849481587, "grad_norm": 17.56824442362474, "learning_rate": 7.166562653401681e-06, "loss": 3.0529, "step": 1328 }, { "epoch": 1.26695268740317, "grad_norm": 17.122453415319708, "learning_rate": 7.161562485434865e-06, "loss": 3.123, "step": 1329 }, { "epoch": 1.2679060898581813, "grad_norm": 18.117107261735665, "learning_rate": 7.156559657471967e-06, "loss": 3.1255, "step": 1330 }, { "epoch": 1.2688594923131928, "grad_norm": 17.829240432538185, "learning_rate": 7.1515541756694116e-06, "loss": 3.0609, "step": 1331 }, { "epoch": 1.269812894768204, "grad_norm": 18.860888619199194, "learning_rate": 7.146546046186893e-06, "loss": 3.1534, "step": 1332 }, { "epoch": 1.2707662972232154, "grad_norm": 20.950817579858697, "learning_rate": 7.141535275187363e-06, "loss": 3.0662, "step": 1333 }, { "epoch": 1.2717196996782267, "grad_norm": 21.246244554412026, "learning_rate": 7.136521868837024e-06, "loss": 3.1304, "step": 1334 }, { "epoch": 1.272673102133238, "grad_norm": 24.55408867237692, "learning_rate": 7.131505833305321e-06, "loss": 3.1956, "step": 1335 }, { "epoch": 1.2736265045882493, "grad_norm": 20.760382547215293, "learning_rate": 7.126487174764936e-06, "loss": 3.0939, "step": 1336 }, { "epoch": 1.2745799070432606, "grad_norm": 17.18196165535071, "learning_rate": 7.121465899391773e-06, "loss": 3.0994, "step": 1337 }, { "epoch": 1.275533309498272, "grad_norm": 17.934274242626653, "learning_rate": 7.116442013364964e-06, "loss": 3.2169, "step": 1338 }, { "epoch": 1.2764867119532832, "grad_norm": 22.657325923151475, "learning_rate": 7.111415522866851e-06, "loss": 3.1352, "step": 1339 }, { "epoch": 1.2774401144082945, "grad_norm": 18.363133542426123, "learning_rate": 7.106386434082979e-06, "loss": 3.0015, "step": 1340 }, { "epoch": 1.2783935168633058, "grad_norm": 18.102267683319557, "learning_rate": 7.101354753202092e-06, "loss": 3.1443, "step": 1341 }, { "epoch": 1.2793469193183173, "grad_norm": 20.725098066145517, "learning_rate": 7.096320486416125e-06, "loss": 3.3178, "step": 1342 }, { "epoch": 1.2803003217733286, "grad_norm": 19.99966178273997, "learning_rate": 7.091283639920191e-06, "loss": 3.1275, "step": 1343 }, { "epoch": 1.28125372422834, "grad_norm": 17.462025405450316, "learning_rate": 7.0862442199125836e-06, "loss": 2.9477, "step": 1344 }, { "epoch": 1.2822071266833512, "grad_norm": 20.181514557121773, "learning_rate": 7.081202232594758e-06, "loss": 3.3126, "step": 1345 }, { "epoch": 1.2831605291383625, "grad_norm": 21.226407094792837, "learning_rate": 7.0761576841713306e-06, "loss": 3.1026, "step": 1346 }, { "epoch": 1.2841139315933738, "grad_norm": 18.175445363389155, "learning_rate": 7.071110580850071e-06, "loss": 2.9852, "step": 1347 }, { "epoch": 1.2850673340483851, "grad_norm": 22.920532303506555, "learning_rate": 7.066060928841891e-06, "loss": 3.0977, "step": 1348 }, { "epoch": 1.2860207365033964, "grad_norm": 24.029393051985625, "learning_rate": 7.061008734360841e-06, "loss": 3.1904, "step": 1349 }, { "epoch": 1.2869741389584077, "grad_norm": 17.98183968322068, "learning_rate": 7.055954003624094e-06, "loss": 3.0658, "step": 1350 }, { "epoch": 1.2879275414134193, "grad_norm": 18.975673675452875, "learning_rate": 7.0508967428519525e-06, "loss": 3.2843, "step": 1351 }, { "epoch": 1.2888809438684303, "grad_norm": 21.26071374484562, "learning_rate": 7.0458369582678276e-06, "loss": 2.993, "step": 1352 }, { "epoch": 1.2898343463234418, "grad_norm": 17.095618478432506, "learning_rate": 7.040774656098235e-06, "loss": 3.1926, "step": 1353 }, { "epoch": 1.2907877487784531, "grad_norm": 20.62343678796673, "learning_rate": 7.0357098425727935e-06, "loss": 3.1671, "step": 1354 }, { "epoch": 1.2917411512334644, "grad_norm": 17.467549019274454, "learning_rate": 7.030642523924209e-06, "loss": 3.3047, "step": 1355 }, { "epoch": 1.2926945536884757, "grad_norm": 20.902405552593216, "learning_rate": 7.025572706388268e-06, "loss": 3.1973, "step": 1356 }, { "epoch": 1.293647956143487, "grad_norm": 18.291446930048124, "learning_rate": 7.020500396203838e-06, "loss": 3.1689, "step": 1357 }, { "epoch": 1.2946013585984983, "grad_norm": 20.425527292202286, "learning_rate": 7.015425599612849e-06, "loss": 3.1378, "step": 1358 }, { "epoch": 1.2955547610535096, "grad_norm": 20.717999614686086, "learning_rate": 7.010348322860291e-06, "loss": 3.2383, "step": 1359 }, { "epoch": 1.2965081635085212, "grad_norm": 17.505448381812606, "learning_rate": 7.005268572194208e-06, "loss": 3.0934, "step": 1360 }, { "epoch": 1.2974615659635322, "grad_norm": 21.633578049322924, "learning_rate": 7.000186353865691e-06, "loss": 3.1233, "step": 1361 }, { "epoch": 1.2984149684185438, "grad_norm": 20.634488868206866, "learning_rate": 6.995101674128861e-06, "loss": 3.2909, "step": 1362 }, { "epoch": 1.299368370873555, "grad_norm": 21.84521950012519, "learning_rate": 6.990014539240874e-06, "loss": 3.1755, "step": 1363 }, { "epoch": 1.3003217733285664, "grad_norm": 19.295357804989244, "learning_rate": 6.984924955461901e-06, "loss": 3.2257, "step": 1364 }, { "epoch": 1.3012751757835777, "grad_norm": 21.735739564432937, "learning_rate": 6.979832929055135e-06, "loss": 3.2538, "step": 1365 }, { "epoch": 1.302228578238589, "grad_norm": 19.19904446585923, "learning_rate": 6.9747384662867654e-06, "loss": 3.0804, "step": 1366 }, { "epoch": 1.3031819806936003, "grad_norm": 22.691342837894286, "learning_rate": 6.969641573425991e-06, "loss": 2.9007, "step": 1367 }, { "epoch": 1.3041353831486115, "grad_norm": 20.9764104424804, "learning_rate": 6.964542256744986e-06, "loss": 3.3937, "step": 1368 }, { "epoch": 1.3050887856036228, "grad_norm": 20.797138159236408, "learning_rate": 6.959440522518923e-06, "loss": 3.2336, "step": 1369 }, { "epoch": 1.3060421880586341, "grad_norm": 19.122712567790597, "learning_rate": 6.9543363770259385e-06, "loss": 3.0336, "step": 1370 }, { "epoch": 1.3069955905136457, "grad_norm": 19.52256758799079, "learning_rate": 6.949229826547143e-06, "loss": 3.2572, "step": 1371 }, { "epoch": 1.307948992968657, "grad_norm": 19.83524378295537, "learning_rate": 6.944120877366605e-06, "loss": 3.0336, "step": 1372 }, { "epoch": 1.3089023954236683, "grad_norm": 21.654116322449777, "learning_rate": 6.939009535771337e-06, "loss": 3.0323, "step": 1373 }, { "epoch": 1.3098557978786796, "grad_norm": 21.59045814085364, "learning_rate": 6.93389580805131e-06, "loss": 2.8054, "step": 1374 }, { "epoch": 1.3108092003336909, "grad_norm": 21.540762116237907, "learning_rate": 6.928779700499419e-06, "loss": 3.1498, "step": 1375 }, { "epoch": 1.3117626027887022, "grad_norm": 21.05881034853569, "learning_rate": 6.923661219411494e-06, "loss": 3.3063, "step": 1376 }, { "epoch": 1.3127160052437135, "grad_norm": 22.597207114450132, "learning_rate": 6.918540371086284e-06, "loss": 3.0551, "step": 1377 }, { "epoch": 1.3136694076987248, "grad_norm": 18.30193783424147, "learning_rate": 6.913417161825449e-06, "loss": 3.2275, "step": 1378 }, { "epoch": 1.314622810153736, "grad_norm": 20.882397987451586, "learning_rate": 6.9082915979335585e-06, "loss": 2.9054, "step": 1379 }, { "epoch": 1.3155762126087476, "grad_norm": 16.723749939712413, "learning_rate": 6.9031636857180795e-06, "loss": 2.9686, "step": 1380 }, { "epoch": 1.3165296150637587, "grad_norm": 18.005798757912707, "learning_rate": 6.898033431489363e-06, "loss": 3.2542, "step": 1381 }, { "epoch": 1.3174830175187702, "grad_norm": 27.20645991253771, "learning_rate": 6.892900841560648e-06, "loss": 3.1879, "step": 1382 }, { "epoch": 1.3184364199737815, "grad_norm": 17.866425461231078, "learning_rate": 6.887765922248045e-06, "loss": 3.3709, "step": 1383 }, { "epoch": 1.3193898224287928, "grad_norm": 20.667200313221727, "learning_rate": 6.8826286798705325e-06, "loss": 3.1962, "step": 1384 }, { "epoch": 1.320343224883804, "grad_norm": 18.659010818323743, "learning_rate": 6.877489120749946e-06, "loss": 2.9013, "step": 1385 }, { "epoch": 1.3212966273388154, "grad_norm": 20.571149218673224, "learning_rate": 6.8723472512109745e-06, "loss": 3.1402, "step": 1386 }, { "epoch": 1.3222500297938267, "grad_norm": 22.809959715618252, "learning_rate": 6.867203077581146e-06, "loss": 3.2255, "step": 1387 }, { "epoch": 1.323203432248838, "grad_norm": 19.40077157508024, "learning_rate": 6.86205660619083e-06, "loss": 3.3005, "step": 1388 }, { "epoch": 1.3241568347038495, "grad_norm": 19.615354990616357, "learning_rate": 6.856907843373217e-06, "loss": 3.0851, "step": 1389 }, { "epoch": 1.3251102371588606, "grad_norm": 17.248744541229293, "learning_rate": 6.851756795464323e-06, "loss": 3.2104, "step": 1390 }, { "epoch": 1.326063639613872, "grad_norm": 25.336305749402094, "learning_rate": 6.846603468802973e-06, "loss": 3.2048, "step": 1391 }, { "epoch": 1.3270170420688834, "grad_norm": 22.57405566793287, "learning_rate": 6.841447869730794e-06, "loss": 3.3065, "step": 1392 }, { "epoch": 1.3279704445238947, "grad_norm": 21.0000899380381, "learning_rate": 6.836290004592214e-06, "loss": 3.0925, "step": 1393 }, { "epoch": 1.328923846978906, "grad_norm": 21.525871615363133, "learning_rate": 6.831129879734448e-06, "loss": 3.1936, "step": 1394 }, { "epoch": 1.3298772494339173, "grad_norm": 18.0353686990679, "learning_rate": 6.8259675015074914e-06, "loss": 3.0495, "step": 1395 }, { "epoch": 1.3308306518889286, "grad_norm": 18.81078801991568, "learning_rate": 6.820802876264112e-06, "loss": 3.0623, "step": 1396 }, { "epoch": 1.3317840543439399, "grad_norm": 18.854557438368705, "learning_rate": 6.815636010359843e-06, "loss": 3.1555, "step": 1397 }, { "epoch": 1.3327374567989514, "grad_norm": 17.013703062956466, "learning_rate": 6.8104669101529766e-06, "loss": 3.2307, "step": 1398 }, { "epoch": 1.3336908592539625, "grad_norm": 26.292421364589448, "learning_rate": 6.805295582004552e-06, "loss": 3.0544, "step": 1399 }, { "epoch": 1.334644261708974, "grad_norm": 18.345841830175885, "learning_rate": 6.800122032278351e-06, "loss": 3.2113, "step": 1400 }, { "epoch": 1.3355976641639853, "grad_norm": 20.263415707874504, "learning_rate": 6.794946267340892e-06, "loss": 3.2174, "step": 1401 }, { "epoch": 1.3365510666189966, "grad_norm": 21.458855532208993, "learning_rate": 6.789768293561414e-06, "loss": 3.0551, "step": 1402 }, { "epoch": 1.3375044690740079, "grad_norm": 22.831112264617293, "learning_rate": 6.784588117311879e-06, "loss": 3.1121, "step": 1403 }, { "epoch": 1.3384578715290192, "grad_norm": 18.14060664373189, "learning_rate": 6.7794057449669545e-06, "loss": 3.2326, "step": 1404 }, { "epoch": 1.3394112739840305, "grad_norm": 18.0274237644326, "learning_rate": 6.774221182904018e-06, "loss": 3.0681, "step": 1405 }, { "epoch": 1.3403646764390418, "grad_norm": 19.493285414187753, "learning_rate": 6.769034437503136e-06, "loss": 3.1635, "step": 1406 }, { "epoch": 1.341318078894053, "grad_norm": 18.78213712283706, "learning_rate": 6.76384551514706e-06, "loss": 3.156, "step": 1407 }, { "epoch": 1.3422714813490644, "grad_norm": 19.396386982519935, "learning_rate": 6.758654422221225e-06, "loss": 3.2471, "step": 1408 }, { "epoch": 1.343224883804076, "grad_norm": 22.67725165217662, "learning_rate": 6.7534611651137365e-06, "loss": 3.156, "step": 1409 }, { "epoch": 1.3441782862590872, "grad_norm": 19.340496860764027, "learning_rate": 6.748265750215361e-06, "loss": 3.293, "step": 1410 }, { "epoch": 1.3451316887140985, "grad_norm": 19.076939786206108, "learning_rate": 6.74306818391952e-06, "loss": 3.1158, "step": 1411 }, { "epoch": 1.3460850911691098, "grad_norm": 16.30034697680283, "learning_rate": 6.7378684726222875e-06, "loss": 3.0567, "step": 1412 }, { "epoch": 1.347038493624121, "grad_norm": 18.066520742888617, "learning_rate": 6.732666622722371e-06, "loss": 3.1086, "step": 1413 }, { "epoch": 1.3479918960791324, "grad_norm": 28.093716966312332, "learning_rate": 6.727462640621113e-06, "loss": 3.0403, "step": 1414 }, { "epoch": 1.3489452985341437, "grad_norm": 20.154741591048328, "learning_rate": 6.72225653272248e-06, "loss": 3.2249, "step": 1415 }, { "epoch": 1.349898700989155, "grad_norm": 21.008994130346437, "learning_rate": 6.717048305433053e-06, "loss": 3.1102, "step": 1416 }, { "epoch": 1.3508521034441663, "grad_norm": 20.89305231772973, "learning_rate": 6.71183796516202e-06, "loss": 3.2005, "step": 1417 }, { "epoch": 1.3518055058991778, "grad_norm": 20.871671666050307, "learning_rate": 6.7066255183211745e-06, "loss": 3.1392, "step": 1418 }, { "epoch": 1.3527589083541889, "grad_norm": 21.401786222946424, "learning_rate": 6.701410971324896e-06, "loss": 3.0513, "step": 1419 }, { "epoch": 1.3537123108092004, "grad_norm": 17.41445390808209, "learning_rate": 6.6961943305901515e-06, "loss": 3.1859, "step": 1420 }, { "epoch": 1.3546657132642117, "grad_norm": 18.56563060286096, "learning_rate": 6.690975602536487e-06, "loss": 3.2041, "step": 1421 }, { "epoch": 1.355619115719223, "grad_norm": 19.26147867741435, "learning_rate": 6.6857547935860115e-06, "loss": 3.108, "step": 1422 }, { "epoch": 1.3565725181742343, "grad_norm": 22.919234721528014, "learning_rate": 6.680531910163399e-06, "loss": 3.3939, "step": 1423 }, { "epoch": 1.3575259206292456, "grad_norm": 22.935401930040747, "learning_rate": 6.675306958695874e-06, "loss": 3.448, "step": 1424 }, { "epoch": 1.358479323084257, "grad_norm": 20.119454335059427, "learning_rate": 6.670079945613207e-06, "loss": 3.1396, "step": 1425 }, { "epoch": 1.3594327255392682, "grad_norm": 22.164487661766664, "learning_rate": 6.664850877347706e-06, "loss": 3.1348, "step": 1426 }, { "epoch": 1.3603861279942797, "grad_norm": 19.6196515761043, "learning_rate": 6.659619760334208e-06, "loss": 3.3305, "step": 1427 }, { "epoch": 1.3613395304492908, "grad_norm": 20.96743214035873, "learning_rate": 6.65438660101007e-06, "loss": 3.063, "step": 1428 }, { "epoch": 1.3622929329043023, "grad_norm": 21.625625158353227, "learning_rate": 6.649151405815162e-06, "loss": 3.0714, "step": 1429 }, { "epoch": 1.3632463353593136, "grad_norm": 17.60169320917836, "learning_rate": 6.643914181191862e-06, "loss": 3.0563, "step": 1430 }, { "epoch": 1.364199737814325, "grad_norm": 18.18289938633811, "learning_rate": 6.638674933585043e-06, "loss": 3.1319, "step": 1431 }, { "epoch": 1.3651531402693362, "grad_norm": 24.63519790139341, "learning_rate": 6.633433669442066e-06, "loss": 3.3399, "step": 1432 }, { "epoch": 1.3661065427243475, "grad_norm": 16.438807902869428, "learning_rate": 6.6281903952127775e-06, "loss": 3.0812, "step": 1433 }, { "epoch": 1.3670599451793588, "grad_norm": 17.24959263748741, "learning_rate": 6.622945117349497e-06, "loss": 3.2493, "step": 1434 }, { "epoch": 1.36801334763437, "grad_norm": 20.698241810800102, "learning_rate": 6.617697842307005e-06, "loss": 2.8632, "step": 1435 }, { "epoch": 1.3689667500893814, "grad_norm": 18.178792865214078, "learning_rate": 6.612448576542545e-06, "loss": 3.0816, "step": 1436 }, { "epoch": 1.3699201525443927, "grad_norm": 21.542174202878854, "learning_rate": 6.607197326515808e-06, "loss": 3.0017, "step": 1437 }, { "epoch": 1.3708735549994042, "grad_norm": 17.318363287098055, "learning_rate": 6.601944098688928e-06, "loss": 3.1535, "step": 1438 }, { "epoch": 1.3718269574544155, "grad_norm": 26.175233556531982, "learning_rate": 6.596688899526471e-06, "loss": 2.9966, "step": 1439 }, { "epoch": 1.3727803599094268, "grad_norm": 18.228845490331633, "learning_rate": 6.59143173549543e-06, "loss": 3.1999, "step": 1440 }, { "epoch": 1.3737337623644381, "grad_norm": 18.106971009430307, "learning_rate": 6.586172613065216e-06, "loss": 3.2145, "step": 1441 }, { "epoch": 1.3746871648194494, "grad_norm": 18.82649549792602, "learning_rate": 6.5809115387076495e-06, "loss": 3.1227, "step": 1442 }, { "epoch": 1.3756405672744607, "grad_norm": 19.968127745434423, "learning_rate": 6.575648518896953e-06, "loss": 2.99, "step": 1443 }, { "epoch": 1.376593969729472, "grad_norm": 15.615597642653487, "learning_rate": 6.570383560109745e-06, "loss": 2.8814, "step": 1444 }, { "epoch": 1.3775473721844833, "grad_norm": 20.877264946827584, "learning_rate": 6.565116668825027e-06, "loss": 3.0013, "step": 1445 }, { "epoch": 1.3785007746394946, "grad_norm": 17.59570585705753, "learning_rate": 6.559847851524179e-06, "loss": 3.2445, "step": 1446 }, { "epoch": 1.3794541770945061, "grad_norm": 18.765505798084092, "learning_rate": 6.554577114690956e-06, "loss": 2.957, "step": 1447 }, { "epoch": 1.3804075795495172, "grad_norm": 20.04003165896157, "learning_rate": 6.549304464811467e-06, "loss": 3.0167, "step": 1448 }, { "epoch": 1.3813609820045287, "grad_norm": 19.164744644339372, "learning_rate": 6.544029908374182e-06, "loss": 3.0828, "step": 1449 }, { "epoch": 1.38231438445954, "grad_norm": 21.952350019048755, "learning_rate": 6.5387534518699145e-06, "loss": 3.0623, "step": 1450 }, { "epoch": 1.3832677869145513, "grad_norm": 18.574690455899386, "learning_rate": 6.533475101791816e-06, "loss": 3.0498, "step": 1451 }, { "epoch": 1.3842211893695626, "grad_norm": 18.87077162805114, "learning_rate": 6.52819486463537e-06, "loss": 3.1716, "step": 1452 }, { "epoch": 1.385174591824574, "grad_norm": 18.23787045568084, "learning_rate": 6.52291274689838e-06, "loss": 3.4491, "step": 1453 }, { "epoch": 1.3861279942795852, "grad_norm": 21.449717460671486, "learning_rate": 6.517628755080962e-06, "loss": 2.8454, "step": 1454 }, { "epoch": 1.3870813967345965, "grad_norm": 19.294542483451668, "learning_rate": 6.512342895685544e-06, "loss": 3.0608, "step": 1455 }, { "epoch": 1.388034799189608, "grad_norm": 22.282162614145182, "learning_rate": 6.50705517521685e-06, "loss": 3.3372, "step": 1456 }, { "epoch": 1.3889882016446191, "grad_norm": 20.158040727793743, "learning_rate": 6.50176560018189e-06, "loss": 3.0739, "step": 1457 }, { "epoch": 1.3899416040996306, "grad_norm": 18.635705202875567, "learning_rate": 6.496474177089959e-06, "loss": 3.2646, "step": 1458 }, { "epoch": 1.390895006554642, "grad_norm": 20.47289563694979, "learning_rate": 6.4911809124526315e-06, "loss": 3.0815, "step": 1459 }, { "epoch": 1.3918484090096532, "grad_norm": 19.378387337558593, "learning_rate": 6.48588581278374e-06, "loss": 2.8384, "step": 1460 }, { "epoch": 1.3928018114646645, "grad_norm": 16.148333132829656, "learning_rate": 6.480588884599377e-06, "loss": 3.3275, "step": 1461 }, { "epoch": 1.3937552139196758, "grad_norm": 20.208614709159193, "learning_rate": 6.475290134417892e-06, "loss": 3.0243, "step": 1462 }, { "epoch": 1.3947086163746871, "grad_norm": 18.34733140835761, "learning_rate": 6.469989568759865e-06, "loss": 3.21, "step": 1463 }, { "epoch": 1.3956620188296984, "grad_norm": 18.74940280887273, "learning_rate": 6.464687194148121e-06, "loss": 3.2875, "step": 1464 }, { "epoch": 1.39661542128471, "grad_norm": 24.99473423702534, "learning_rate": 6.459383017107703e-06, "loss": 2.9615, "step": 1465 }, { "epoch": 1.397568823739721, "grad_norm": 18.111572175412476, "learning_rate": 6.454077044165879e-06, "loss": 3.066, "step": 1466 }, { "epoch": 1.3985222261947325, "grad_norm": 19.45457397227308, "learning_rate": 6.448769281852121e-06, "loss": 2.9329, "step": 1467 }, { "epoch": 1.3994756286497438, "grad_norm": 17.0221318855345, "learning_rate": 6.443459736698106e-06, "loss": 3.1271, "step": 1468 }, { "epoch": 1.4004290311047551, "grad_norm": 19.657888292865955, "learning_rate": 6.438148415237705e-06, "loss": 3.0522, "step": 1469 }, { "epoch": 1.4013824335597664, "grad_norm": 19.89429018321631, "learning_rate": 6.432835324006976e-06, "loss": 3.009, "step": 1470 }, { "epoch": 1.4023358360147777, "grad_norm": 17.890874151082087, "learning_rate": 6.427520469544149e-06, "loss": 3.0794, "step": 1471 }, { "epoch": 1.403289238469789, "grad_norm": 17.913445484260997, "learning_rate": 6.422203858389633e-06, "loss": 3.1167, "step": 1472 }, { "epoch": 1.4042426409248003, "grad_norm": 20.426933075680374, "learning_rate": 6.41688549708599e-06, "loss": 3.2247, "step": 1473 }, { "epoch": 1.4051960433798116, "grad_norm": 21.029424964333423, "learning_rate": 6.411565392177941e-06, "loss": 3.2002, "step": 1474 }, { "epoch": 1.406149445834823, "grad_norm": 18.215604816684973, "learning_rate": 6.406243550212351e-06, "loss": 3.0104, "step": 1475 }, { "epoch": 1.4071028482898345, "grad_norm": 19.225977826720577, "learning_rate": 6.400919977738222e-06, "loss": 3.0817, "step": 1476 }, { "epoch": 1.4080562507448457, "grad_norm": 25.11250146802718, "learning_rate": 6.395594681306689e-06, "loss": 3.2714, "step": 1477 }, { "epoch": 1.409009653199857, "grad_norm": 16.432069189349242, "learning_rate": 6.3902676674710055e-06, "loss": 3.2748, "step": 1478 }, { "epoch": 1.4099630556548683, "grad_norm": 17.363401776187246, "learning_rate": 6.384938942786535e-06, "loss": 3.2089, "step": 1479 }, { "epoch": 1.4109164581098796, "grad_norm": 21.788477390395013, "learning_rate": 6.379608513810753e-06, "loss": 2.9246, "step": 1480 }, { "epoch": 1.411869860564891, "grad_norm": 19.835716659173006, "learning_rate": 6.37427638710323e-06, "loss": 3.2194, "step": 1481 }, { "epoch": 1.4128232630199022, "grad_norm": 21.15263259095823, "learning_rate": 6.368942569225623e-06, "loss": 2.9745, "step": 1482 }, { "epoch": 1.4137766654749135, "grad_norm": 15.46053533763857, "learning_rate": 6.363607066741673e-06, "loss": 3.0679, "step": 1483 }, { "epoch": 1.4147300679299248, "grad_norm": 22.885449248155766, "learning_rate": 6.3582698862171945e-06, "loss": 3.0678, "step": 1484 }, { "epoch": 1.4156834703849364, "grad_norm": 18.94494254351994, "learning_rate": 6.3529310342200646e-06, "loss": 3.1085, "step": 1485 }, { "epoch": 1.4166368728399474, "grad_norm": 19.874483441773435, "learning_rate": 6.347590517320218e-06, "loss": 3.1229, "step": 1486 }, { "epoch": 1.417590275294959, "grad_norm": 15.91676831095615, "learning_rate": 6.342248342089641e-06, "loss": 3.2775, "step": 1487 }, { "epoch": 1.4185436777499703, "grad_norm": 21.23853894994185, "learning_rate": 6.336904515102355e-06, "loss": 3.255, "step": 1488 }, { "epoch": 1.4194970802049816, "grad_norm": 20.85650238838254, "learning_rate": 6.331559042934419e-06, "loss": 3.1454, "step": 1489 }, { "epoch": 1.4204504826599929, "grad_norm": 19.65061825369434, "learning_rate": 6.326211932163916e-06, "loss": 3.1808, "step": 1490 }, { "epoch": 1.4214038851150042, "grad_norm": 21.24445383298407, "learning_rate": 6.320863189370943e-06, "loss": 2.9718, "step": 1491 }, { "epoch": 1.4223572875700154, "grad_norm": 17.75595086219527, "learning_rate": 6.315512821137606e-06, "loss": 3.141, "step": 1492 }, { "epoch": 1.4233106900250267, "grad_norm": 22.783544498621538, "learning_rate": 6.310160834048015e-06, "loss": 3.1536, "step": 1493 }, { "epoch": 1.4242640924800383, "grad_norm": 23.25768358933233, "learning_rate": 6.304807234688266e-06, "loss": 3.1328, "step": 1494 }, { "epoch": 1.4252174949350493, "grad_norm": 18.067772668060016, "learning_rate": 6.299452029646442e-06, "loss": 3.1411, "step": 1495 }, { "epoch": 1.4261708973900609, "grad_norm": 18.161332220134046, "learning_rate": 6.294095225512604e-06, "loss": 3.1294, "step": 1496 }, { "epoch": 1.4271242998450722, "grad_norm": 18.67940808793551, "learning_rate": 6.288736828878779e-06, "loss": 3.1421, "step": 1497 }, { "epoch": 1.4280777023000835, "grad_norm": 22.295483485837572, "learning_rate": 6.283376846338951e-06, "loss": 3.2329, "step": 1498 }, { "epoch": 1.4290311047550948, "grad_norm": 20.316099107382126, "learning_rate": 6.2780152844890606e-06, "loss": 3.2335, "step": 1499 }, { "epoch": 1.429984507210106, "grad_norm": 18.546663216688763, "learning_rate": 6.272652149926989e-06, "loss": 3.2124, "step": 1500 }, { "epoch": 1.4309379096651174, "grad_norm": 16.859074857518692, "learning_rate": 6.267287449252553e-06, "loss": 3.3819, "step": 1501 }, { "epoch": 1.4318913121201287, "grad_norm": 22.378922251308644, "learning_rate": 6.261921189067496e-06, "loss": 2.9829, "step": 1502 }, { "epoch": 1.43284471457514, "grad_norm": 15.771820748646794, "learning_rate": 6.256553375975484e-06, "loss": 3.1875, "step": 1503 }, { "epoch": 1.4337981170301513, "grad_norm": 18.739039980968485, "learning_rate": 6.251184016582088e-06, "loss": 3.1909, "step": 1504 }, { "epoch": 1.4347515194851628, "grad_norm": 18.47016052093821, "learning_rate": 6.245813117494788e-06, "loss": 2.9188, "step": 1505 }, { "epoch": 1.435704921940174, "grad_norm": 16.230853403523426, "learning_rate": 6.240440685322953e-06, "loss": 3.0776, "step": 1506 }, { "epoch": 1.4366583243951854, "grad_norm": 20.7482295459544, "learning_rate": 6.235066726677845e-06, "loss": 3.0091, "step": 1507 }, { "epoch": 1.4376117268501967, "grad_norm": 18.29849898339147, "learning_rate": 6.229691248172599e-06, "loss": 2.8555, "step": 1508 }, { "epoch": 1.438565129305208, "grad_norm": 16.228150781157517, "learning_rate": 6.224314256422223e-06, "loss": 3.1095, "step": 1509 }, { "epoch": 1.4395185317602193, "grad_norm": 20.84460318144094, "learning_rate": 6.218935758043587e-06, "loss": 3.1542, "step": 1510 }, { "epoch": 1.4404719342152306, "grad_norm": 18.652574647183865, "learning_rate": 6.213555759655414e-06, "loss": 3.2722, "step": 1511 }, { "epoch": 1.4414253366702419, "grad_norm": 17.798653158057046, "learning_rate": 6.208174267878272e-06, "loss": 3.3582, "step": 1512 }, { "epoch": 1.4423787391252532, "grad_norm": 29.231425391213516, "learning_rate": 6.202791289334572e-06, "loss": 3.4391, "step": 1513 }, { "epoch": 1.4433321415802647, "grad_norm": 22.417217167327106, "learning_rate": 6.197406830648547e-06, "loss": 3.0461, "step": 1514 }, { "epoch": 1.4442855440352758, "grad_norm": 23.120022322267946, "learning_rate": 6.192020898446257e-06, "loss": 3.1443, "step": 1515 }, { "epoch": 1.4452389464902873, "grad_norm": 18.33059625549484, "learning_rate": 6.186633499355576e-06, "loss": 3.178, "step": 1516 }, { "epoch": 1.4461923489452986, "grad_norm": 19.14026328996276, "learning_rate": 6.181244640006174e-06, "loss": 3.1438, "step": 1517 }, { "epoch": 1.4471457514003099, "grad_norm": 20.281273484606746, "learning_rate": 6.175854327029532e-06, "loss": 3.1339, "step": 1518 }, { "epoch": 1.4480991538553212, "grad_norm": 17.975960812057423, "learning_rate": 6.170462567058909e-06, "loss": 3.0316, "step": 1519 }, { "epoch": 1.4490525563103325, "grad_norm": 19.19752536883608, "learning_rate": 6.165069366729347e-06, "loss": 3.1764, "step": 1520 }, { "epoch": 1.4500059587653438, "grad_norm": 23.14960415474261, "learning_rate": 6.159674732677665e-06, "loss": 3.2842, "step": 1521 }, { "epoch": 1.450959361220355, "grad_norm": 18.129483478648044, "learning_rate": 6.154278671542441e-06, "loss": 3.2315, "step": 1522 }, { "epoch": 1.4519127636753666, "grad_norm": 20.10633254174716, "learning_rate": 6.14888118996401e-06, "loss": 3.2603, "step": 1523 }, { "epoch": 1.4528661661303777, "grad_norm": 18.2463902593885, "learning_rate": 6.143482294584459e-06, "loss": 3.0003, "step": 1524 }, { "epoch": 1.4538195685853892, "grad_norm": 24.945350338654173, "learning_rate": 6.13808199204761e-06, "loss": 3.1492, "step": 1525 }, { "epoch": 1.4547729710404005, "grad_norm": 18.052991177434613, "learning_rate": 6.132680288999019e-06, "loss": 3.2909, "step": 1526 }, { "epoch": 1.4557263734954118, "grad_norm": 21.169147694426066, "learning_rate": 6.127277192085963e-06, "loss": 3.1401, "step": 1527 }, { "epoch": 1.456679775950423, "grad_norm": 21.525219980790418, "learning_rate": 6.121872707957441e-06, "loss": 2.9297, "step": 1528 }, { "epoch": 1.4576331784054344, "grad_norm": 19.269863861066426, "learning_rate": 6.1164668432641505e-06, "loss": 3.1752, "step": 1529 }, { "epoch": 1.4585865808604457, "grad_norm": 16.925707313570893, "learning_rate": 6.111059604658492e-06, "loss": 3.0606, "step": 1530 }, { "epoch": 1.459539983315457, "grad_norm": 21.552909210207527, "learning_rate": 6.10565099879456e-06, "loss": 3.0817, "step": 1531 }, { "epoch": 1.4604933857704683, "grad_norm": 21.144244353030096, "learning_rate": 6.100241032328125e-06, "loss": 3.0676, "step": 1532 }, { "epoch": 1.4614467882254796, "grad_norm": 18.671306779620885, "learning_rate": 6.094829711916633e-06, "loss": 3.1848, "step": 1533 }, { "epoch": 1.462400190680491, "grad_norm": 22.347058344100006, "learning_rate": 6.089417044219202e-06, "loss": 3.3774, "step": 1534 }, { "epoch": 1.4633535931355024, "grad_norm": 21.871165976274124, "learning_rate": 6.084003035896604e-06, "loss": 3.0977, "step": 1535 }, { "epoch": 1.4643069955905137, "grad_norm": 18.77987402619343, "learning_rate": 6.078587693611258e-06, "loss": 3.0878, "step": 1536 }, { "epoch": 1.465260398045525, "grad_norm": 19.378399263629987, "learning_rate": 6.073171024027227e-06, "loss": 3.2622, "step": 1537 }, { "epoch": 1.4662138005005363, "grad_norm": 23.381486844784128, "learning_rate": 6.067753033810211e-06, "loss": 3.3481, "step": 1538 }, { "epoch": 1.4671672029555476, "grad_norm": 18.15637325401286, "learning_rate": 6.06233372962753e-06, "loss": 3.2573, "step": 1539 }, { "epoch": 1.4681206054105589, "grad_norm": 16.539594691766872, "learning_rate": 6.056913118148122e-06, "loss": 3.3792, "step": 1540 }, { "epoch": 1.4690740078655702, "grad_norm": 17.838265604306482, "learning_rate": 6.0514912060425355e-06, "loss": 3.1078, "step": 1541 }, { "epoch": 1.4700274103205815, "grad_norm": 19.390583205898977, "learning_rate": 6.04606799998292e-06, "loss": 3.1382, "step": 1542 }, { "epoch": 1.470980812775593, "grad_norm": 18.14300876657298, "learning_rate": 6.040643506643013e-06, "loss": 3.1879, "step": 1543 }, { "epoch": 1.471934215230604, "grad_norm": 20.401864853134523, "learning_rate": 6.035217732698141e-06, "loss": 3.3831, "step": 1544 }, { "epoch": 1.4728876176856156, "grad_norm": 19.31481277617167, "learning_rate": 6.029790684825203e-06, "loss": 3.1598, "step": 1545 }, { "epoch": 1.473841020140627, "grad_norm": 18.781672469238234, "learning_rate": 6.0243623697026685e-06, "loss": 3.0854, "step": 1546 }, { "epoch": 1.4747944225956382, "grad_norm": 18.423689729898292, "learning_rate": 6.018932794010564e-06, "loss": 2.9887, "step": 1547 }, { "epoch": 1.4757478250506495, "grad_norm": 24.432462504477968, "learning_rate": 6.013501964430468e-06, "loss": 3.2472, "step": 1548 }, { "epoch": 1.4767012275056608, "grad_norm": 21.53221960835849, "learning_rate": 6.008069887645504e-06, "loss": 3.2986, "step": 1549 }, { "epoch": 1.477654629960672, "grad_norm": 18.022070603551374, "learning_rate": 6.002636570340328e-06, "loss": 2.929, "step": 1550 }, { "epoch": 1.4786080324156834, "grad_norm": 16.911995555086357, "learning_rate": 5.997202019201123e-06, "loss": 3.2112, "step": 1551 }, { "epoch": 1.479561434870695, "grad_norm": 20.799452968118025, "learning_rate": 5.9917662409155896e-06, "loss": 3.1625, "step": 1552 }, { "epoch": 1.480514837325706, "grad_norm": 18.353722561069265, "learning_rate": 5.98632924217294e-06, "loss": 3.1616, "step": 1553 }, { "epoch": 1.4814682397807175, "grad_norm": 17.93258151733347, "learning_rate": 5.98089102966389e-06, "loss": 3.2354, "step": 1554 }, { "epoch": 1.4824216422357288, "grad_norm": 20.144669950557546, "learning_rate": 5.975451610080643e-06, "loss": 3.0839, "step": 1555 }, { "epoch": 1.48337504469074, "grad_norm": 19.051975014140673, "learning_rate": 5.970010990116892e-06, "loss": 3.0831, "step": 1556 }, { "epoch": 1.4843284471457514, "grad_norm": 19.448978819465175, "learning_rate": 5.964569176467812e-06, "loss": 3.2673, "step": 1557 }, { "epoch": 1.4852818496007627, "grad_norm": 19.81458561755394, "learning_rate": 5.959126175830033e-06, "loss": 3.2129, "step": 1558 }, { "epoch": 1.486235252055774, "grad_norm": 18.75610956220143, "learning_rate": 5.953681994901662e-06, "loss": 3.0543, "step": 1559 }, { "epoch": 1.4871886545107853, "grad_norm": 17.945923310302828, "learning_rate": 5.948236640382249e-06, "loss": 3.187, "step": 1560 }, { "epoch": 1.4881420569657968, "grad_norm": 16.492849701812734, "learning_rate": 5.942790118972787e-06, "loss": 3.085, "step": 1561 }, { "epoch": 1.489095459420808, "grad_norm": 18.537753509206873, "learning_rate": 5.93734243737571e-06, "loss": 3.0088, "step": 1562 }, { "epoch": 1.4900488618758194, "grad_norm": 18.57248363470216, "learning_rate": 5.93189360229488e-06, "loss": 3.3286, "step": 1563 }, { "epoch": 1.4910022643308307, "grad_norm": 21.91523656285753, "learning_rate": 5.926443620435572e-06, "loss": 3.0095, "step": 1564 }, { "epoch": 1.491955666785842, "grad_norm": 16.521790634873717, "learning_rate": 5.9209924985044795e-06, "loss": 3.2791, "step": 1565 }, { "epoch": 1.4929090692408533, "grad_norm": 19.249110404042472, "learning_rate": 5.915540243209694e-06, "loss": 3.0508, "step": 1566 }, { "epoch": 1.4938624716958646, "grad_norm": 22.46596741410952, "learning_rate": 5.910086861260707e-06, "loss": 3.0657, "step": 1567 }, { "epoch": 1.494815874150876, "grad_norm": 17.069758613494212, "learning_rate": 5.904632359368388e-06, "loss": 3.3724, "step": 1568 }, { "epoch": 1.4957692766058872, "grad_norm": 20.61727685050691, "learning_rate": 5.899176744244992e-06, "loss": 3.0529, "step": 1569 }, { "epoch": 1.4967226790608985, "grad_norm": 20.837881561862943, "learning_rate": 5.893720022604144e-06, "loss": 3.3218, "step": 1570 }, { "epoch": 1.4976760815159098, "grad_norm": 20.57119852960263, "learning_rate": 5.888262201160824e-06, "loss": 2.9021, "step": 1571 }, { "epoch": 1.4986294839709213, "grad_norm": 17.976076864059923, "learning_rate": 5.8828032866313725e-06, "loss": 3.3223, "step": 1572 }, { "epoch": 1.4995828864259326, "grad_norm": 22.93982285805392, "learning_rate": 5.877343285733472e-06, "loss": 3.3407, "step": 1573 }, { "epoch": 1.500536288880944, "grad_norm": 21.79766293619824, "learning_rate": 5.871882205186142e-06, "loss": 3.2854, "step": 1574 }, { "epoch": 1.5014896913359552, "grad_norm": 20.547070589639148, "learning_rate": 5.86642005170973e-06, "loss": 3.0499, "step": 1575 }, { "epoch": 1.5024430937909665, "grad_norm": 20.263799614229526, "learning_rate": 5.860956832025907e-06, "loss": 3.155, "step": 1576 }, { "epoch": 1.5033964962459778, "grad_norm": 22.193498674033258, "learning_rate": 5.8554925528576515e-06, "loss": 3.2535, "step": 1577 }, { "epoch": 1.5043498987009891, "grad_norm": 19.45122675372658, "learning_rate": 5.8500272209292485e-06, "loss": 3.1696, "step": 1578 }, { "epoch": 1.5053033011560006, "grad_norm": 22.841941497573067, "learning_rate": 5.844560842966278e-06, "loss": 3.2486, "step": 1579 }, { "epoch": 1.5062567036110117, "grad_norm": 20.0528769942216, "learning_rate": 5.839093425695609e-06, "loss": 2.936, "step": 1580 }, { "epoch": 1.5072101060660232, "grad_norm": 19.610855723975, "learning_rate": 5.833624975845385e-06, "loss": 3.007, "step": 1581 }, { "epoch": 1.5081635085210343, "grad_norm": 18.96651977118145, "learning_rate": 5.828155500145025e-06, "loss": 2.9578, "step": 1582 }, { "epoch": 1.5091169109760458, "grad_norm": 20.04303969838615, "learning_rate": 5.822685005325208e-06, "loss": 3.2993, "step": 1583 }, { "epoch": 1.5100703134310571, "grad_norm": 22.106485328739648, "learning_rate": 5.817213498117866e-06, "loss": 3.1252, "step": 1584 }, { "epoch": 1.5110237158860684, "grad_norm": 26.23995504669688, "learning_rate": 5.81174098525618e-06, "loss": 3.3123, "step": 1585 }, { "epoch": 1.5119771183410797, "grad_norm": 19.991339651583182, "learning_rate": 5.806267473474567e-06, "loss": 3.2596, "step": 1586 }, { "epoch": 1.512930520796091, "grad_norm": 20.29303222657534, "learning_rate": 5.800792969508672e-06, "loss": 3.0835, "step": 1587 }, { "epoch": 1.5138839232511023, "grad_norm": 19.009021239949856, "learning_rate": 5.795317480095361e-06, "loss": 3.0796, "step": 1588 }, { "epoch": 1.5148373257061136, "grad_norm": 21.394818146542544, "learning_rate": 5.7898410119727155e-06, "loss": 3.0823, "step": 1589 }, { "epoch": 1.5157907281611251, "grad_norm": 15.085351184076178, "learning_rate": 5.784363571880021e-06, "loss": 2.8279, "step": 1590 }, { "epoch": 1.5167441306161362, "grad_norm": 19.554028749948277, "learning_rate": 5.778885166557753e-06, "loss": 3.0913, "step": 1591 }, { "epoch": 1.5176975330711477, "grad_norm": 19.695110027674904, "learning_rate": 5.773405802747585e-06, "loss": 3.1397, "step": 1592 }, { "epoch": 1.5186509355261588, "grad_norm": 17.28969411503337, "learning_rate": 5.7679254871923604e-06, "loss": 2.9424, "step": 1593 }, { "epoch": 1.5196043379811703, "grad_norm": 16.66143843485345, "learning_rate": 5.7624442266361e-06, "loss": 2.9524, "step": 1594 }, { "epoch": 1.5205577404361816, "grad_norm": 18.242133782564977, "learning_rate": 5.7569620278239866e-06, "loss": 2.8628, "step": 1595 }, { "epoch": 1.521511142891193, "grad_norm": 19.16234598619388, "learning_rate": 5.751478897502353e-06, "loss": 3.0473, "step": 1596 }, { "epoch": 1.5224645453462042, "grad_norm": 20.258555234620932, "learning_rate": 5.745994842418683e-06, "loss": 3.159, "step": 1597 }, { "epoch": 1.5234179478012155, "grad_norm": 20.335776956959535, "learning_rate": 5.740509869321601e-06, "loss": 3.4127, "step": 1598 }, { "epoch": 1.524371350256227, "grad_norm": 20.47551705535837, "learning_rate": 5.735023984960851e-06, "loss": 3.0921, "step": 1599 }, { "epoch": 1.5253247527112381, "grad_norm": 20.516765681882877, "learning_rate": 5.729537196087309e-06, "loss": 3.1448, "step": 1600 }, { "epoch": 1.5262781551662497, "grad_norm": 18.344954194835854, "learning_rate": 5.724049509452959e-06, "loss": 3.3764, "step": 1601 }, { "epoch": 1.5272315576212607, "grad_norm": 21.873345847835516, "learning_rate": 5.718560931810888e-06, "loss": 3.1164, "step": 1602 }, { "epoch": 1.5281849600762722, "grad_norm": 18.883618564185497, "learning_rate": 5.713071469915285e-06, "loss": 3.4731, "step": 1603 }, { "epoch": 1.5291383625312835, "grad_norm": 19.417982014619184, "learning_rate": 5.707581130521424e-06, "loss": 3.3107, "step": 1604 }, { "epoch": 1.5300917649862948, "grad_norm": 17.09684213489698, "learning_rate": 5.702089920385657e-06, "loss": 3.0919, "step": 1605 }, { "epoch": 1.5310451674413061, "grad_norm": 18.11633164077511, "learning_rate": 5.696597846265412e-06, "loss": 3.1984, "step": 1606 }, { "epoch": 1.5319985698963174, "grad_norm": 17.79737867720799, "learning_rate": 5.691104914919173e-06, "loss": 3.112, "step": 1607 }, { "epoch": 1.532951972351329, "grad_norm": 18.438179913378214, "learning_rate": 5.685611133106491e-06, "loss": 3.0489, "step": 1608 }, { "epoch": 1.53390537480634, "grad_norm": 20.227823464441823, "learning_rate": 5.680116507587949e-06, "loss": 3.0274, "step": 1609 }, { "epoch": 1.5348587772613516, "grad_norm": 19.405204187920557, "learning_rate": 5.67462104512518e-06, "loss": 3.1557, "step": 1610 }, { "epoch": 1.5358121797163626, "grad_norm": 18.661187274469974, "learning_rate": 5.669124752480842e-06, "loss": 3.1062, "step": 1611 }, { "epoch": 1.5367655821713742, "grad_norm": 22.783103130572766, "learning_rate": 5.663627636418611e-06, "loss": 3.0797, "step": 1612 }, { "epoch": 1.5377189846263855, "grad_norm": 21.22754889195201, "learning_rate": 5.658129703703184e-06, "loss": 3.1257, "step": 1613 }, { "epoch": 1.5386723870813968, "grad_norm": 16.93502467089766, "learning_rate": 5.65263096110026e-06, "loss": 3.291, "step": 1614 }, { "epoch": 1.539625789536408, "grad_norm": 17.307738880415016, "learning_rate": 5.647131415376529e-06, "loss": 3.3571, "step": 1615 }, { "epoch": 1.5405791919914193, "grad_norm": 20.079474339916892, "learning_rate": 5.64163107329968e-06, "loss": 3.3866, "step": 1616 }, { "epoch": 1.5415325944464306, "grad_norm": 22.42765724896311, "learning_rate": 5.636129941638373e-06, "loss": 2.9606, "step": 1617 }, { "epoch": 1.542485996901442, "grad_norm": 20.62332452895825, "learning_rate": 5.630628027162244e-06, "loss": 3.1164, "step": 1618 }, { "epoch": 1.5434393993564535, "grad_norm": 16.613511311310933, "learning_rate": 5.625125336641889e-06, "loss": 3.0404, "step": 1619 }, { "epoch": 1.5443928018114645, "grad_norm": 18.565377197861974, "learning_rate": 5.619621876848864e-06, "loss": 3.0939, "step": 1620 }, { "epoch": 1.545346204266476, "grad_norm": 14.272646575020167, "learning_rate": 5.614117654555666e-06, "loss": 3.1077, "step": 1621 }, { "epoch": 1.5462996067214874, "grad_norm": 19.357867915674095, "learning_rate": 5.608612676535736e-06, "loss": 3.2214, "step": 1622 }, { "epoch": 1.5472530091764987, "grad_norm": 21.374528230368036, "learning_rate": 5.603106949563441e-06, "loss": 3.2032, "step": 1623 }, { "epoch": 1.54820641163151, "grad_norm": 19.71978870410121, "learning_rate": 5.597600480414069e-06, "loss": 3.1292, "step": 1624 }, { "epoch": 1.5491598140865213, "grad_norm": 28.15085966318026, "learning_rate": 5.592093275863825e-06, "loss": 3.2007, "step": 1625 }, { "epoch": 1.5501132165415326, "grad_norm": 24.09891474971749, "learning_rate": 5.586585342689817e-06, "loss": 3.2056, "step": 1626 }, { "epoch": 1.5510666189965439, "grad_norm": 21.482498594440536, "learning_rate": 5.581076687670051e-06, "loss": 3.112, "step": 1627 }, { "epoch": 1.5520200214515554, "grad_norm": 16.986135857928563, "learning_rate": 5.575567317583415e-06, "loss": 3.0982, "step": 1628 }, { "epoch": 1.5529734239065665, "grad_norm": 16.99745006549954, "learning_rate": 5.570057239209687e-06, "loss": 3.1129, "step": 1629 }, { "epoch": 1.553926826361578, "grad_norm": 17.48265382358015, "learning_rate": 5.564546459329509e-06, "loss": 2.9809, "step": 1630 }, { "epoch": 1.554880228816589, "grad_norm": 20.52114744521535, "learning_rate": 5.559034984724392e-06, "loss": 3.0399, "step": 1631 }, { "epoch": 1.5558336312716006, "grad_norm": 17.715027780234134, "learning_rate": 5.553522822176694e-06, "loss": 3.1288, "step": 1632 }, { "epoch": 1.5567870337266119, "grad_norm": 19.138466233538146, "learning_rate": 5.548009978469627e-06, "loss": 3.0979, "step": 1633 }, { "epoch": 1.5577404361816232, "grad_norm": 18.715775378414172, "learning_rate": 5.542496460387239e-06, "loss": 3.0001, "step": 1634 }, { "epoch": 1.5586938386366345, "grad_norm": 17.153481116809754, "learning_rate": 5.536982274714405e-06, "loss": 3.5096, "step": 1635 }, { "epoch": 1.5596472410916458, "grad_norm": 19.206885084379145, "learning_rate": 5.531467428236827e-06, "loss": 3.1718, "step": 1636 }, { "epoch": 1.5606006435466573, "grad_norm": 18.720418888786615, "learning_rate": 5.5259519277410165e-06, "loss": 3.1908, "step": 1637 }, { "epoch": 1.5615540460016684, "grad_norm": 20.484092959848592, "learning_rate": 5.520435780014288e-06, "loss": 3.1293, "step": 1638 }, { "epoch": 1.5625074484566799, "grad_norm": 21.42220061571147, "learning_rate": 5.514918991844759e-06, "loss": 3.2787, "step": 1639 }, { "epoch": 1.563460850911691, "grad_norm": 22.822144663731876, "learning_rate": 5.5094015700213254e-06, "loss": 2.8799, "step": 1640 }, { "epoch": 1.5644142533667025, "grad_norm": 16.53922231654449, "learning_rate": 5.503883521333674e-06, "loss": 3.0827, "step": 1641 }, { "epoch": 1.5653676558217138, "grad_norm": 17.48692811397963, "learning_rate": 5.498364852572255e-06, "loss": 3.1783, "step": 1642 }, { "epoch": 1.566321058276725, "grad_norm": 17.829702958242652, "learning_rate": 5.492845570528284e-06, "loss": 2.9319, "step": 1643 }, { "epoch": 1.5672744607317364, "grad_norm": 18.281493015653663, "learning_rate": 5.4873256819937325e-06, "loss": 3.1341, "step": 1644 }, { "epoch": 1.5682278631867477, "grad_norm": 19.77306520382642, "learning_rate": 5.481805193761316e-06, "loss": 2.956, "step": 1645 }, { "epoch": 1.5691812656417592, "grad_norm": 16.92397278307526, "learning_rate": 5.4762841126244915e-06, "loss": 3.0352, "step": 1646 }, { "epoch": 1.5701346680967703, "grad_norm": 17.926160895344292, "learning_rate": 5.470762445377442e-06, "loss": 3.0416, "step": 1647 }, { "epoch": 1.5710880705517818, "grad_norm": 22.127270651599357, "learning_rate": 5.465240198815073e-06, "loss": 2.8989, "step": 1648 }, { "epoch": 1.5720414730067929, "grad_norm": 17.835174713720995, "learning_rate": 5.4597173797330046e-06, "loss": 3.2399, "step": 1649 }, { "epoch": 1.5729948754618044, "grad_norm": 18.770190462552055, "learning_rate": 5.454193994927557e-06, "loss": 3.0145, "step": 1650 }, { "epoch": 1.5739482779168157, "grad_norm": 18.79243122780387, "learning_rate": 5.448670051195753e-06, "loss": 3.212, "step": 1651 }, { "epoch": 1.574901680371827, "grad_norm": 23.295998232354886, "learning_rate": 5.443145555335296e-06, "loss": 3.104, "step": 1652 }, { "epoch": 1.5758550828268383, "grad_norm": 23.47573425101132, "learning_rate": 5.437620514144575e-06, "loss": 2.955, "step": 1653 }, { "epoch": 1.5768084852818496, "grad_norm": 17.61545860294147, "learning_rate": 5.432094934422648e-06, "loss": 3.1775, "step": 1654 }, { "epoch": 1.5777618877368609, "grad_norm": 17.18598281766889, "learning_rate": 5.426568822969235e-06, "loss": 2.9254, "step": 1655 }, { "epoch": 1.5787152901918722, "grad_norm": 17.933895645464432, "learning_rate": 5.421042186584708e-06, "loss": 3.2909, "step": 1656 }, { "epoch": 1.5796686926468837, "grad_norm": 20.205522165733367, "learning_rate": 5.415515032070092e-06, "loss": 3.1941, "step": 1657 }, { "epoch": 1.5806220951018948, "grad_norm": 23.615749423170552, "learning_rate": 5.409987366227043e-06, "loss": 3.2383, "step": 1658 }, { "epoch": 1.5815754975569063, "grad_norm": 16.732791681780977, "learning_rate": 5.404459195857849e-06, "loss": 3.1785, "step": 1659 }, { "epoch": 1.5825289000119174, "grad_norm": 17.75404051653563, "learning_rate": 5.398930527765416e-06, "loss": 3.1391, "step": 1660 }, { "epoch": 1.583482302466929, "grad_norm": 17.428964863066078, "learning_rate": 5.393401368753268e-06, "loss": 3.196, "step": 1661 }, { "epoch": 1.5844357049219402, "grad_norm": 18.818397043760157, "learning_rate": 5.38787172562553e-06, "loss": 3.144, "step": 1662 }, { "epoch": 1.5853891073769515, "grad_norm": 16.67789053166813, "learning_rate": 5.38234160518692e-06, "loss": 3.0108, "step": 1663 }, { "epoch": 1.5863425098319628, "grad_norm": 26.56863529729854, "learning_rate": 5.376811014242749e-06, "loss": 3.1743, "step": 1664 }, { "epoch": 1.587295912286974, "grad_norm": 20.348783636666532, "learning_rate": 5.371279959598903e-06, "loss": 3.2888, "step": 1665 }, { "epoch": 1.5882493147419856, "grad_norm": 16.92673847659394, "learning_rate": 5.365748448061838e-06, "loss": 3.0275, "step": 1666 }, { "epoch": 1.5892027171969967, "grad_norm": 20.213057667390157, "learning_rate": 5.360216486438577e-06, "loss": 3.2196, "step": 1667 }, { "epoch": 1.5901561196520082, "grad_norm": 18.637022510623215, "learning_rate": 5.354684081536693e-06, "loss": 3.3261, "step": 1668 }, { "epoch": 1.5911095221070193, "grad_norm": 19.505223397096998, "learning_rate": 5.349151240164303e-06, "loss": 2.9461, "step": 1669 }, { "epoch": 1.5920629245620308, "grad_norm": 17.153193049469845, "learning_rate": 5.343617969130067e-06, "loss": 3.1023, "step": 1670 }, { "epoch": 1.593016327017042, "grad_norm": 18.222193961802827, "learning_rate": 5.338084275243168e-06, "loss": 3.0737, "step": 1671 }, { "epoch": 1.5939697294720534, "grad_norm": 18.354304537024632, "learning_rate": 5.332550165313312e-06, "loss": 3.141, "step": 1672 }, { "epoch": 1.5949231319270647, "grad_norm": 15.862420375743847, "learning_rate": 5.327015646150716e-06, "loss": 3.3977, "step": 1673 }, { "epoch": 1.595876534382076, "grad_norm": 21.211297785029696, "learning_rate": 5.3214807245661015e-06, "loss": 3.0955, "step": 1674 }, { "epoch": 1.5968299368370875, "grad_norm": 16.639231699986986, "learning_rate": 5.3159454073706865e-06, "loss": 3.0033, "step": 1675 }, { "epoch": 1.5977833392920986, "grad_norm": 18.120015305822548, "learning_rate": 5.31040970137617e-06, "loss": 3.1909, "step": 1676 }, { "epoch": 1.5987367417471101, "grad_norm": 28.179815652395984, "learning_rate": 5.304873613394739e-06, "loss": 3.1695, "step": 1677 }, { "epoch": 1.5996901442021212, "grad_norm": 19.762827821246322, "learning_rate": 5.2993371502390425e-06, "loss": 3.1515, "step": 1678 }, { "epoch": 1.6006435466571327, "grad_norm": 22.130484523082302, "learning_rate": 5.293800318722192e-06, "loss": 3.0176, "step": 1679 }, { "epoch": 1.601596949112144, "grad_norm": 19.957069910929288, "learning_rate": 5.288263125657757e-06, "loss": 3.3948, "step": 1680 }, { "epoch": 1.6025503515671553, "grad_norm": 21.284934440109634, "learning_rate": 5.282725577859749e-06, "loss": 2.9131, "step": 1681 }, { "epoch": 1.6035037540221666, "grad_norm": 20.25212590493311, "learning_rate": 5.277187682142615e-06, "loss": 3.0659, "step": 1682 }, { "epoch": 1.604457156477178, "grad_norm": 21.50572533509346, "learning_rate": 5.271649445321231e-06, "loss": 3.0302, "step": 1683 }, { "epoch": 1.6054105589321892, "grad_norm": 17.235432632974554, "learning_rate": 5.266110874210893e-06, "loss": 3.0987, "step": 1684 }, { "epoch": 1.6063639613872005, "grad_norm": 18.04825396892589, "learning_rate": 5.260571975627311e-06, "loss": 3.1758, "step": 1685 }, { "epoch": 1.607317363842212, "grad_norm": 17.1976677440515, "learning_rate": 5.255032756386592e-06, "loss": 3.1504, "step": 1686 }, { "epoch": 1.608270766297223, "grad_norm": 18.606551126137504, "learning_rate": 5.249493223305244e-06, "loss": 3.2577, "step": 1687 }, { "epoch": 1.6092241687522346, "grad_norm": 19.920539199379505, "learning_rate": 5.2439533832001565e-06, "loss": 3.2534, "step": 1688 }, { "epoch": 1.6101775712072457, "grad_norm": 17.611207159502523, "learning_rate": 5.238413242888597e-06, "loss": 3.1694, "step": 1689 }, { "epoch": 1.6111309736622572, "grad_norm": 20.172341502346704, "learning_rate": 5.2328728091882084e-06, "loss": 3.2732, "step": 1690 }, { "epoch": 1.6120843761172685, "grad_norm": 20.52982087518913, "learning_rate": 5.2273320889169854e-06, "loss": 3.132, "step": 1691 }, { "epoch": 1.6130377785722798, "grad_norm": 20.386278345973743, "learning_rate": 5.221791088893282e-06, "loss": 3.1614, "step": 1692 }, { "epoch": 1.613991181027291, "grad_norm": 16.52677522168969, "learning_rate": 5.216249815935798e-06, "loss": 3.17, "step": 1693 }, { "epoch": 1.6149445834823024, "grad_norm": 18.129959271234448, "learning_rate": 5.21070827686356e-06, "loss": 2.9972, "step": 1694 }, { "epoch": 1.615897985937314, "grad_norm": 18.21328905182513, "learning_rate": 5.20516647849593e-06, "loss": 3.1096, "step": 1695 }, { "epoch": 1.616851388392325, "grad_norm": 19.97424640429797, "learning_rate": 5.199624427652589e-06, "loss": 3.1482, "step": 1696 }, { "epoch": 1.6178047908473365, "grad_norm": 16.49860833935, "learning_rate": 5.194082131153523e-06, "loss": 3.3128, "step": 1697 }, { "epoch": 1.6187581933023476, "grad_norm": 20.168889771529138, "learning_rate": 5.188539595819027e-06, "loss": 3.2307, "step": 1698 }, { "epoch": 1.6197115957573591, "grad_norm": 21.61008010085369, "learning_rate": 5.182996828469684e-06, "loss": 3.3223, "step": 1699 }, { "epoch": 1.6206649982123704, "grad_norm": 20.40627740588734, "learning_rate": 5.177453835926366e-06, "loss": 3.1888, "step": 1700 }, { "epoch": 1.6216184006673817, "grad_norm": 18.050043515093044, "learning_rate": 5.171910625010223e-06, "loss": 3.1781, "step": 1701 }, { "epoch": 1.622571803122393, "grad_norm": 16.74717373031955, "learning_rate": 5.166367202542672e-06, "loss": 3.0452, "step": 1702 }, { "epoch": 1.6235252055774043, "grad_norm": 18.93905306982282, "learning_rate": 5.160823575345388e-06, "loss": 3.1393, "step": 1703 }, { "epoch": 1.6244786080324158, "grad_norm": 21.958153736082917, "learning_rate": 5.155279750240302e-06, "loss": 3.0884, "step": 1704 }, { "epoch": 1.625432010487427, "grad_norm": 23.466228710901895, "learning_rate": 5.149735734049588e-06, "loss": 3.0984, "step": 1705 }, { "epoch": 1.6263854129424384, "grad_norm": 17.14914711635277, "learning_rate": 5.1441915335956525e-06, "loss": 3.34, "step": 1706 }, { "epoch": 1.6273388153974495, "grad_norm": 22.90877352729242, "learning_rate": 5.13864715570113e-06, "loss": 3.0759, "step": 1707 }, { "epoch": 1.628292217852461, "grad_norm": 20.90977537882566, "learning_rate": 5.133102607188875e-06, "loss": 3.2285, "step": 1708 }, { "epoch": 1.6292456203074723, "grad_norm": 17.692668453900772, "learning_rate": 5.127557894881949e-06, "loss": 3.1216, "step": 1709 }, { "epoch": 1.6301990227624836, "grad_norm": 18.928369565310717, "learning_rate": 5.122013025603618e-06, "loss": 3.3224, "step": 1710 }, { "epoch": 1.631152425217495, "grad_norm": 16.333351176538283, "learning_rate": 5.116468006177341e-06, "loss": 3.1994, "step": 1711 }, { "epoch": 1.6321058276725062, "grad_norm": 21.86457655935642, "learning_rate": 5.1109228434267585e-06, "loss": 2.9659, "step": 1712 }, { "epoch": 1.6330592301275175, "grad_norm": 24.87712128204884, "learning_rate": 5.105377544175692e-06, "loss": 3.1048, "step": 1713 }, { "epoch": 1.6340126325825288, "grad_norm": 18.809245893248576, "learning_rate": 5.0998321152481235e-06, "loss": 3.1384, "step": 1714 }, { "epoch": 1.6349660350375403, "grad_norm": 19.610055389274475, "learning_rate": 5.094286563468205e-06, "loss": 3.2073, "step": 1715 }, { "epoch": 1.6359194374925514, "grad_norm": 19.026648508022753, "learning_rate": 5.0887408956602316e-06, "loss": 3.1055, "step": 1716 }, { "epoch": 1.636872839947563, "grad_norm": 21.48117268553624, "learning_rate": 5.083195118648644e-06, "loss": 3.1636, "step": 1717 }, { "epoch": 1.6378262424025742, "grad_norm": 21.11645472077021, "learning_rate": 5.077649239258018e-06, "loss": 3.0797, "step": 1718 }, { "epoch": 1.6387796448575855, "grad_norm": 20.135053344955193, "learning_rate": 5.072103264313054e-06, "loss": 2.9793, "step": 1719 }, { "epoch": 1.6397330473125968, "grad_norm": 17.64189501563371, "learning_rate": 5.06655720063857e-06, "loss": 3.0508, "step": 1720 }, { "epoch": 1.6406864497676081, "grad_norm": 17.127919700968405, "learning_rate": 5.061011055059496e-06, "loss": 3.0117, "step": 1721 }, { "epoch": 1.6416398522226194, "grad_norm": 17.193746842431754, "learning_rate": 5.055464834400856e-06, "loss": 3.3344, "step": 1722 }, { "epoch": 1.6425932546776307, "grad_norm": 21.28141801950472, "learning_rate": 5.049918545487775e-06, "loss": 3.0891, "step": 1723 }, { "epoch": 1.6435466571326423, "grad_norm": 21.492567237926945, "learning_rate": 5.044372195145455e-06, "loss": 3.3401, "step": 1724 }, { "epoch": 1.6445000595876533, "grad_norm": 21.00248210357767, "learning_rate": 5.03882579019918e-06, "loss": 3.3948, "step": 1725 }, { "epoch": 1.6454534620426648, "grad_norm": 19.722187190663167, "learning_rate": 5.033279337474295e-06, "loss": 3.2024, "step": 1726 }, { "epoch": 1.646406864497676, "grad_norm": 20.732803296047802, "learning_rate": 5.027732843796206e-06, "loss": 3.302, "step": 1727 }, { "epoch": 1.6473602669526874, "grad_norm": 18.072507286334734, "learning_rate": 5.022186315990371e-06, "loss": 3.308, "step": 1728 }, { "epoch": 1.6483136694076987, "grad_norm": 21.057324023094967, "learning_rate": 5.01663976088229e-06, "loss": 3.335, "step": 1729 }, { "epoch": 1.64926707186271, "grad_norm": 23.039938889389806, "learning_rate": 5.011093185297491e-06, "loss": 3.0338, "step": 1730 }, { "epoch": 1.6502204743177213, "grad_norm": 19.303217146793756, "learning_rate": 5.005546596061539e-06, "loss": 3.237, "step": 1731 }, { "epoch": 1.6511738767727326, "grad_norm": 21.251615519815008, "learning_rate": 5e-06, "loss": 3.2445, "step": 1732 }, { "epoch": 1.6521272792277442, "grad_norm": 17.96159960788823, "learning_rate": 4.994453403938463e-06, "loss": 3.2607, "step": 1733 }, { "epoch": 1.6530806816827552, "grad_norm": 16.62004609471517, "learning_rate": 4.988906814702509e-06, "loss": 2.984, "step": 1734 }, { "epoch": 1.6540340841377668, "grad_norm": 18.670214972234074, "learning_rate": 4.983360239117711e-06, "loss": 3.2298, "step": 1735 }, { "epoch": 1.6549874865927778, "grad_norm": 18.628410841741697, "learning_rate": 4.97781368400963e-06, "loss": 3.4686, "step": 1736 }, { "epoch": 1.6559408890477894, "grad_norm": 20.725989348737524, "learning_rate": 4.972267156203796e-06, "loss": 3.2756, "step": 1737 }, { "epoch": 1.6568942915028007, "grad_norm": 20.998677902962744, "learning_rate": 4.966720662525707e-06, "loss": 3.1399, "step": 1738 }, { "epoch": 1.657847693957812, "grad_norm": 20.377766117284395, "learning_rate": 4.961174209800821e-06, "loss": 3.2982, "step": 1739 }, { "epoch": 1.6588010964128233, "grad_norm": 18.030164433791203, "learning_rate": 4.9556278048545445e-06, "loss": 3.1001, "step": 1740 }, { "epoch": 1.6597544988678345, "grad_norm": 22.38402332732066, "learning_rate": 4.950081454512226e-06, "loss": 3.2193, "step": 1741 }, { "epoch": 1.660707901322846, "grad_norm": 21.73914230341786, "learning_rate": 4.9445351655991465e-06, "loss": 2.9059, "step": 1742 }, { "epoch": 1.6616613037778571, "grad_norm": 19.43780275667029, "learning_rate": 4.9389889449405075e-06, "loss": 2.9703, "step": 1743 }, { "epoch": 1.6626147062328687, "grad_norm": 17.651706052022828, "learning_rate": 4.933442799361432e-06, "loss": 3.1833, "step": 1744 }, { "epoch": 1.6635681086878797, "grad_norm": 20.25358265127559, "learning_rate": 4.9278967356869475e-06, "loss": 3.3023, "step": 1745 }, { "epoch": 1.6645215111428913, "grad_norm": 20.99110542904033, "learning_rate": 4.922350760741984e-06, "loss": 3.1659, "step": 1746 }, { "epoch": 1.6654749135979026, "grad_norm": 18.530004905233902, "learning_rate": 4.916804881351357e-06, "loss": 2.9835, "step": 1747 }, { "epoch": 1.6664283160529139, "grad_norm": 22.134283578312793, "learning_rate": 4.911259104339771e-06, "loss": 3.0453, "step": 1748 }, { "epoch": 1.6673817185079252, "grad_norm": 19.739762087180917, "learning_rate": 4.9057134365317964e-06, "loss": 3.1642, "step": 1749 }, { "epoch": 1.6683351209629365, "grad_norm": 24.33541122427858, "learning_rate": 4.900167884751878e-06, "loss": 3.1268, "step": 1750 }, { "epoch": 1.6692885234179478, "grad_norm": 16.17892820994856, "learning_rate": 4.894622455824311e-06, "loss": 3.2009, "step": 1751 }, { "epoch": 1.670241925872959, "grad_norm": 16.955026370077956, "learning_rate": 4.889077156573242e-06, "loss": 3.1636, "step": 1752 }, { "epoch": 1.6711953283279706, "grad_norm": 18.391828582977052, "learning_rate": 4.88353199382266e-06, "loss": 3.2181, "step": 1753 }, { "epoch": 1.6721487307829817, "grad_norm": 25.209021145996946, "learning_rate": 4.877986974396382e-06, "loss": 3.3152, "step": 1754 }, { "epoch": 1.6731021332379932, "grad_norm": 17.704732360973978, "learning_rate": 4.8724421051180526e-06, "loss": 3.0749, "step": 1755 }, { "epoch": 1.6740555356930042, "grad_norm": 18.508118202227017, "learning_rate": 4.866897392811127e-06, "loss": 3.265, "step": 1756 }, { "epoch": 1.6750089381480158, "grad_norm": 20.262100821197595, "learning_rate": 4.861352844298872e-06, "loss": 3.3735, "step": 1757 }, { "epoch": 1.675962340603027, "grad_norm": 18.05758618604456, "learning_rate": 4.855808466404349e-06, "loss": 3.1338, "step": 1758 }, { "epoch": 1.6769157430580384, "grad_norm": 16.035775889472006, "learning_rate": 4.8502642659504136e-06, "loss": 3.2987, "step": 1759 }, { "epoch": 1.6778691455130497, "grad_norm": 19.482011573819623, "learning_rate": 4.8447202497596975e-06, "loss": 3.163, "step": 1760 }, { "epoch": 1.678822547968061, "grad_norm": 19.748990063003543, "learning_rate": 4.839176424654614e-06, "loss": 3.1207, "step": 1761 }, { "epoch": 1.6797759504230725, "grad_norm": 18.21906719171915, "learning_rate": 4.833632797457331e-06, "loss": 2.7976, "step": 1762 }, { "epoch": 1.6807293528780836, "grad_norm": 17.639114423677164, "learning_rate": 4.8280893749897785e-06, "loss": 3.1562, "step": 1763 }, { "epoch": 1.681682755333095, "grad_norm": 19.961080106293767, "learning_rate": 4.822546164073635e-06, "loss": 3.2006, "step": 1764 }, { "epoch": 1.6826361577881062, "grad_norm": 20.37175243659279, "learning_rate": 4.8170031715303176e-06, "loss": 2.9988, "step": 1765 }, { "epoch": 1.6835895602431177, "grad_norm": 17.416012809169725, "learning_rate": 4.811460404180974e-06, "loss": 3.1364, "step": 1766 }, { "epoch": 1.684542962698129, "grad_norm": 18.411195144060272, "learning_rate": 4.805917868846479e-06, "loss": 3.1131, "step": 1767 }, { "epoch": 1.6854963651531403, "grad_norm": 19.023583409296837, "learning_rate": 4.800375572347414e-06, "loss": 3.2559, "step": 1768 }, { "epoch": 1.6864497676081516, "grad_norm": 23.67383750690532, "learning_rate": 4.794833521504071e-06, "loss": 3.1087, "step": 1769 }, { "epoch": 1.6874031700631629, "grad_norm": 18.16663443495935, "learning_rate": 4.789291723136442e-06, "loss": 3.2594, "step": 1770 }, { "epoch": 1.6883565725181744, "grad_norm": 22.17574891470023, "learning_rate": 4.783750184064204e-06, "loss": 3.0869, "step": 1771 }, { "epoch": 1.6893099749731855, "grad_norm": 16.145975810125652, "learning_rate": 4.778208911106718e-06, "loss": 3.1551, "step": 1772 }, { "epoch": 1.690263377428197, "grad_norm": 21.99783140290184, "learning_rate": 4.7726679110830145e-06, "loss": 3.0989, "step": 1773 }, { "epoch": 1.691216779883208, "grad_norm": 18.342212489800794, "learning_rate": 4.767127190811794e-06, "loss": 3.0256, "step": 1774 }, { "epoch": 1.6921701823382196, "grad_norm": 20.259501814470354, "learning_rate": 4.761586757111404e-06, "loss": 3.108, "step": 1775 }, { "epoch": 1.6931235847932309, "grad_norm": 16.80351851733557, "learning_rate": 4.756046616799845e-06, "loss": 3.2441, "step": 1776 }, { "epoch": 1.6940769872482422, "grad_norm": 19.00325529152075, "learning_rate": 4.750506776694757e-06, "loss": 3.333, "step": 1777 }, { "epoch": 1.6950303897032535, "grad_norm": 18.517799690236796, "learning_rate": 4.744967243613408e-06, "loss": 3.2352, "step": 1778 }, { "epoch": 1.6959837921582648, "grad_norm": 20.508165233356824, "learning_rate": 4.73942802437269e-06, "loss": 3.2156, "step": 1779 }, { "epoch": 1.696937194613276, "grad_norm": 22.625775892206118, "learning_rate": 4.7338891257891085e-06, "loss": 3.1545, "step": 1780 }, { "epoch": 1.6978905970682874, "grad_norm": 19.40266935751844, "learning_rate": 4.728350554678771e-06, "loss": 3.4368, "step": 1781 }, { "epoch": 1.698843999523299, "grad_norm": 18.38469886304819, "learning_rate": 4.722812317857387e-06, "loss": 3.1848, "step": 1782 }, { "epoch": 1.69979740197831, "grad_norm": 18.386105283768384, "learning_rate": 4.717274422140253e-06, "loss": 3.2072, "step": 1783 }, { "epoch": 1.7007508044333215, "grad_norm": 22.777427485703882, "learning_rate": 4.7117368743422435e-06, "loss": 3.4044, "step": 1784 }, { "epoch": 1.7017042068883326, "grad_norm": 22.022051973816783, "learning_rate": 4.706199681277809e-06, "loss": 3.0847, "step": 1785 }, { "epoch": 1.702657609343344, "grad_norm": 23.26567490021552, "learning_rate": 4.700662849760961e-06, "loss": 2.9563, "step": 1786 }, { "epoch": 1.7036110117983554, "grad_norm": 17.757426840221324, "learning_rate": 4.695126386605263e-06, "loss": 3.303, "step": 1787 }, { "epoch": 1.7045644142533667, "grad_norm": 21.39786129850706, "learning_rate": 4.689590298623831e-06, "loss": 3.2781, "step": 1788 }, { "epoch": 1.705517816708378, "grad_norm": 22.78453791907821, "learning_rate": 4.684054592629315e-06, "loss": 3.2174, "step": 1789 }, { "epoch": 1.7064712191633893, "grad_norm": 20.03517493529535, "learning_rate": 4.678519275433899e-06, "loss": 3.0708, "step": 1790 }, { "epoch": 1.7074246216184008, "grad_norm": 22.357643407443163, "learning_rate": 4.672984353849285e-06, "loss": 3.0169, "step": 1791 }, { "epoch": 1.7083780240734119, "grad_norm": 22.447276170109735, "learning_rate": 4.667449834686689e-06, "loss": 3.0786, "step": 1792 }, { "epoch": 1.7093314265284234, "grad_norm": 18.856138061796127, "learning_rate": 4.661915724756834e-06, "loss": 3.2533, "step": 1793 }, { "epoch": 1.7102848289834345, "grad_norm": 19.57750587052043, "learning_rate": 4.6563820308699344e-06, "loss": 3.2342, "step": 1794 }, { "epoch": 1.711238231438446, "grad_norm": 17.27085562651813, "learning_rate": 4.650848759835698e-06, "loss": 3.1426, "step": 1795 }, { "epoch": 1.7121916338934573, "grad_norm": 17.038703077735793, "learning_rate": 4.645315918463308e-06, "loss": 3.1217, "step": 1796 }, { "epoch": 1.7131450363484686, "grad_norm": 20.82363783174902, "learning_rate": 4.639783513561423e-06, "loss": 3.2466, "step": 1797 }, { "epoch": 1.71409843880348, "grad_norm": 24.351084009561763, "learning_rate": 4.634251551938162e-06, "loss": 3.2463, "step": 1798 }, { "epoch": 1.7150518412584912, "grad_norm": 21.249430070337656, "learning_rate": 4.628720040401099e-06, "loss": 3.0804, "step": 1799 }, { "epoch": 1.7160052437135027, "grad_norm": 18.122530641477976, "learning_rate": 4.623188985757252e-06, "loss": 3.1794, "step": 1800 }, { "epoch": 1.7169586461685138, "grad_norm": 21.357699824648247, "learning_rate": 4.617658394813081e-06, "loss": 3.1543, "step": 1801 }, { "epoch": 1.7179120486235253, "grad_norm": 18.577493005941523, "learning_rate": 4.612128274374471e-06, "loss": 3.1789, "step": 1802 }, { "epoch": 1.7188654510785364, "grad_norm": 18.6845561883702, "learning_rate": 4.606598631246733e-06, "loss": 3.1494, "step": 1803 }, { "epoch": 1.719818853533548, "grad_norm": 19.152670166726086, "learning_rate": 4.601069472234584e-06, "loss": 3.1915, "step": 1804 }, { "epoch": 1.7207722559885592, "grad_norm": 18.521933605299047, "learning_rate": 4.595540804142154e-06, "loss": 3.195, "step": 1805 }, { "epoch": 1.7217256584435705, "grad_norm": 21.184226172775038, "learning_rate": 4.59001263377296e-06, "loss": 3.339, "step": 1806 }, { "epoch": 1.7226790608985818, "grad_norm": 19.59648205163862, "learning_rate": 4.584484967929909e-06, "loss": 2.9723, "step": 1807 }, { "epoch": 1.723632463353593, "grad_norm": 15.798058783294996, "learning_rate": 4.578957813415293e-06, "loss": 3.0286, "step": 1808 }, { "epoch": 1.7245858658086046, "grad_norm": 18.03341191482605, "learning_rate": 4.573431177030767e-06, "loss": 3.0988, "step": 1809 }, { "epoch": 1.7255392682636157, "grad_norm": 17.36852857914895, "learning_rate": 4.567905065577354e-06, "loss": 3.3775, "step": 1810 }, { "epoch": 1.7264926707186272, "grad_norm": 20.89842432911033, "learning_rate": 4.562379485855425e-06, "loss": 3.2131, "step": 1811 }, { "epoch": 1.7274460731736383, "grad_norm": 24.964729605639306, "learning_rate": 4.556854444664706e-06, "loss": 3.2577, "step": 1812 }, { "epoch": 1.7283994756286498, "grad_norm": 15.276157822615724, "learning_rate": 4.55132994880425e-06, "loss": 3.1415, "step": 1813 }, { "epoch": 1.7293528780836611, "grad_norm": 17.00298320300189, "learning_rate": 4.545806005072445e-06, "loss": 3.0933, "step": 1814 }, { "epoch": 1.7303062805386724, "grad_norm": 19.390022419662728, "learning_rate": 4.540282620266997e-06, "loss": 3.3072, "step": 1815 }, { "epoch": 1.7312596829936837, "grad_norm": 17.8784282813571, "learning_rate": 4.534759801184928e-06, "loss": 3.1195, "step": 1816 }, { "epoch": 1.732213085448695, "grad_norm": 23.797379807589714, "learning_rate": 4.5292375546225585e-06, "loss": 3.1467, "step": 1817 }, { "epoch": 1.7331664879037063, "grad_norm": 19.363761943882704, "learning_rate": 4.523715887375509e-06, "loss": 3.0069, "step": 1818 }, { "epoch": 1.7341198903587176, "grad_norm": 14.923274045598733, "learning_rate": 4.518194806238685e-06, "loss": 3.1241, "step": 1819 }, { "epoch": 1.7350732928137291, "grad_norm": 20.78009118487177, "learning_rate": 4.512674318006268e-06, "loss": 2.9795, "step": 1820 }, { "epoch": 1.7360266952687402, "grad_norm": 19.413061197322293, "learning_rate": 4.507154429471717e-06, "loss": 3.0903, "step": 1821 }, { "epoch": 1.7369800977237517, "grad_norm": 18.133106381484577, "learning_rate": 4.501635147427746e-06, "loss": 3.1068, "step": 1822 }, { "epoch": 1.7379335001787628, "grad_norm": 17.94474917320771, "learning_rate": 4.496116478666327e-06, "loss": 3.0277, "step": 1823 }, { "epoch": 1.7388869026337743, "grad_norm": 18.498481656478006, "learning_rate": 4.490598429978676e-06, "loss": 2.867, "step": 1824 }, { "epoch": 1.7398403050887856, "grad_norm": 19.508979629228655, "learning_rate": 4.4850810081552435e-06, "loss": 2.9751, "step": 1825 }, { "epoch": 1.740793707543797, "grad_norm": 18.98140510016662, "learning_rate": 4.479564219985714e-06, "loss": 3.2296, "step": 1826 }, { "epoch": 1.7417471099988082, "grad_norm": 21.587323110678845, "learning_rate": 4.474048072258985e-06, "loss": 3.1686, "step": 1827 }, { "epoch": 1.7427005124538195, "grad_norm": 17.876331031585103, "learning_rate": 4.468532571763174e-06, "loss": 3.2299, "step": 1828 }, { "epoch": 1.743653914908831, "grad_norm": 23.030457376950416, "learning_rate": 4.463017725285595e-06, "loss": 2.9509, "step": 1829 }, { "epoch": 1.7446073173638421, "grad_norm": 18.058022305456312, "learning_rate": 4.4575035396127635e-06, "loss": 3.3565, "step": 1830 }, { "epoch": 1.7455607198188536, "grad_norm": 18.323440781392335, "learning_rate": 4.451990021530374e-06, "loss": 3.047, "step": 1831 }, { "epoch": 1.7465141222738647, "grad_norm": 17.639794834349654, "learning_rate": 4.446477177823308e-06, "loss": 2.9311, "step": 1832 }, { "epoch": 1.7474675247288762, "grad_norm": 18.142541149855695, "learning_rate": 4.44096501527561e-06, "loss": 3.3227, "step": 1833 }, { "epoch": 1.7484209271838875, "grad_norm": 19.875643327381145, "learning_rate": 4.4354535406704915e-06, "loss": 3.1936, "step": 1834 }, { "epoch": 1.7493743296388988, "grad_norm": 22.04163684886038, "learning_rate": 4.429942760790314e-06, "loss": 2.9785, "step": 1835 }, { "epoch": 1.7503277320939101, "grad_norm": 16.72477949838465, "learning_rate": 4.424432682416585e-06, "loss": 3.0163, "step": 1836 }, { "epoch": 1.7512811345489214, "grad_norm": 17.789384410310827, "learning_rate": 4.418923312329952e-06, "loss": 3.1371, "step": 1837 }, { "epoch": 1.752234537003933, "grad_norm": 18.48932170010309, "learning_rate": 4.4134146573101835e-06, "loss": 3.026, "step": 1838 }, { "epoch": 1.753187939458944, "grad_norm": 21.26586536609748, "learning_rate": 4.407906724136176e-06, "loss": 2.8544, "step": 1839 }, { "epoch": 1.7541413419139555, "grad_norm": 18.893095980461283, "learning_rate": 4.402399519585932e-06, "loss": 3.0819, "step": 1840 }, { "epoch": 1.7550947443689666, "grad_norm": 19.170851990266947, "learning_rate": 4.396893050436561e-06, "loss": 2.9429, "step": 1841 }, { "epoch": 1.7560481468239781, "grad_norm": 18.26698856677261, "learning_rate": 4.3913873234642645e-06, "loss": 3.3587, "step": 1842 }, { "epoch": 1.7570015492789894, "grad_norm": 25.169722159701237, "learning_rate": 4.385882345444335e-06, "loss": 3.2478, "step": 1843 }, { "epoch": 1.7579549517340007, "grad_norm": 20.883617439373815, "learning_rate": 4.380378123151139e-06, "loss": 3.1235, "step": 1844 }, { "epoch": 1.758908354189012, "grad_norm": 23.41877366354891, "learning_rate": 4.374874663358113e-06, "loss": 2.9952, "step": 1845 }, { "epoch": 1.7598617566440233, "grad_norm": 18.176340376255556, "learning_rate": 4.3693719728377575e-06, "loss": 3.2401, "step": 1846 }, { "epoch": 1.7608151590990346, "grad_norm": 20.468719783874718, "learning_rate": 4.363870058361628e-06, "loss": 3.1602, "step": 1847 }, { "epoch": 1.761768561554046, "grad_norm": 17.355786909909796, "learning_rate": 4.358368926700321e-06, "loss": 3.1674, "step": 1848 }, { "epoch": 1.7627219640090575, "grad_norm": 15.936210086448742, "learning_rate": 4.352868584623472e-06, "loss": 3.1445, "step": 1849 }, { "epoch": 1.7636753664640685, "grad_norm": 27.141416618128314, "learning_rate": 4.347369038899744e-06, "loss": 2.9986, "step": 1850 }, { "epoch": 1.76462876891908, "grad_norm": 17.653565175828767, "learning_rate": 4.341870296296817e-06, "loss": 3.1765, "step": 1851 }, { "epoch": 1.7655821713740911, "grad_norm": 14.955597350111232, "learning_rate": 4.336372363581391e-06, "loss": 3.4286, "step": 1852 }, { "epoch": 1.7665355738291026, "grad_norm": 22.730129296407284, "learning_rate": 4.330875247519161e-06, "loss": 2.8324, "step": 1853 }, { "epoch": 1.767488976284114, "grad_norm": 18.47562945911664, "learning_rate": 4.325378954874821e-06, "loss": 3.4563, "step": 1854 }, { "epoch": 1.7684423787391252, "grad_norm": 19.38460882679615, "learning_rate": 4.319883492412052e-06, "loss": 3.1336, "step": 1855 }, { "epoch": 1.7693957811941365, "grad_norm": 15.955788397821118, "learning_rate": 4.314388866893512e-06, "loss": 3.213, "step": 1856 }, { "epoch": 1.7703491836491478, "grad_norm": 17.65126941867721, "learning_rate": 4.308895085080828e-06, "loss": 3.2509, "step": 1857 }, { "epoch": 1.7713025861041594, "grad_norm": 19.262191027232888, "learning_rate": 4.303402153734591e-06, "loss": 3.1173, "step": 1858 }, { "epoch": 1.7722559885591704, "grad_norm": 19.143706958110023, "learning_rate": 4.297910079614344e-06, "loss": 3.3, "step": 1859 }, { "epoch": 1.773209391014182, "grad_norm": 20.95434641812274, "learning_rate": 4.292418869478577e-06, "loss": 3.0231, "step": 1860 }, { "epoch": 1.774162793469193, "grad_norm": 16.624362566690916, "learning_rate": 4.286928530084715e-06, "loss": 3.109, "step": 1861 }, { "epoch": 1.7751161959242046, "grad_norm": 20.02719013896278, "learning_rate": 4.281439068189113e-06, "loss": 2.9356, "step": 1862 }, { "epoch": 1.7760695983792159, "grad_norm": 20.96353365936155, "learning_rate": 4.2759504905470435e-06, "loss": 3.3088, "step": 1863 }, { "epoch": 1.7770230008342272, "grad_norm": 18.771886447890772, "learning_rate": 4.270462803912692e-06, "loss": 3.1657, "step": 1864 }, { "epoch": 1.7779764032892384, "grad_norm": 18.020665564934813, "learning_rate": 4.2649760150391505e-06, "loss": 3.2145, "step": 1865 }, { "epoch": 1.7789298057442497, "grad_norm": 20.253044335367235, "learning_rate": 4.2594901306784006e-06, "loss": 3.1003, "step": 1866 }, { "epoch": 1.7798832081992613, "grad_norm": 20.57993987258862, "learning_rate": 4.254005157581316e-06, "loss": 3.1719, "step": 1867 }, { "epoch": 1.7808366106542723, "grad_norm": 19.047553968399647, "learning_rate": 4.248521102497649e-06, "loss": 3.1003, "step": 1868 }, { "epoch": 1.7817900131092839, "grad_norm": 20.02209895847076, "learning_rate": 4.243037972176016e-06, "loss": 3.0657, "step": 1869 }, { "epoch": 1.782743415564295, "grad_norm": 19.75598061064686, "learning_rate": 4.237555773363901e-06, "loss": 3.0668, "step": 1870 }, { "epoch": 1.7836968180193065, "grad_norm": 18.11249389392085, "learning_rate": 4.23207451280764e-06, "loss": 3.2056, "step": 1871 }, { "epoch": 1.7846502204743178, "grad_norm": 20.306150395140893, "learning_rate": 4.226594197252417e-06, "loss": 3.0755, "step": 1872 }, { "epoch": 1.785603622929329, "grad_norm": 18.832205033630636, "learning_rate": 4.221114833442247e-06, "loss": 3.1458, "step": 1873 }, { "epoch": 1.7865570253843404, "grad_norm": 21.180723797767833, "learning_rate": 4.215636428119982e-06, "loss": 3.1628, "step": 1874 }, { "epoch": 1.7875104278393517, "grad_norm": 19.977096901116212, "learning_rate": 4.210158988027286e-06, "loss": 2.9256, "step": 1875 }, { "epoch": 1.788463830294363, "grad_norm": 20.487015026124126, "learning_rate": 4.204682519904641e-06, "loss": 3.1422, "step": 1876 }, { "epoch": 1.7894172327493743, "grad_norm": 18.807062232723503, "learning_rate": 4.19920703049133e-06, "loss": 2.9667, "step": 1877 }, { "epoch": 1.7903706352043858, "grad_norm": 17.440667366416708, "learning_rate": 4.193732526525435e-06, "loss": 3.1027, "step": 1878 }, { "epoch": 1.7913240376593969, "grad_norm": 20.847736597505563, "learning_rate": 4.18825901474382e-06, "loss": 3.1019, "step": 1879 }, { "epoch": 1.7922774401144084, "grad_norm": 23.123997773590727, "learning_rate": 4.182786501882135e-06, "loss": 3.1585, "step": 1880 }, { "epoch": 1.7932308425694197, "grad_norm": 18.660521369707872, "learning_rate": 4.1773149946747945e-06, "loss": 3.125, "step": 1881 }, { "epoch": 1.794184245024431, "grad_norm": 20.166885431428497, "learning_rate": 4.171844499854977e-06, "loss": 3.2098, "step": 1882 }, { "epoch": 1.7951376474794423, "grad_norm": 21.09958329823313, "learning_rate": 4.166375024154616e-06, "loss": 3.1075, "step": 1883 }, { "epoch": 1.7960910499344536, "grad_norm": 17.944366563134956, "learning_rate": 4.160906574304392e-06, "loss": 3.1266, "step": 1884 }, { "epoch": 1.7970444523894649, "grad_norm": 19.009987579583363, "learning_rate": 4.155439157033723e-06, "loss": 3.0327, "step": 1885 }, { "epoch": 1.7979978548444762, "grad_norm": 18.330004339856934, "learning_rate": 4.149972779070752e-06, "loss": 3.3435, "step": 1886 }, { "epoch": 1.7989512572994877, "grad_norm": 22.28142110020564, "learning_rate": 4.144507447142351e-06, "loss": 3.0367, "step": 1887 }, { "epoch": 1.7999046597544988, "grad_norm": 22.58399696231677, "learning_rate": 4.139043167974096e-06, "loss": 3.1514, "step": 1888 }, { "epoch": 1.8008580622095103, "grad_norm": 18.893841867073228, "learning_rate": 4.133579948290271e-06, "loss": 2.8267, "step": 1889 }, { "epoch": 1.8018114646645214, "grad_norm": 18.34046887407212, "learning_rate": 4.12811779481386e-06, "loss": 3.1042, "step": 1890 }, { "epoch": 1.8027648671195329, "grad_norm": 20.275198167589707, "learning_rate": 4.122656714266529e-06, "loss": 3.143, "step": 1891 }, { "epoch": 1.8037182695745442, "grad_norm": 17.060740671381105, "learning_rate": 4.117196713368629e-06, "loss": 3.105, "step": 1892 }, { "epoch": 1.8046716720295555, "grad_norm": 20.067363339071928, "learning_rate": 4.111737798839177e-06, "loss": 3.1776, "step": 1893 }, { "epoch": 1.8056250744845668, "grad_norm": 18.770469452780407, "learning_rate": 4.106279977395859e-06, "loss": 3.4217, "step": 1894 }, { "epoch": 1.806578476939578, "grad_norm": 17.2263483089388, "learning_rate": 4.100823255755009e-06, "loss": 2.9513, "step": 1895 }, { "epoch": 1.8075318793945896, "grad_norm": 21.78943317363265, "learning_rate": 4.095367640631614e-06, "loss": 3.0016, "step": 1896 }, { "epoch": 1.8084852818496007, "grad_norm": 20.824046095676184, "learning_rate": 4.089913138739295e-06, "loss": 3.1294, "step": 1897 }, { "epoch": 1.8094386843046122, "grad_norm": 22.315710521920895, "learning_rate": 4.084459756790307e-06, "loss": 3.0742, "step": 1898 }, { "epoch": 1.8103920867596233, "grad_norm": 21.759771390171743, "learning_rate": 4.0790075014955205e-06, "loss": 3.2927, "step": 1899 }, { "epoch": 1.8113454892146348, "grad_norm": 20.271862880287653, "learning_rate": 4.073556379564429e-06, "loss": 2.8947, "step": 1900 }, { "epoch": 1.812298891669646, "grad_norm": 18.851940233113037, "learning_rate": 4.068106397705123e-06, "loss": 3.2605, "step": 1901 }, { "epoch": 1.8132522941246574, "grad_norm": 22.428929388826326, "learning_rate": 4.0626575626242905e-06, "loss": 3.1194, "step": 1902 }, { "epoch": 1.8142056965796687, "grad_norm": 20.849935786582485, "learning_rate": 4.057209881027215e-06, "loss": 3.3843, "step": 1903 }, { "epoch": 1.81515909903468, "grad_norm": 24.68654915547585, "learning_rate": 4.051763359617753e-06, "loss": 3.1225, "step": 1904 }, { "epoch": 1.8161125014896915, "grad_norm": 23.17819509722408, "learning_rate": 4.0463180050983385e-06, "loss": 3.0973, "step": 1905 }, { "epoch": 1.8170659039447026, "grad_norm": 22.05299124379541, "learning_rate": 4.0408738241699695e-06, "loss": 3.08, "step": 1906 }, { "epoch": 1.818019306399714, "grad_norm": 18.992768069579842, "learning_rate": 4.035430823532192e-06, "loss": 3.3304, "step": 1907 }, { "epoch": 1.8189727088547252, "grad_norm": 20.349560389796007, "learning_rate": 4.0299890098831096e-06, "loss": 3.3156, "step": 1908 }, { "epoch": 1.8199261113097367, "grad_norm": 21.054333086687972, "learning_rate": 4.02454838991936e-06, "loss": 3.1214, "step": 1909 }, { "epoch": 1.820879513764748, "grad_norm": 20.75335001353331, "learning_rate": 4.019108970336112e-06, "loss": 2.9033, "step": 1910 }, { "epoch": 1.8218329162197593, "grad_norm": 17.84414595284209, "learning_rate": 4.013670757827061e-06, "loss": 3.0176, "step": 1911 }, { "epoch": 1.8227863186747706, "grad_norm": 21.527324843642727, "learning_rate": 4.00823375908441e-06, "loss": 3.1073, "step": 1912 }, { "epoch": 1.8237397211297819, "grad_norm": 22.50010143862204, "learning_rate": 4.002797980798879e-06, "loss": 3.0773, "step": 1913 }, { "epoch": 1.8246931235847932, "grad_norm": 19.24858474928457, "learning_rate": 3.997363429659674e-06, "loss": 2.9616, "step": 1914 }, { "epoch": 1.8256465260398045, "grad_norm": 18.807683717036124, "learning_rate": 3.991930112354497e-06, "loss": 3.0139, "step": 1915 }, { "epoch": 1.826599928494816, "grad_norm": 17.336476139218856, "learning_rate": 3.986498035569533e-06, "loss": 3.3131, "step": 1916 }, { "epoch": 1.827553330949827, "grad_norm": 22.061233248039883, "learning_rate": 3.981067205989437e-06, "loss": 3.3559, "step": 1917 }, { "epoch": 1.8285067334048386, "grad_norm": 22.19132509972461, "learning_rate": 3.975637630297333e-06, "loss": 3.1586, "step": 1918 }, { "epoch": 1.8294601358598497, "grad_norm": 20.23212437264283, "learning_rate": 3.970209315174799e-06, "loss": 3.1734, "step": 1919 }, { "epoch": 1.8304135383148612, "grad_norm": 20.70576864328152, "learning_rate": 3.964782267301861e-06, "loss": 3.4, "step": 1920 }, { "epoch": 1.8313669407698725, "grad_norm": 22.47674558891714, "learning_rate": 3.95935649335699e-06, "loss": 3.218, "step": 1921 }, { "epoch": 1.8323203432248838, "grad_norm": 22.107065603053762, "learning_rate": 3.953932000017082e-06, "loss": 3.0958, "step": 1922 }, { "epoch": 1.833273745679895, "grad_norm": 21.0894110670487, "learning_rate": 3.948508793957465e-06, "loss": 3.0481, "step": 1923 }, { "epoch": 1.8342271481349064, "grad_norm": 21.62991893123561, "learning_rate": 3.9430868818518786e-06, "loss": 2.8197, "step": 1924 }, { "epoch": 1.835180550589918, "grad_norm": 19.986979729428533, "learning_rate": 3.937666270372472e-06, "loss": 3.1319, "step": 1925 }, { "epoch": 1.836133953044929, "grad_norm": 19.61441202117173, "learning_rate": 3.93224696618979e-06, "loss": 3.2313, "step": 1926 }, { "epoch": 1.8370873554999405, "grad_norm": 20.762335129740812, "learning_rate": 3.926828975972774e-06, "loss": 3.2244, "step": 1927 }, { "epoch": 1.8380407579549516, "grad_norm": 15.37074593199603, "learning_rate": 3.921412306388744e-06, "loss": 3.401, "step": 1928 }, { "epoch": 1.838994160409963, "grad_norm": 19.815385142688324, "learning_rate": 3.915996964103398e-06, "loss": 3.1879, "step": 1929 }, { "epoch": 1.8399475628649744, "grad_norm": 19.555273234410183, "learning_rate": 3.910582955780798e-06, "loss": 3.0646, "step": 1930 }, { "epoch": 1.8409009653199857, "grad_norm": 20.544318710552904, "learning_rate": 3.905170288083367e-06, "loss": 3.2974, "step": 1931 }, { "epoch": 1.841854367774997, "grad_norm": 16.72465124179551, "learning_rate": 3.899758967671879e-06, "loss": 3.0816, "step": 1932 }, { "epoch": 1.8428077702300083, "grad_norm": 21.529218989941835, "learning_rate": 3.894349001205442e-06, "loss": 3.0873, "step": 1933 }, { "epoch": 1.8437611726850198, "grad_norm": 21.82341871962611, "learning_rate": 3.888940395341509e-06, "loss": 3.267, "step": 1934 }, { "epoch": 1.844714575140031, "grad_norm": 16.91102303381329, "learning_rate": 3.88353315673585e-06, "loss": 3.2987, "step": 1935 }, { "epoch": 1.8456679775950424, "grad_norm": 21.148265900630616, "learning_rate": 3.8781272920425605e-06, "loss": 2.9044, "step": 1936 }, { "epoch": 1.8466213800500535, "grad_norm": 19.717920320555923, "learning_rate": 3.872722807914036e-06, "loss": 3.1998, "step": 1937 }, { "epoch": 1.847574782505065, "grad_norm": 25.394800299080224, "learning_rate": 3.8673197110009845e-06, "loss": 2.9906, "step": 1938 }, { "epoch": 1.8485281849600763, "grad_norm": 18.794992629778513, "learning_rate": 3.861918007952393e-06, "loss": 3.1085, "step": 1939 }, { "epoch": 1.8494815874150876, "grad_norm": 25.377245792966217, "learning_rate": 3.856517705415543e-06, "loss": 3.1596, "step": 1940 }, { "epoch": 1.850434989870099, "grad_norm": 19.476574831653124, "learning_rate": 3.851118810035992e-06, "loss": 3.1728, "step": 1941 }, { "epoch": 1.8513883923251102, "grad_norm": 19.47793721742032, "learning_rate": 3.845721328457561e-06, "loss": 3.157, "step": 1942 }, { "epoch": 1.8523417947801215, "grad_norm": 20.902283140156335, "learning_rate": 3.840325267322336e-06, "loss": 3.2067, "step": 1943 }, { "epoch": 1.8532951972351328, "grad_norm": 20.80690438531017, "learning_rate": 3.834930633270654e-06, "loss": 3.0171, "step": 1944 }, { "epoch": 1.8542485996901443, "grad_norm": 19.428701408412554, "learning_rate": 3.829537432941093e-06, "loss": 3.1012, "step": 1945 }, { "epoch": 1.8552020021451554, "grad_norm": 19.43332392728654, "learning_rate": 3.824145672970469e-06, "loss": 3.3009, "step": 1946 }, { "epoch": 1.856155404600167, "grad_norm": 20.20194240849449, "learning_rate": 3.8187553599938264e-06, "loss": 3.0651, "step": 1947 }, { "epoch": 1.857108807055178, "grad_norm": 20.72192522305947, "learning_rate": 3.813366500644426e-06, "loss": 3.0158, "step": 1948 }, { "epoch": 1.8580622095101895, "grad_norm": 16.723308988709558, "learning_rate": 3.8079791015537432e-06, "loss": 2.9655, "step": 1949 }, { "epoch": 1.8590156119652008, "grad_norm": 17.486542812812914, "learning_rate": 3.802593169351453e-06, "loss": 3.3409, "step": 1950 }, { "epoch": 1.8599690144202121, "grad_norm": 24.441217813330407, "learning_rate": 3.79720871066543e-06, "loss": 3.1164, "step": 1951 }, { "epoch": 1.8609224168752234, "grad_norm": 20.181172022823333, "learning_rate": 3.791825732121729e-06, "loss": 3.2058, "step": 1952 }, { "epoch": 1.8618758193302347, "grad_norm": 17.585246672196746, "learning_rate": 3.7864442403445877e-06, "loss": 3.3073, "step": 1953 }, { "epoch": 1.8628292217852462, "grad_norm": 22.908760138118716, "learning_rate": 3.781064241956415e-06, "loss": 3.0799, "step": 1954 }, { "epoch": 1.8637826242402573, "grad_norm": 21.7973602230946, "learning_rate": 3.775685743577777e-06, "loss": 3.0667, "step": 1955 }, { "epoch": 1.8647360266952688, "grad_norm": 21.839750486446427, "learning_rate": 3.770308751827402e-06, "loss": 3.2555, "step": 1956 }, { "epoch": 1.86568942915028, "grad_norm": 21.66504994892601, "learning_rate": 3.7649332733221576e-06, "loss": 3.1353, "step": 1957 }, { "epoch": 1.8666428316052914, "grad_norm": 18.574304586956906, "learning_rate": 3.7595593146770487e-06, "loss": 3.0892, "step": 1958 }, { "epoch": 1.8675962340603027, "grad_norm": 16.770288057471845, "learning_rate": 3.754186882505214e-06, "loss": 3.2144, "step": 1959 }, { "epoch": 1.868549636515314, "grad_norm": 20.042555279381276, "learning_rate": 3.748815983417914e-06, "loss": 2.9942, "step": 1960 }, { "epoch": 1.8695030389703253, "grad_norm": 17.181752404708757, "learning_rate": 3.743446624024517e-06, "loss": 3.1337, "step": 1961 }, { "epoch": 1.8704564414253366, "grad_norm": 18.15180682375404, "learning_rate": 3.7380788109325043e-06, "loss": 3.1616, "step": 1962 }, { "epoch": 1.8714098438803481, "grad_norm": 19.80424148358012, "learning_rate": 3.7327125507474493e-06, "loss": 3.4725, "step": 1963 }, { "epoch": 1.8723632463353592, "grad_norm": 20.39063089526524, "learning_rate": 3.727347850073012e-06, "loss": 3.0538, "step": 1964 }, { "epoch": 1.8733166487903707, "grad_norm": 18.45905244111494, "learning_rate": 3.721984715510941e-06, "loss": 3.2947, "step": 1965 }, { "epoch": 1.8742700512453818, "grad_norm": 18.498059473519646, "learning_rate": 3.7166231536610497e-06, "loss": 2.9355, "step": 1966 }, { "epoch": 1.8752234537003933, "grad_norm": 22.5071533411534, "learning_rate": 3.7112631711212226e-06, "loss": 3.3488, "step": 1967 }, { "epoch": 1.8761768561554046, "grad_norm": 17.767138445541182, "learning_rate": 3.705904774487396e-06, "loss": 3.2311, "step": 1968 }, { "epoch": 1.877130258610416, "grad_norm": 16.9858665804846, "learning_rate": 3.7005479703535585e-06, "loss": 3.1389, "step": 1969 }, { "epoch": 1.8780836610654272, "grad_norm": 22.40821045075689, "learning_rate": 3.695192765311737e-06, "loss": 3.2022, "step": 1970 }, { "epoch": 1.8790370635204385, "grad_norm": 25.126879029130528, "learning_rate": 3.6898391659519874e-06, "loss": 2.9952, "step": 1971 }, { "epoch": 1.8799904659754498, "grad_norm": 22.094498928768672, "learning_rate": 3.6844871788623946e-06, "loss": 3.2926, "step": 1972 }, { "epoch": 1.8809438684304611, "grad_norm": 18.392915996121555, "learning_rate": 3.6791368106290583e-06, "loss": 3.0909, "step": 1973 }, { "epoch": 1.8818972708854727, "grad_norm": 21.1313812800616, "learning_rate": 3.6737880678360845e-06, "loss": 2.9759, "step": 1974 }, { "epoch": 1.8828506733404837, "grad_norm": 17.592542126142213, "learning_rate": 3.668440957065581e-06, "loss": 3.1003, "step": 1975 }, { "epoch": 1.8838040757954952, "grad_norm": 18.853406904093948, "learning_rate": 3.6630954848976472e-06, "loss": 2.9941, "step": 1976 }, { "epoch": 1.8847574782505065, "grad_norm": 15.010845175376787, "learning_rate": 3.657751657910361e-06, "loss": 3.3823, "step": 1977 }, { "epoch": 1.8857108807055178, "grad_norm": 22.709547899850193, "learning_rate": 3.6524094826797836e-06, "loss": 3.3588, "step": 1978 }, { "epoch": 1.8866642831605291, "grad_norm": 22.552113657822392, "learning_rate": 3.6470689657799362e-06, "loss": 3.2117, "step": 1979 }, { "epoch": 1.8876176856155404, "grad_norm": 20.26779308231581, "learning_rate": 3.641730113782807e-06, "loss": 3.207, "step": 1980 }, { "epoch": 1.8885710880705517, "grad_norm": 20.725599808355728, "learning_rate": 3.636392933258327e-06, "loss": 3.3454, "step": 1981 }, { "epoch": 1.889524490525563, "grad_norm": 17.942703877446814, "learning_rate": 3.6310574307743798e-06, "loss": 3.2187, "step": 1982 }, { "epoch": 1.8904778929805746, "grad_norm": 17.99156089884667, "learning_rate": 3.625723612896773e-06, "loss": 3.1319, "step": 1983 }, { "epoch": 1.8914312954355856, "grad_norm": 25.180246127217295, "learning_rate": 3.6203914861892483e-06, "loss": 2.9011, "step": 1984 }, { "epoch": 1.8923846978905972, "grad_norm": 19.156025399144117, "learning_rate": 3.615061057213467e-06, "loss": 3.1286, "step": 1985 }, { "epoch": 1.8933381003456082, "grad_norm": 19.747765843895568, "learning_rate": 3.609732332528996e-06, "loss": 3.2673, "step": 1986 }, { "epoch": 1.8942915028006198, "grad_norm": 16.74020510762191, "learning_rate": 3.6044053186933115e-06, "loss": 3.1241, "step": 1987 }, { "epoch": 1.895244905255631, "grad_norm": 20.17032278938578, "learning_rate": 3.5990800222617774e-06, "loss": 3.2042, "step": 1988 }, { "epoch": 1.8961983077106424, "grad_norm": 18.301220335932655, "learning_rate": 3.593756449787651e-06, "loss": 3.2462, "step": 1989 }, { "epoch": 1.8971517101656536, "grad_norm": 21.306580426537007, "learning_rate": 3.5884346078220612e-06, "loss": 3.2581, "step": 1990 }, { "epoch": 1.898105112620665, "grad_norm": 23.09363970068888, "learning_rate": 3.5831145029140124e-06, "loss": 3.1443, "step": 1991 }, { "epoch": 1.8990585150756765, "grad_norm": 17.77182302517535, "learning_rate": 3.577796141610369e-06, "loss": 3.2655, "step": 1992 }, { "epoch": 1.9000119175306875, "grad_norm": 17.657963818149454, "learning_rate": 3.5724795304558517e-06, "loss": 3.2115, "step": 1993 }, { "epoch": 1.900965319985699, "grad_norm": 19.131193112014422, "learning_rate": 3.5671646759930256e-06, "loss": 3.2554, "step": 1994 }, { "epoch": 1.9019187224407101, "grad_norm": 16.465374347750913, "learning_rate": 3.5618515847622965e-06, "loss": 3.005, "step": 1995 }, { "epoch": 1.9028721248957217, "grad_norm": 22.862085100693303, "learning_rate": 3.5565402633018963e-06, "loss": 3.1641, "step": 1996 }, { "epoch": 1.903825527350733, "grad_norm": 20.763669824050467, "learning_rate": 3.551230718147881e-06, "loss": 3.2154, "step": 1997 }, { "epoch": 1.9047789298057443, "grad_norm": 24.228056531282416, "learning_rate": 3.5459229558341235e-06, "loss": 3.1172, "step": 1998 }, { "epoch": 1.9057323322607556, "grad_norm": 20.54112392294109, "learning_rate": 3.5406169828922976e-06, "loss": 3.0503, "step": 1999 }, { "epoch": 1.9066857347157669, "grad_norm": 19.893386112602574, "learning_rate": 3.535312805851881e-06, "loss": 2.9066, "step": 2000 }, { "epoch": 1.9076391371707784, "grad_norm": 18.0349056821661, "learning_rate": 3.530010431240137e-06, "loss": 3.1904, "step": 2001 }, { "epoch": 1.9085925396257895, "grad_norm": 17.889520530102452, "learning_rate": 3.5247098655821107e-06, "loss": 3.1381, "step": 2002 }, { "epoch": 1.909545942080801, "grad_norm": 22.09542507105436, "learning_rate": 3.519411115400624e-06, "loss": 3.2358, "step": 2003 }, { "epoch": 1.910499344535812, "grad_norm": 25.44297226224331, "learning_rate": 3.5141141872162613e-06, "loss": 3.2608, "step": 2004 }, { "epoch": 1.9114527469908236, "grad_norm": 31.126023081967404, "learning_rate": 3.5088190875473693e-06, "loss": 3.2319, "step": 2005 }, { "epoch": 1.9124061494458349, "grad_norm": 25.33991222102823, "learning_rate": 3.5035258229100398e-06, "loss": 3.071, "step": 2006 }, { "epoch": 1.9133595519008462, "grad_norm": 18.438342065850993, "learning_rate": 3.4982343998181125e-06, "loss": 3.2064, "step": 2007 }, { "epoch": 1.9143129543558575, "grad_norm": 19.22580726718397, "learning_rate": 3.4929448247831523e-06, "loss": 3.2447, "step": 2008 }, { "epoch": 1.9152663568108688, "grad_norm": 19.61730597722587, "learning_rate": 3.487657104314457e-06, "loss": 3.2708, "step": 2009 }, { "epoch": 1.91621975926588, "grad_norm": 17.2874855288762, "learning_rate": 3.482371244919038e-06, "loss": 3.1534, "step": 2010 }, { "epoch": 1.9171731617208914, "grad_norm": 21.80608315394512, "learning_rate": 3.477087253101622e-06, "loss": 3.1686, "step": 2011 }, { "epoch": 1.9181265641759029, "grad_norm": 16.740727035204582, "learning_rate": 3.4718051353646304e-06, "loss": 3.1791, "step": 2012 }, { "epoch": 1.919079966630914, "grad_norm": 17.331782765980915, "learning_rate": 3.466524898208184e-06, "loss": 3.1062, "step": 2013 }, { "epoch": 1.9200333690859255, "grad_norm": 18.61490938116356, "learning_rate": 3.461246548130087e-06, "loss": 3.1197, "step": 2014 }, { "epoch": 1.9209867715409366, "grad_norm": 19.418790643441998, "learning_rate": 3.455970091625819e-06, "loss": 3.0892, "step": 2015 }, { "epoch": 1.921940173995948, "grad_norm": 18.710624682546545, "learning_rate": 3.4506955351885346e-06, "loss": 3.2525, "step": 2016 }, { "epoch": 1.9228935764509594, "grad_norm": 20.716516401283574, "learning_rate": 3.4454228853090455e-06, "loss": 3.1013, "step": 2017 }, { "epoch": 1.9238469789059707, "grad_norm": 21.06561514500473, "learning_rate": 3.4401521484758218e-06, "loss": 3.0687, "step": 2018 }, { "epoch": 1.924800381360982, "grad_norm": 16.416883738797853, "learning_rate": 3.4348833311749743e-06, "loss": 3.146, "step": 2019 }, { "epoch": 1.9257537838159933, "grad_norm": 17.87484404553486, "learning_rate": 3.4296164398902576e-06, "loss": 3.0637, "step": 2020 }, { "epoch": 1.9267071862710048, "grad_norm": 20.196806604824967, "learning_rate": 3.4243514811030486e-06, "loss": 3.4508, "step": 2021 }, { "epoch": 1.9276605887260159, "grad_norm": 22.246962169600437, "learning_rate": 3.4190884612923525e-06, "loss": 3.0188, "step": 2022 }, { "epoch": 1.9286139911810274, "grad_norm": 17.903149942692284, "learning_rate": 3.413827386934785e-06, "loss": 3.0474, "step": 2023 }, { "epoch": 1.9295673936360385, "grad_norm": 23.12418493602327, "learning_rate": 3.408568264504571e-06, "loss": 3.09, "step": 2024 }, { "epoch": 1.93052079609105, "grad_norm": 21.891630200771825, "learning_rate": 3.4033111004735292e-06, "loss": 3.2354, "step": 2025 }, { "epoch": 1.9314741985460613, "grad_norm": 18.135083957210934, "learning_rate": 3.398055901311074e-06, "loss": 3.073, "step": 2026 }, { "epoch": 1.9324276010010726, "grad_norm": 15.91460914287954, "learning_rate": 3.3928026734841935e-06, "loss": 3.176, "step": 2027 }, { "epoch": 1.9333810034560839, "grad_norm": 23.792773582497787, "learning_rate": 3.387551423457456e-06, "loss": 3.3289, "step": 2028 }, { "epoch": 1.9343344059110952, "grad_norm": 26.40332958535507, "learning_rate": 3.382302157692997e-06, "loss": 3.1783, "step": 2029 }, { "epoch": 1.9352878083661067, "grad_norm": 20.8802586190092, "learning_rate": 3.3770548826505046e-06, "loss": 3.1308, "step": 2030 }, { "epoch": 1.9362412108211178, "grad_norm": 17.53036867486249, "learning_rate": 3.371809604787224e-06, "loss": 2.9945, "step": 2031 }, { "epoch": 1.9371946132761293, "grad_norm": 16.11215702071334, "learning_rate": 3.366566330557935e-06, "loss": 3.0767, "step": 2032 }, { "epoch": 1.9381480157311404, "grad_norm": 16.925219653204234, "learning_rate": 3.3613250664149597e-06, "loss": 3.1301, "step": 2033 }, { "epoch": 1.939101418186152, "grad_norm": 20.893259089812407, "learning_rate": 3.35608581880814e-06, "loss": 2.9412, "step": 2034 }, { "epoch": 1.9400548206411632, "grad_norm": 16.761199864375175, "learning_rate": 3.350848594184839e-06, "loss": 3.1309, "step": 2035 }, { "epoch": 1.9410082230961745, "grad_norm": 18.756221297212676, "learning_rate": 3.345613398989932e-06, "loss": 3.2078, "step": 2036 }, { "epoch": 1.9419616255511858, "grad_norm": 19.882610863895003, "learning_rate": 3.3403802396657927e-06, "loss": 3.0794, "step": 2037 }, { "epoch": 1.942915028006197, "grad_norm": 15.615035621077993, "learning_rate": 3.3351491226522934e-06, "loss": 3.3142, "step": 2038 }, { "epoch": 1.9438684304612084, "grad_norm": 21.419538850981247, "learning_rate": 3.3299200543867944e-06, "loss": 3.3451, "step": 2039 }, { "epoch": 1.9448218329162197, "grad_norm": 22.353111038595543, "learning_rate": 3.324693041304128e-06, "loss": 3.1091, "step": 2040 }, { "epoch": 1.9457752353712312, "grad_norm": 18.098421352152002, "learning_rate": 3.319468089836603e-06, "loss": 3.299, "step": 2041 }, { "epoch": 1.9467286378262423, "grad_norm": 22.622919616876082, "learning_rate": 3.31424520641399e-06, "loss": 3.2443, "step": 2042 }, { "epoch": 1.9476820402812538, "grad_norm": 23.686073995347936, "learning_rate": 3.309024397463514e-06, "loss": 3.156, "step": 2043 }, { "epoch": 1.948635442736265, "grad_norm": 26.925924662576534, "learning_rate": 3.3038056694098485e-06, "loss": 3.1453, "step": 2044 }, { "epoch": 1.9495888451912764, "grad_norm": 20.621813176907605, "learning_rate": 3.2985890286751065e-06, "loss": 3.2336, "step": 2045 }, { "epoch": 1.9505422476462877, "grad_norm": 19.59833170907688, "learning_rate": 3.2933744816788276e-06, "loss": 3.1524, "step": 2046 }, { "epoch": 1.951495650101299, "grad_norm": 21.490885800728822, "learning_rate": 3.288162034837982e-06, "loss": 3.3014, "step": 2047 }, { "epoch": 1.9524490525563103, "grad_norm": 20.926009724980872, "learning_rate": 3.2829516945669493e-06, "loss": 3.3597, "step": 2048 }, { "epoch": 1.9534024550113216, "grad_norm": 20.15605094310258, "learning_rate": 3.277743467277522e-06, "loss": 3.2587, "step": 2049 }, { "epoch": 1.9543558574663331, "grad_norm": 21.733190459034283, "learning_rate": 3.2725373593788873e-06, "loss": 2.9637, "step": 2050 }, { "epoch": 1.9553092599213442, "grad_norm": 18.391925833930976, "learning_rate": 3.26733337727763e-06, "loss": 3.3175, "step": 2051 }, { "epoch": 1.9562626623763557, "grad_norm": 19.39814031460826, "learning_rate": 3.262131527377715e-06, "loss": 3.1787, "step": 2052 }, { "epoch": 1.9572160648313668, "grad_norm": 19.0538274152475, "learning_rate": 3.2569318160804807e-06, "loss": 3.052, "step": 2053 }, { "epoch": 1.9581694672863783, "grad_norm": 20.929340845422978, "learning_rate": 3.2517342497846407e-06, "loss": 2.8217, "step": 2054 }, { "epoch": 1.9591228697413896, "grad_norm": 19.313919869072002, "learning_rate": 3.2465388348862648e-06, "loss": 3.155, "step": 2055 }, { "epoch": 1.960076272196401, "grad_norm": 20.679536328637614, "learning_rate": 3.241345577778775e-06, "loss": 3.0539, "step": 2056 }, { "epoch": 1.9610296746514122, "grad_norm": 16.98928300281049, "learning_rate": 3.2361544848529415e-06, "loss": 3.043, "step": 2057 }, { "epoch": 1.9619830771064235, "grad_norm": 17.89674816680369, "learning_rate": 3.2309655624968673e-06, "loss": 2.9348, "step": 2058 }, { "epoch": 1.962936479561435, "grad_norm": 17.76937881225272, "learning_rate": 3.2257788170959826e-06, "loss": 3.1878, "step": 2059 }, { "epoch": 1.963889882016446, "grad_norm": 17.511231051581177, "learning_rate": 3.220594255033046e-06, "loss": 3.0886, "step": 2060 }, { "epoch": 1.9648432844714576, "grad_norm": 16.510055599624483, "learning_rate": 3.215411882688123e-06, "loss": 3.252, "step": 2061 }, { "epoch": 1.9657966869264687, "grad_norm": 19.44314730083483, "learning_rate": 3.210231706438588e-06, "loss": 3.1945, "step": 2062 }, { "epoch": 1.9667500893814802, "grad_norm": 18.907062890746875, "learning_rate": 3.2050537326591093e-06, "loss": 3.0702, "step": 2063 }, { "epoch": 1.9677034918364915, "grad_norm": 21.453481772585054, "learning_rate": 3.1998779677216508e-06, "loss": 3.0694, "step": 2064 }, { "epoch": 1.9686568942915028, "grad_norm": 18.291616113515968, "learning_rate": 3.194704417995451e-06, "loss": 2.9801, "step": 2065 }, { "epoch": 1.969610296746514, "grad_norm": 18.739305302711212, "learning_rate": 3.1895330898470255e-06, "loss": 3.1344, "step": 2066 }, { "epoch": 1.9705636992015254, "grad_norm": 18.86710853479559, "learning_rate": 3.184363989640159e-06, "loss": 3.3575, "step": 2067 }, { "epoch": 1.971517101656537, "grad_norm": 20.665328917746432, "learning_rate": 3.1791971237358893e-06, "loss": 2.9946, "step": 2068 }, { "epoch": 1.972470504111548, "grad_norm": 17.19388781763234, "learning_rate": 3.17403249849251e-06, "loss": 3.1111, "step": 2069 }, { "epoch": 1.9734239065665595, "grad_norm": 17.696674776763853, "learning_rate": 3.1688701202655523e-06, "loss": 3.2484, "step": 2070 }, { "epoch": 1.9743773090215706, "grad_norm": 19.431012508903578, "learning_rate": 3.1637099954077877e-06, "loss": 3.3009, "step": 2071 }, { "epoch": 1.9753307114765821, "grad_norm": 21.076985997007043, "learning_rate": 3.1585521302692073e-06, "loss": 2.8168, "step": 2072 }, { "epoch": 1.9762841139315934, "grad_norm": 19.521195264012235, "learning_rate": 3.1533965311970296e-06, "loss": 3.2136, "step": 2073 }, { "epoch": 1.9772375163866047, "grad_norm": 21.82386477645536, "learning_rate": 3.148243204535677e-06, "loss": 3.0614, "step": 2074 }, { "epoch": 1.978190918841616, "grad_norm": 17.837024186429947, "learning_rate": 3.143092156626783e-06, "loss": 3.3079, "step": 2075 }, { "epoch": 1.9791443212966273, "grad_norm": 21.39631715898118, "learning_rate": 3.1379433938091695e-06, "loss": 2.9003, "step": 2076 }, { "epoch": 1.9800977237516386, "grad_norm": 18.179472395166805, "learning_rate": 3.1327969224188548e-06, "loss": 3.1598, "step": 2077 }, { "epoch": 1.98105112620665, "grad_norm": 19.828689548459046, "learning_rate": 3.1276527487890275e-06, "loss": 3.1552, "step": 2078 }, { "epoch": 1.9820045286616614, "grad_norm": 17.503017193971658, "learning_rate": 3.1225108792500546e-06, "loss": 3.0275, "step": 2079 }, { "epoch": 1.9829579311166725, "grad_norm": 20.38151117290308, "learning_rate": 3.117371320129469e-06, "loss": 3.0696, "step": 2080 }, { "epoch": 1.983911333571684, "grad_norm": 19.263652845997623, "learning_rate": 3.1122340777519555e-06, "loss": 3.0349, "step": 2081 }, { "epoch": 1.984864736026695, "grad_norm": 17.434730899665354, "learning_rate": 3.1070991584393534e-06, "loss": 3.1875, "step": 2082 }, { "epoch": 1.9858181384817066, "grad_norm": 25.68804309054819, "learning_rate": 3.1019665685106393e-06, "loss": 3.4444, "step": 2083 }, { "epoch": 1.986771540936718, "grad_norm": 24.466787611757226, "learning_rate": 3.0968363142819226e-06, "loss": 3.1378, "step": 2084 }, { "epoch": 1.9877249433917292, "grad_norm": 19.07379231735805, "learning_rate": 3.0917084020664423e-06, "loss": 2.9687, "step": 2085 }, { "epoch": 1.9886783458467405, "grad_norm": 17.50291003110257, "learning_rate": 3.0865828381745515e-06, "loss": 3.3502, "step": 2086 }, { "epoch": 1.9896317483017518, "grad_norm": 22.131279940091588, "learning_rate": 3.081459628913717e-06, "loss": 3.1001, "step": 2087 }, { "epoch": 1.9905851507567633, "grad_norm": 20.24825413821041, "learning_rate": 3.076338780588507e-06, "loss": 3.1485, "step": 2088 }, { "epoch": 1.9915385532117744, "grad_norm": 19.691321045933023, "learning_rate": 3.0712202995005808e-06, "loss": 2.8973, "step": 2089 }, { "epoch": 1.992491955666786, "grad_norm": 19.99662597337853, "learning_rate": 3.066104191948691e-06, "loss": 2.8098, "step": 2090 }, { "epoch": 1.993445358121797, "grad_norm": 17.07776126845036, "learning_rate": 3.0609904642286635e-06, "loss": 3.2449, "step": 2091 }, { "epoch": 1.9943987605768085, "grad_norm": 23.70466565711116, "learning_rate": 3.0558791226333974e-06, "loss": 3.1435, "step": 2092 }, { "epoch": 1.9953521630318198, "grad_norm": 19.662293949722013, "learning_rate": 3.050770173452857e-06, "loss": 3.109, "step": 2093 }, { "epoch": 1.9963055654868311, "grad_norm": 23.961064018081, "learning_rate": 3.045663622974061e-06, "loss": 3.017, "step": 2094 }, { "epoch": 1.9972589679418424, "grad_norm": 17.60937306544528, "learning_rate": 3.0405594774810775e-06, "loss": 3.0222, "step": 2095 }, { "epoch": 1.9982123703968537, "grad_norm": 17.515587994154803, "learning_rate": 3.035457743255016e-06, "loss": 3.1537, "step": 2096 }, { "epoch": 1.9991657728518653, "grad_norm": 21.791037959014076, "learning_rate": 3.030358426574012e-06, "loss": 3.303, "step": 2097 }, { "epoch": 2.0, "grad_norm": 22.327367345913, "learning_rate": 3.025261533713235e-06, "loss": 2.8766, "step": 2098 }, { "epoch": 2.0009534024550115, "grad_norm": 21.99950562605886, "learning_rate": 3.020167070944866e-06, "loss": 2.7564, "step": 2099 }, { "epoch": 2.0019068049100226, "grad_norm": 19.4870720698375, "learning_rate": 3.0150750445380995e-06, "loss": 2.6652, "step": 2100 }, { "epoch": 2.002860207365034, "grad_norm": 18.914650380993823, "learning_rate": 3.009985460759127e-06, "loss": 2.6507, "step": 2101 }, { "epoch": 2.003813609820045, "grad_norm": 30.05682484963558, "learning_rate": 3.0048983258711408e-06, "loss": 2.9147, "step": 2102 }, { "epoch": 2.0047670122750567, "grad_norm": 22.5186077318639, "learning_rate": 2.9998136461343096e-06, "loss": 2.9107, "step": 2103 }, { "epoch": 2.005720414730068, "grad_norm": 20.933484608469186, "learning_rate": 2.9947314278057927e-06, "loss": 2.6902, "step": 2104 }, { "epoch": 2.0066738171850793, "grad_norm": 20.323952894439536, "learning_rate": 2.9896516771397103e-06, "loss": 2.7022, "step": 2105 }, { "epoch": 2.0076272196400904, "grad_norm": 17.557539320849465, "learning_rate": 2.9845744003871534e-06, "loss": 2.6616, "step": 2106 }, { "epoch": 2.008580622095102, "grad_norm": 19.690279395695732, "learning_rate": 2.979499603796163e-06, "loss": 2.6376, "step": 2107 }, { "epoch": 2.0095340245501134, "grad_norm": 20.83887023726255, "learning_rate": 2.9744272936117323e-06, "loss": 2.5109, "step": 2108 }, { "epoch": 2.0104874270051245, "grad_norm": 19.165003654246203, "learning_rate": 2.9693574760757936e-06, "loss": 2.7683, "step": 2109 }, { "epoch": 2.011440829460136, "grad_norm": 22.365357212786833, "learning_rate": 2.9642901574272078e-06, "loss": 2.5082, "step": 2110 }, { "epoch": 2.012394231915147, "grad_norm": 22.448831537458158, "learning_rate": 2.9592253439017658e-06, "loss": 2.7405, "step": 2111 }, { "epoch": 2.0133476343701586, "grad_norm": 16.951912310032466, "learning_rate": 2.954163041732174e-06, "loss": 2.6967, "step": 2112 }, { "epoch": 2.0143010368251697, "grad_norm": 20.87635575420944, "learning_rate": 2.9491032571480488e-06, "loss": 3.001, "step": 2113 }, { "epoch": 2.015254439280181, "grad_norm": 23.088210852654786, "learning_rate": 2.9440459963759065e-06, "loss": 2.7078, "step": 2114 }, { "epoch": 2.0162078417351923, "grad_norm": 21.378714709702898, "learning_rate": 2.938991265639162e-06, "loss": 2.8534, "step": 2115 }, { "epoch": 2.017161244190204, "grad_norm": 18.959940476737774, "learning_rate": 2.9339390711581105e-06, "loss": 2.8293, "step": 2116 }, { "epoch": 2.018114646645215, "grad_norm": 18.17015012364712, "learning_rate": 2.9288894191499294e-06, "loss": 2.5805, "step": 2117 }, { "epoch": 2.0190680491002264, "grad_norm": 17.820651454645514, "learning_rate": 2.923842315828671e-06, "loss": 2.6917, "step": 2118 }, { "epoch": 2.020021451555238, "grad_norm": 18.405274136723538, "learning_rate": 2.9187977674052425e-06, "loss": 2.7231, "step": 2119 }, { "epoch": 2.020974854010249, "grad_norm": 16.84374989087031, "learning_rate": 2.9137557800874177e-06, "loss": 2.5426, "step": 2120 }, { "epoch": 2.0219282564652605, "grad_norm": 20.073754558708117, "learning_rate": 2.9087163600798107e-06, "loss": 2.8719, "step": 2121 }, { "epoch": 2.0228816589202716, "grad_norm": 20.09109134242132, "learning_rate": 2.9036795135838767e-06, "loss": 2.6069, "step": 2122 }, { "epoch": 2.023835061375283, "grad_norm": 19.37149087399203, "learning_rate": 2.8986452467979085e-06, "loss": 2.7291, "step": 2123 }, { "epoch": 2.024788463830294, "grad_norm": 20.892705762472076, "learning_rate": 2.8936135659170217e-06, "loss": 2.7113, "step": 2124 }, { "epoch": 2.0257418662853057, "grad_norm": 20.487900429406338, "learning_rate": 2.8885844771331507e-06, "loss": 2.5726, "step": 2125 }, { "epoch": 2.026695268740317, "grad_norm": 17.716658355276586, "learning_rate": 2.883557986635037e-06, "loss": 2.5984, "step": 2126 }, { "epoch": 2.0276486711953283, "grad_norm": 17.532879364300022, "learning_rate": 2.8785341006082278e-06, "loss": 2.6124, "step": 2127 }, { "epoch": 2.02860207365034, "grad_norm": 21.97996224950137, "learning_rate": 2.8735128252350677e-06, "loss": 2.8821, "step": 2128 }, { "epoch": 2.029555476105351, "grad_norm": 20.522290128460796, "learning_rate": 2.8684941666946796e-06, "loss": 2.5418, "step": 2129 }, { "epoch": 2.0305088785603624, "grad_norm": 20.16673628229572, "learning_rate": 2.8634781311629778e-06, "loss": 2.6541, "step": 2130 }, { "epoch": 2.0314622810153735, "grad_norm": 21.995364734272115, "learning_rate": 2.8584647248126385e-06, "loss": 2.779, "step": 2131 }, { "epoch": 2.032415683470385, "grad_norm": 20.379530219181262, "learning_rate": 2.853453953813108e-06, "loss": 2.3958, "step": 2132 }, { "epoch": 2.033369085925396, "grad_norm": 20.740339431825486, "learning_rate": 2.848445824330589e-06, "loss": 2.6567, "step": 2133 }, { "epoch": 2.0343224883804076, "grad_norm": 21.976036199643215, "learning_rate": 2.843440342528035e-06, "loss": 2.6339, "step": 2134 }, { "epoch": 2.0352758908354187, "grad_norm": 22.328816906400196, "learning_rate": 2.8384375145651354e-06, "loss": 2.6938, "step": 2135 }, { "epoch": 2.0362292932904302, "grad_norm": 21.349500083983788, "learning_rate": 2.8334373465983216e-06, "loss": 2.4675, "step": 2136 }, { "epoch": 2.0371826957454418, "grad_norm": 18.676077624579225, "learning_rate": 2.8284398447807437e-06, "loss": 2.525, "step": 2137 }, { "epoch": 2.038136098200453, "grad_norm": 21.861647936244132, "learning_rate": 2.8234450152622777e-06, "loss": 2.5575, "step": 2138 }, { "epoch": 2.0390895006554643, "grad_norm": 21.97873873104578, "learning_rate": 2.818452864189507e-06, "loss": 2.6272, "step": 2139 }, { "epoch": 2.0400429031104754, "grad_norm": 20.61133982256723, "learning_rate": 2.8134633977057236e-06, "loss": 2.7953, "step": 2140 }, { "epoch": 2.040996305565487, "grad_norm": 20.350265575603327, "learning_rate": 2.8084766219509098e-06, "loss": 2.769, "step": 2141 }, { "epoch": 2.041949708020498, "grad_norm": 20.083211475919086, "learning_rate": 2.8034925430617365e-06, "loss": 2.4545, "step": 2142 }, { "epoch": 2.0429031104755095, "grad_norm": 23.044273309746263, "learning_rate": 2.7985111671715626e-06, "loss": 2.7162, "step": 2143 }, { "epoch": 2.0438565129305206, "grad_norm": 24.18269863256037, "learning_rate": 2.7935325004104164e-06, "loss": 2.6254, "step": 2144 }, { "epoch": 2.044809915385532, "grad_norm": 21.562231551882135, "learning_rate": 2.7885565489049948e-06, "loss": 2.7377, "step": 2145 }, { "epoch": 2.045763317840543, "grad_norm": 21.314395559706085, "learning_rate": 2.7835833187786466e-06, "loss": 2.726, "step": 2146 }, { "epoch": 2.0467167202955547, "grad_norm": 21.837999840467674, "learning_rate": 2.7786128161513816e-06, "loss": 2.7184, "step": 2147 }, { "epoch": 2.0476701227505663, "grad_norm": 19.25668027377307, "learning_rate": 2.7736450471398435e-06, "loss": 2.7889, "step": 2148 }, { "epoch": 2.0486235252055773, "grad_norm": 20.523668705567857, "learning_rate": 2.76868001785732e-06, "loss": 2.8937, "step": 2149 }, { "epoch": 2.049576927660589, "grad_norm": 23.504923766448425, "learning_rate": 2.7637177344137236e-06, "loss": 2.7279, "step": 2150 }, { "epoch": 2.0505303301156, "grad_norm": 21.38961051364782, "learning_rate": 2.7587582029155862e-06, "loss": 2.5428, "step": 2151 }, { "epoch": 2.0514837325706115, "grad_norm": 18.97190230112355, "learning_rate": 2.7538014294660564e-06, "loss": 2.8621, "step": 2152 }, { "epoch": 2.0524371350256225, "grad_norm": 18.282592497273253, "learning_rate": 2.748847420164889e-06, "loss": 2.482, "step": 2153 }, { "epoch": 2.053390537480634, "grad_norm": 19.42227706144232, "learning_rate": 2.743896181108433e-06, "loss": 2.4851, "step": 2154 }, { "epoch": 2.054343939935645, "grad_norm": 19.522946853069342, "learning_rate": 2.738947718389632e-06, "loss": 2.5248, "step": 2155 }, { "epoch": 2.0552973423906566, "grad_norm": 20.02706000117516, "learning_rate": 2.734002038098015e-06, "loss": 2.3151, "step": 2156 }, { "epoch": 2.056250744845668, "grad_norm": 19.24730255381586, "learning_rate": 2.7290591463196805e-06, "loss": 2.7293, "step": 2157 }, { "epoch": 2.0572041473006792, "grad_norm": 20.038820059206227, "learning_rate": 2.7241190491372992e-06, "loss": 2.5571, "step": 2158 }, { "epoch": 2.0581575497556908, "grad_norm": 21.22849874149918, "learning_rate": 2.7191817526301074e-06, "loss": 2.5327, "step": 2159 }, { "epoch": 2.059110952210702, "grad_norm": 18.464729033556967, "learning_rate": 2.7142472628738846e-06, "loss": 2.6021, "step": 2160 }, { "epoch": 2.0600643546657134, "grad_norm": 20.431493206993657, "learning_rate": 2.709315585940967e-06, "loss": 2.6246, "step": 2161 }, { "epoch": 2.0610177571207244, "grad_norm": 22.45422660819986, "learning_rate": 2.70438672790022e-06, "loss": 2.3564, "step": 2162 }, { "epoch": 2.061971159575736, "grad_norm": 19.44295398578306, "learning_rate": 2.699460694817046e-06, "loss": 2.7012, "step": 2163 }, { "epoch": 2.062924562030747, "grad_norm": 17.99837238658574, "learning_rate": 2.69453749275337e-06, "loss": 2.4443, "step": 2164 }, { "epoch": 2.0638779644857586, "grad_norm": 22.91066320169031, "learning_rate": 2.6896171277676355e-06, "loss": 2.6341, "step": 2165 }, { "epoch": 2.06483136694077, "grad_norm": 19.55954978788445, "learning_rate": 2.684699605914789e-06, "loss": 2.7989, "step": 2166 }, { "epoch": 2.065784769395781, "grad_norm": 20.68821835377129, "learning_rate": 2.6797849332462787e-06, "loss": 2.3747, "step": 2167 }, { "epoch": 2.0667381718507927, "grad_norm": 16.883229222622994, "learning_rate": 2.6748731158100528e-06, "loss": 2.4799, "step": 2168 }, { "epoch": 2.0676915743058037, "grad_norm": 15.648674039120161, "learning_rate": 2.6699641596505416e-06, "loss": 2.5149, "step": 2169 }, { "epoch": 2.0686449767608153, "grad_norm": 19.150139866071836, "learning_rate": 2.665058070808654e-06, "loss": 2.8404, "step": 2170 }, { "epoch": 2.0695983792158263, "grad_norm": 23.188402575702337, "learning_rate": 2.660154855321775e-06, "loss": 2.5776, "step": 2171 }, { "epoch": 2.070551781670838, "grad_norm": 19.669684934602238, "learning_rate": 2.655254519223746e-06, "loss": 2.6303, "step": 2172 }, { "epoch": 2.071505184125849, "grad_norm": 21.675552893660406, "learning_rate": 2.6503570685448694e-06, "loss": 2.6338, "step": 2173 }, { "epoch": 2.0724585865808605, "grad_norm": 24.448089331770632, "learning_rate": 2.6454625093118968e-06, "loss": 2.7204, "step": 2174 }, { "epoch": 2.073411989035872, "grad_norm": 22.966849711598, "learning_rate": 2.640570847548022e-06, "loss": 2.9445, "step": 2175 }, { "epoch": 2.074365391490883, "grad_norm": 28.56161838427609, "learning_rate": 2.6356820892728752e-06, "loss": 2.8678, "step": 2176 }, { "epoch": 2.0753187939458946, "grad_norm": 20.988447173646257, "learning_rate": 2.6307962405025055e-06, "loss": 2.5783, "step": 2177 }, { "epoch": 2.0762721964009057, "grad_norm": 17.502711369003226, "learning_rate": 2.6259133072493926e-06, "loss": 2.5675, "step": 2178 }, { "epoch": 2.077225598855917, "grad_norm": 19.27270392414742, "learning_rate": 2.621033295522418e-06, "loss": 2.4612, "step": 2179 }, { "epoch": 2.0781790013109283, "grad_norm": 20.63500484866574, "learning_rate": 2.616156211326875e-06, "loss": 2.4913, "step": 2180 }, { "epoch": 2.0791324037659398, "grad_norm": 19.194748817572435, "learning_rate": 2.611282060664454e-06, "loss": 2.4723, "step": 2181 }, { "epoch": 2.080085806220951, "grad_norm": 18.589163780306475, "learning_rate": 2.6064108495332297e-06, "loss": 2.6751, "step": 2182 }, { "epoch": 2.0810392086759624, "grad_norm": 17.64463506504062, "learning_rate": 2.601542583927666e-06, "loss": 2.5722, "step": 2183 }, { "epoch": 2.0819926111309734, "grad_norm": 21.364974771362093, "learning_rate": 2.5966772698386e-06, "loss": 2.706, "step": 2184 }, { "epoch": 2.082946013585985, "grad_norm": 22.007595065093454, "learning_rate": 2.5918149132532337e-06, "loss": 2.6589, "step": 2185 }, { "epoch": 2.0838994160409965, "grad_norm": 22.383680417836437, "learning_rate": 2.586955520155133e-06, "loss": 2.7181, "step": 2186 }, { "epoch": 2.0848528184960076, "grad_norm": 22.19876432755705, "learning_rate": 2.582099096524219e-06, "loss": 2.8545, "step": 2187 }, { "epoch": 2.085806220951019, "grad_norm": 20.202609910563876, "learning_rate": 2.57724564833675e-06, "loss": 2.3857, "step": 2188 }, { "epoch": 2.08675962340603, "grad_norm": 17.728449245157464, "learning_rate": 2.5723951815653313e-06, "loss": 2.8654, "step": 2189 }, { "epoch": 2.0877130258610417, "grad_norm": 19.2743736545841, "learning_rate": 2.5675477021788963e-06, "loss": 2.5654, "step": 2190 }, { "epoch": 2.0886664283160528, "grad_norm": 21.412984095422033, "learning_rate": 2.562703216142704e-06, "loss": 2.5817, "step": 2191 }, { "epoch": 2.0896198307710643, "grad_norm": 24.810409953150025, "learning_rate": 2.557861729418326e-06, "loss": 2.6806, "step": 2192 }, { "epoch": 2.0905732332260754, "grad_norm": 22.272238106256083, "learning_rate": 2.553023247963644e-06, "loss": 2.6995, "step": 2193 }, { "epoch": 2.091526635681087, "grad_norm": 21.53574861552773, "learning_rate": 2.5481877777328428e-06, "loss": 2.8753, "step": 2194 }, { "epoch": 2.0924800381360984, "grad_norm": 23.808463119074585, "learning_rate": 2.5433553246764026e-06, "loss": 2.6057, "step": 2195 }, { "epoch": 2.0934334405911095, "grad_norm": 18.934437822511523, "learning_rate": 2.5385258947410908e-06, "loss": 2.5237, "step": 2196 }, { "epoch": 2.094386843046121, "grad_norm": 20.132856265803174, "learning_rate": 2.5336994938699504e-06, "loss": 2.5836, "step": 2197 }, { "epoch": 2.095340245501132, "grad_norm": 22.762740191497713, "learning_rate": 2.5288761280022987e-06, "loss": 2.6782, "step": 2198 }, { "epoch": 2.0962936479561436, "grad_norm": 19.678432619701315, "learning_rate": 2.5240558030737215e-06, "loss": 2.9117, "step": 2199 }, { "epoch": 2.0972470504111547, "grad_norm": 24.03079858281074, "learning_rate": 2.5192385250160587e-06, "loss": 2.7753, "step": 2200 }, { "epoch": 2.098200452866166, "grad_norm": 21.44463063336838, "learning_rate": 2.5144242997574026e-06, "loss": 2.4443, "step": 2201 }, { "epoch": 2.0991538553211773, "grad_norm": 18.00505287972496, "learning_rate": 2.50961313322209e-06, "loss": 2.5409, "step": 2202 }, { "epoch": 2.100107257776189, "grad_norm": 18.818723102513108, "learning_rate": 2.5048050313306904e-06, "loss": 2.5938, "step": 2203 }, { "epoch": 2.1010606602312003, "grad_norm": 19.08994693721465, "learning_rate": 2.5000000000000015e-06, "loss": 2.7617, "step": 2204 }, { "epoch": 2.1020140626862114, "grad_norm": 17.74896312356385, "learning_rate": 2.495198045143045e-06, "loss": 2.7705, "step": 2205 }, { "epoch": 2.102967465141223, "grad_norm": 24.070624486315474, "learning_rate": 2.4903991726690585e-06, "loss": 2.6528, "step": 2206 }, { "epoch": 2.103920867596234, "grad_norm": 20.371600802400025, "learning_rate": 2.4856033884834834e-06, "loss": 2.6256, "step": 2207 }, { "epoch": 2.1048742700512455, "grad_norm": 21.977766356857824, "learning_rate": 2.4808106984879597e-06, "loss": 2.4145, "step": 2208 }, { "epoch": 2.1058276725062566, "grad_norm": 26.00957228277134, "learning_rate": 2.4760211085803213e-06, "loss": 2.9768, "step": 2209 }, { "epoch": 2.106781074961268, "grad_norm": 25.167128933520875, "learning_rate": 2.471234624654591e-06, "loss": 2.5915, "step": 2210 }, { "epoch": 2.107734477416279, "grad_norm": 24.592855781745147, "learning_rate": 2.4664512526009608e-06, "loss": 2.5679, "step": 2211 }, { "epoch": 2.1086878798712907, "grad_norm": 21.1799391924718, "learning_rate": 2.461670998305802e-06, "loss": 3.0323, "step": 2212 }, { "epoch": 2.1096412823263018, "grad_norm": 18.914213802536278, "learning_rate": 2.456893867651641e-06, "loss": 2.5637, "step": 2213 }, { "epoch": 2.1105946847813133, "grad_norm": 19.94302458557168, "learning_rate": 2.4521198665171675e-06, "loss": 2.6393, "step": 2214 }, { "epoch": 2.111548087236325, "grad_norm": 21.596277250508276, "learning_rate": 2.4473490007772164e-06, "loss": 2.5119, "step": 2215 }, { "epoch": 2.112501489691336, "grad_norm": 16.996153930030733, "learning_rate": 2.4425812763027672e-06, "loss": 2.7159, "step": 2216 }, { "epoch": 2.1134548921463474, "grad_norm": 20.00335132057897, "learning_rate": 2.4378166989609275e-06, "loss": 2.7705, "step": 2217 }, { "epoch": 2.1144082946013585, "grad_norm": 20.091202217898257, "learning_rate": 2.4330552746149406e-06, "loss": 2.6404, "step": 2218 }, { "epoch": 2.11536169705637, "grad_norm": 23.365819986798563, "learning_rate": 2.428297009124161e-06, "loss": 2.7167, "step": 2219 }, { "epoch": 2.116315099511381, "grad_norm": 21.163471956324706, "learning_rate": 2.4235419083440615e-06, "loss": 2.6361, "step": 2220 }, { "epoch": 2.1172685019663926, "grad_norm": 19.103092538869852, "learning_rate": 2.4187899781262197e-06, "loss": 2.8205, "step": 2221 }, { "epoch": 2.1182219044214037, "grad_norm": 23.776890476385233, "learning_rate": 2.414041224318313e-06, "loss": 2.7263, "step": 2222 }, { "epoch": 2.119175306876415, "grad_norm": 24.358387705257588, "learning_rate": 2.4092956527641066e-06, "loss": 2.6002, "step": 2223 }, { "epoch": 2.1201287093314267, "grad_norm": 17.661354626316246, "learning_rate": 2.404553269303448e-06, "loss": 2.5834, "step": 2224 }, { "epoch": 2.121082111786438, "grad_norm": 18.87106660668636, "learning_rate": 2.3998140797722664e-06, "loss": 2.5071, "step": 2225 }, { "epoch": 2.1220355142414493, "grad_norm": 22.30736402876568, "learning_rate": 2.3950780900025594e-06, "loss": 2.7056, "step": 2226 }, { "epoch": 2.1229889166964604, "grad_norm": 22.094234274623826, "learning_rate": 2.390345305822388e-06, "loss": 2.9333, "step": 2227 }, { "epoch": 2.123942319151472, "grad_norm": 21.985688605311207, "learning_rate": 2.3856157330558625e-06, "loss": 2.8373, "step": 2228 }, { "epoch": 2.124895721606483, "grad_norm": 24.728755325736266, "learning_rate": 2.3808893775231503e-06, "loss": 2.6823, "step": 2229 }, { "epoch": 2.1258491240614945, "grad_norm": 24.476375378016733, "learning_rate": 2.3761662450404493e-06, "loss": 2.6185, "step": 2230 }, { "epoch": 2.1268025265165056, "grad_norm": 21.882152688236324, "learning_rate": 2.3714463414199993e-06, "loss": 2.7857, "step": 2231 }, { "epoch": 2.127755928971517, "grad_norm": 25.834132311847874, "learning_rate": 2.366729672470065e-06, "loss": 2.5252, "step": 2232 }, { "epoch": 2.1287093314265286, "grad_norm": 19.912610779458696, "learning_rate": 2.3620162439949306e-06, "loss": 2.6774, "step": 2233 }, { "epoch": 2.1296627338815397, "grad_norm": 18.953962778999355, "learning_rate": 2.3573060617948885e-06, "loss": 2.6649, "step": 2234 }, { "epoch": 2.1306161363365512, "grad_norm": 23.506585734368606, "learning_rate": 2.3525991316662427e-06, "loss": 2.5416, "step": 2235 }, { "epoch": 2.1315695387915623, "grad_norm": 19.211110173989518, "learning_rate": 2.3478954594012884e-06, "loss": 2.655, "step": 2236 }, { "epoch": 2.132522941246574, "grad_norm": 17.758630731644693, "learning_rate": 2.3431950507883165e-06, "loss": 2.7722, "step": 2237 }, { "epoch": 2.133476343701585, "grad_norm": 23.5914074190466, "learning_rate": 2.3384979116116034e-06, "loss": 2.6018, "step": 2238 }, { "epoch": 2.1344297461565964, "grad_norm": 23.744322891503725, "learning_rate": 2.333804047651395e-06, "loss": 2.7539, "step": 2239 }, { "epoch": 2.1353831486116075, "grad_norm": 21.23212494726595, "learning_rate": 2.329113464683913e-06, "loss": 2.7212, "step": 2240 }, { "epoch": 2.136336551066619, "grad_norm": 19.623468390331094, "learning_rate": 2.3244261684813415e-06, "loss": 2.7094, "step": 2241 }, { "epoch": 2.1372899535216305, "grad_norm": 20.393586985196222, "learning_rate": 2.3197421648118134e-06, "loss": 2.8896, "step": 2242 }, { "epoch": 2.1382433559766416, "grad_norm": 25.18900613488441, "learning_rate": 2.315061459439419e-06, "loss": 2.7463, "step": 2243 }, { "epoch": 2.139196758431653, "grad_norm": 20.03022867041527, "learning_rate": 2.310384058124181e-06, "loss": 2.5353, "step": 2244 }, { "epoch": 2.140150160886664, "grad_norm": 15.233638518335932, "learning_rate": 2.3057099666220624e-06, "loss": 2.5931, "step": 2245 }, { "epoch": 2.1411035633416757, "grad_norm": 18.084339369319032, "learning_rate": 2.3010391906849512e-06, "loss": 2.6043, "step": 2246 }, { "epoch": 2.142056965796687, "grad_norm": 17.5102772014411, "learning_rate": 2.296371736060655e-06, "loss": 2.5967, "step": 2247 }, { "epoch": 2.1430103682516983, "grad_norm": 19.10723736327892, "learning_rate": 2.2917076084928953e-06, "loss": 2.4954, "step": 2248 }, { "epoch": 2.1439637707067094, "grad_norm": 16.660122958889133, "learning_rate": 2.2870468137212987e-06, "loss": 2.7566, "step": 2249 }, { "epoch": 2.144917173161721, "grad_norm": 19.334658955567132, "learning_rate": 2.2823893574813864e-06, "loss": 2.6307, "step": 2250 }, { "epoch": 2.1458705756167324, "grad_norm": 20.73143157513678, "learning_rate": 2.277735245504579e-06, "loss": 2.8146, "step": 2251 }, { "epoch": 2.1468239780717435, "grad_norm": 19.592547430433868, "learning_rate": 2.273084483518176e-06, "loss": 2.6049, "step": 2252 }, { "epoch": 2.147777380526755, "grad_norm": 16.55992881370064, "learning_rate": 2.2684370772453586e-06, "loss": 2.5527, "step": 2253 }, { "epoch": 2.148730782981766, "grad_norm": 16.966324156688568, "learning_rate": 2.2637930324051748e-06, "loss": 2.7005, "step": 2254 }, { "epoch": 2.1496841854367776, "grad_norm": 20.986058824974844, "learning_rate": 2.2591523547125348e-06, "loss": 2.6103, "step": 2255 }, { "epoch": 2.1506375878917887, "grad_norm": 21.219166538997978, "learning_rate": 2.25451504987821e-06, "loss": 2.8134, "step": 2256 }, { "epoch": 2.1515909903468002, "grad_norm": 16.830113730814144, "learning_rate": 2.24988112360882e-06, "loss": 2.5692, "step": 2257 }, { "epoch": 2.1525443928018113, "grad_norm": 18.269835719090903, "learning_rate": 2.245250581606826e-06, "loss": 2.3709, "step": 2258 }, { "epoch": 2.153497795256823, "grad_norm": 17.16148340443342, "learning_rate": 2.2406234295705215e-06, "loss": 2.7204, "step": 2259 }, { "epoch": 2.154451197711834, "grad_norm": 21.29664172922115, "learning_rate": 2.2359996731940348e-06, "loss": 2.7808, "step": 2260 }, { "epoch": 2.1554046001668454, "grad_norm": 26.188209472791637, "learning_rate": 2.2313793181673077e-06, "loss": 2.7688, "step": 2261 }, { "epoch": 2.156358002621857, "grad_norm": 22.411553337317507, "learning_rate": 2.2267623701761033e-06, "loss": 2.8248, "step": 2262 }, { "epoch": 2.157311405076868, "grad_norm": 26.160041699332915, "learning_rate": 2.2221488349019903e-06, "loss": 2.6123, "step": 2263 }, { "epoch": 2.1582648075318795, "grad_norm": 19.743297311795168, "learning_rate": 2.2175387180223333e-06, "loss": 2.7215, "step": 2264 }, { "epoch": 2.1592182099868906, "grad_norm": 18.95545966700954, "learning_rate": 2.212932025210296e-06, "loss": 2.7419, "step": 2265 }, { "epoch": 2.160171612441902, "grad_norm": 21.498955332451445, "learning_rate": 2.208328762134826e-06, "loss": 2.6801, "step": 2266 }, { "epoch": 2.161125014896913, "grad_norm": 21.47708717017721, "learning_rate": 2.2037289344606516e-06, "loss": 2.5169, "step": 2267 }, { "epoch": 2.1620784173519247, "grad_norm": 18.03545278597108, "learning_rate": 2.1991325478482695e-06, "loss": 2.6052, "step": 2268 }, { "epoch": 2.163031819806936, "grad_norm": 19.603740158039656, "learning_rate": 2.194539607953948e-06, "loss": 2.7576, "step": 2269 }, { "epoch": 2.1639852222619473, "grad_norm": 24.21975694962942, "learning_rate": 2.189950120429708e-06, "loss": 2.7237, "step": 2270 }, { "epoch": 2.1649386247169584, "grad_norm": 21.16547019780456, "learning_rate": 2.1853640909233246e-06, "loss": 2.7822, "step": 2271 }, { "epoch": 2.16589202717197, "grad_norm": 23.937084500818667, "learning_rate": 2.1807815250783194e-06, "loss": 2.7587, "step": 2272 }, { "epoch": 2.1668454296269815, "grad_norm": 28.53306931300425, "learning_rate": 2.1762024285339504e-06, "loss": 2.8105, "step": 2273 }, { "epoch": 2.1677988320819925, "grad_norm": 22.264597606531158, "learning_rate": 2.1716268069252045e-06, "loss": 2.8942, "step": 2274 }, { "epoch": 2.168752234537004, "grad_norm": 20.31455978096138, "learning_rate": 2.167054665882791e-06, "loss": 2.7938, "step": 2275 }, { "epoch": 2.169705636992015, "grad_norm": 21.76255550861574, "learning_rate": 2.162486011033142e-06, "loss": 2.5404, "step": 2276 }, { "epoch": 2.1706590394470267, "grad_norm": 19.115201596424317, "learning_rate": 2.1579208479983944e-06, "loss": 2.7223, "step": 2277 }, { "epoch": 2.1716124419020377, "grad_norm": 19.05312474734761, "learning_rate": 2.1533591823963927e-06, "loss": 2.74, "step": 2278 }, { "epoch": 2.1725658443570492, "grad_norm": 22.16815516985506, "learning_rate": 2.1488010198406722e-06, "loss": 2.7579, "step": 2279 }, { "epoch": 2.1735192468120603, "grad_norm": 24.074067092374424, "learning_rate": 2.1442463659404587e-06, "loss": 2.3327, "step": 2280 }, { "epoch": 2.174472649267072, "grad_norm": 27.588284928133454, "learning_rate": 2.139695226300663e-06, "loss": 2.6389, "step": 2281 }, { "epoch": 2.1754260517220834, "grad_norm": 19.275176898083526, "learning_rate": 2.1351476065218703e-06, "loss": 2.6587, "step": 2282 }, { "epoch": 2.1763794541770944, "grad_norm": 22.852922200316183, "learning_rate": 2.130603512200332e-06, "loss": 2.7417, "step": 2283 }, { "epoch": 2.177332856632106, "grad_norm": 24.39101330444298, "learning_rate": 2.1260629489279662e-06, "loss": 2.5875, "step": 2284 }, { "epoch": 2.178286259087117, "grad_norm": 26.018463207588727, "learning_rate": 2.1215259222923383e-06, "loss": 2.7694, "step": 2285 }, { "epoch": 2.1792396615421286, "grad_norm": 19.363154723817797, "learning_rate": 2.116992437876669e-06, "loss": 2.6269, "step": 2286 }, { "epoch": 2.1801930639971396, "grad_norm": 20.51207394838668, "learning_rate": 2.1124625012598137e-06, "loss": 2.7032, "step": 2287 }, { "epoch": 2.181146466452151, "grad_norm": 22.596161081519416, "learning_rate": 2.1079361180162657e-06, "loss": 2.7565, "step": 2288 }, { "epoch": 2.1820998689071622, "grad_norm": 21.90181761215558, "learning_rate": 2.1034132937161468e-06, "loss": 2.5753, "step": 2289 }, { "epoch": 2.1830532713621738, "grad_norm": 21.466608553553538, "learning_rate": 2.098894033925194e-06, "loss": 2.3674, "step": 2290 }, { "epoch": 2.1840066738171853, "grad_norm": 20.2517834691018, "learning_rate": 2.0943783442047618e-06, "loss": 2.4886, "step": 2291 }, { "epoch": 2.1849600762721963, "grad_norm": 21.208698541915677, "learning_rate": 2.089866230111813e-06, "loss": 2.7407, "step": 2292 }, { "epoch": 2.185913478727208, "grad_norm": 25.07798151018785, "learning_rate": 2.085357697198904e-06, "loss": 2.441, "step": 2293 }, { "epoch": 2.186866881182219, "grad_norm": 23.758890791014274, "learning_rate": 2.080852751014191e-06, "loss": 2.592, "step": 2294 }, { "epoch": 2.1878202836372305, "grad_norm": 22.853786340721634, "learning_rate": 2.076351397101411e-06, "loss": 2.7746, "step": 2295 }, { "epoch": 2.1887736860922415, "grad_norm": 21.457098968565262, "learning_rate": 2.0718536409998834e-06, "loss": 2.6328, "step": 2296 }, { "epoch": 2.189727088547253, "grad_norm": 25.382640843479223, "learning_rate": 2.067359488244501e-06, "loss": 2.6111, "step": 2297 }, { "epoch": 2.190680491002264, "grad_norm": 20.7416267814072, "learning_rate": 2.062868944365722e-06, "loss": 2.6937, "step": 2298 }, { "epoch": 2.1916338934572757, "grad_norm": 23.49128098604401, "learning_rate": 2.0583820148895585e-06, "loss": 2.4729, "step": 2299 }, { "epoch": 2.192587295912287, "grad_norm": 15.941249907952303, "learning_rate": 2.053898705337583e-06, "loss": 2.669, "step": 2300 }, { "epoch": 2.1935406983672983, "grad_norm": 20.444075870369208, "learning_rate": 2.0494190212269054e-06, "loss": 2.4019, "step": 2301 }, { "epoch": 2.19449410082231, "grad_norm": 20.285405458815834, "learning_rate": 2.0449429680701798e-06, "loss": 2.6886, "step": 2302 }, { "epoch": 2.195447503277321, "grad_norm": 23.341117089135484, "learning_rate": 2.0404705513755903e-06, "loss": 2.7108, "step": 2303 }, { "epoch": 2.1964009057323324, "grad_norm": 22.81109797763696, "learning_rate": 2.0360017766468466e-06, "loss": 2.3586, "step": 2304 }, { "epoch": 2.1973543081873435, "grad_norm": 20.241971965358054, "learning_rate": 2.0315366493831755e-06, "loss": 2.6116, "step": 2305 }, { "epoch": 2.198307710642355, "grad_norm": 20.699027735956374, "learning_rate": 2.027075175079313e-06, "loss": 2.3846, "step": 2306 }, { "epoch": 2.199261113097366, "grad_norm": 18.432376777631692, "learning_rate": 2.0226173592255055e-06, "loss": 2.5828, "step": 2307 }, { "epoch": 2.2002145155523776, "grad_norm": 20.1626223216912, "learning_rate": 2.0181632073074925e-06, "loss": 2.628, "step": 2308 }, { "epoch": 2.201167918007389, "grad_norm": 25.882775914326594, "learning_rate": 2.0137127248065103e-06, "loss": 2.7747, "step": 2309 }, { "epoch": 2.2021213204624, "grad_norm": 20.37671139071885, "learning_rate": 2.0092659171992708e-06, "loss": 2.7203, "step": 2310 }, { "epoch": 2.2030747229174117, "grad_norm": 21.4026138778507, "learning_rate": 2.004822789957973e-06, "loss": 2.6059, "step": 2311 }, { "epoch": 2.2040281253724228, "grad_norm": 23.978720928900184, "learning_rate": 2.000383348550279e-06, "loss": 2.9371, "step": 2312 }, { "epoch": 2.2049815278274343, "grad_norm": 24.11025064609309, "learning_rate": 1.99594759843932e-06, "loss": 2.6221, "step": 2313 }, { "epoch": 2.2059349302824454, "grad_norm": 22.40285751418615, "learning_rate": 1.991515545083684e-06, "loss": 2.5117, "step": 2314 }, { "epoch": 2.206888332737457, "grad_norm": 20.950220996203573, "learning_rate": 1.9870871939374114e-06, "loss": 2.6017, "step": 2315 }, { "epoch": 2.207841735192468, "grad_norm": 20.385744774876727, "learning_rate": 1.9826625504499807e-06, "loss": 2.5916, "step": 2316 }, { "epoch": 2.2087951376474795, "grad_norm": 19.965913755008998, "learning_rate": 1.9782416200663152e-06, "loss": 2.6414, "step": 2317 }, { "epoch": 2.209748540102491, "grad_norm": 24.359945711636932, "learning_rate": 1.9738244082267614e-06, "loss": 2.3452, "step": 2318 }, { "epoch": 2.210701942557502, "grad_norm": 23.132623833371024, "learning_rate": 1.9694109203670964e-06, "loss": 2.589, "step": 2319 }, { "epoch": 2.2116553450125136, "grad_norm": 20.175110984470333, "learning_rate": 1.965001161918513e-06, "loss": 2.5054, "step": 2320 }, { "epoch": 2.2126087474675247, "grad_norm": 16.342551196869763, "learning_rate": 1.9605951383076105e-06, "loss": 2.6713, "step": 2321 }, { "epoch": 2.213562149922536, "grad_norm": 22.122398844867806, "learning_rate": 1.956192854956397e-06, "loss": 2.7213, "step": 2322 }, { "epoch": 2.2145155523775473, "grad_norm": 24.43950135197836, "learning_rate": 1.9517943172822756e-06, "loss": 2.6263, "step": 2323 }, { "epoch": 2.215468954832559, "grad_norm": 18.15336714101878, "learning_rate": 1.947399530698043e-06, "loss": 2.6037, "step": 2324 }, { "epoch": 2.21642235728757, "grad_norm": 19.532253363062868, "learning_rate": 1.943008500611876e-06, "loss": 2.4657, "step": 2325 }, { "epoch": 2.2173757597425814, "grad_norm": 20.900215339635317, "learning_rate": 1.938621232427327e-06, "loss": 2.6952, "step": 2326 }, { "epoch": 2.2183291621975925, "grad_norm": 23.607467974749554, "learning_rate": 1.9342377315433253e-06, "loss": 2.4555, "step": 2327 }, { "epoch": 2.219282564652604, "grad_norm": 20.82167073117579, "learning_rate": 1.92985800335416e-06, "loss": 2.6158, "step": 2328 }, { "epoch": 2.2202359671076155, "grad_norm": 21.012770714653925, "learning_rate": 1.9254820532494788e-06, "loss": 2.608, "step": 2329 }, { "epoch": 2.2211893695626266, "grad_norm": 18.82760235090683, "learning_rate": 1.9211098866142825e-06, "loss": 2.5414, "step": 2330 }, { "epoch": 2.222142772017638, "grad_norm": 19.0519420011625, "learning_rate": 1.9167415088289103e-06, "loss": 2.7001, "step": 2331 }, { "epoch": 2.223096174472649, "grad_norm": 23.344688428254937, "learning_rate": 1.912376925269041e-06, "loss": 2.7348, "step": 2332 }, { "epoch": 2.2240495769276607, "grad_norm": 20.273238130236873, "learning_rate": 1.9080161413056876e-06, "loss": 2.6278, "step": 2333 }, { "epoch": 2.2250029793826718, "grad_norm": 21.279882462162995, "learning_rate": 1.9036591623051836e-06, "loss": 2.614, "step": 2334 }, { "epoch": 2.2259563818376833, "grad_norm": 19.333868967966, "learning_rate": 1.8993059936291848e-06, "loss": 2.8104, "step": 2335 }, { "epoch": 2.2269097842926944, "grad_norm": 27.42875024351238, "learning_rate": 1.894956640634652e-06, "loss": 2.5368, "step": 2336 }, { "epoch": 2.227863186747706, "grad_norm": 17.23693890611471, "learning_rate": 1.8906111086738522e-06, "loss": 2.5145, "step": 2337 }, { "epoch": 2.228816589202717, "grad_norm": 19.874143986722547, "learning_rate": 1.8862694030943528e-06, "loss": 2.6506, "step": 2338 }, { "epoch": 2.2297699916577285, "grad_norm": 24.28278175231808, "learning_rate": 1.8819315292390116e-06, "loss": 2.6818, "step": 2339 }, { "epoch": 2.23072339411274, "grad_norm": 25.978907792030906, "learning_rate": 1.8775974924459716e-06, "loss": 2.667, "step": 2340 }, { "epoch": 2.231676796567751, "grad_norm": 18.79020888433169, "learning_rate": 1.8732672980486494e-06, "loss": 2.79, "step": 2341 }, { "epoch": 2.2326301990227626, "grad_norm": 19.878772457119815, "learning_rate": 1.8689409513757396e-06, "loss": 2.735, "step": 2342 }, { "epoch": 2.2335836014777737, "grad_norm": 22.29439513769313, "learning_rate": 1.8646184577511995e-06, "loss": 2.4843, "step": 2343 }, { "epoch": 2.234537003932785, "grad_norm": 22.192174953669962, "learning_rate": 1.860299822494241e-06, "loss": 2.6427, "step": 2344 }, { "epoch": 2.2354904063877963, "grad_norm": 18.78084009742784, "learning_rate": 1.8559850509193339e-06, "loss": 2.8848, "step": 2345 }, { "epoch": 2.236443808842808, "grad_norm": 25.13819803301557, "learning_rate": 1.8516741483361916e-06, "loss": 2.7825, "step": 2346 }, { "epoch": 2.237397211297819, "grad_norm": 21.507577954614177, "learning_rate": 1.8473671200497622e-06, "loss": 2.6964, "step": 2347 }, { "epoch": 2.2383506137528304, "grad_norm": 21.378219612756915, "learning_rate": 1.8430639713602317e-06, "loss": 2.7389, "step": 2348 }, { "epoch": 2.239304016207842, "grad_norm": 20.921983092690652, "learning_rate": 1.8387647075630106e-06, "loss": 2.6691, "step": 2349 }, { "epoch": 2.240257418662853, "grad_norm": 19.367537653414125, "learning_rate": 1.8344693339487252e-06, "loss": 2.5075, "step": 2350 }, { "epoch": 2.2412108211178645, "grad_norm": 20.350839418578026, "learning_rate": 1.83017785580322e-06, "loss": 2.7915, "step": 2351 }, { "epoch": 2.2421642235728756, "grad_norm": 19.813163829712522, "learning_rate": 1.8258902784075394e-06, "loss": 2.6276, "step": 2352 }, { "epoch": 2.243117626027887, "grad_norm": 19.766107450105483, "learning_rate": 1.8216066070379335e-06, "loss": 2.7384, "step": 2353 }, { "epoch": 2.244071028482898, "grad_norm": 24.021349792435604, "learning_rate": 1.8173268469658424e-06, "loss": 2.5286, "step": 2354 }, { "epoch": 2.2450244309379097, "grad_norm": 22.681318213066668, "learning_rate": 1.8130510034578964e-06, "loss": 2.5817, "step": 2355 }, { "epoch": 2.245977833392921, "grad_norm": 20.87533753835966, "learning_rate": 1.808779081775901e-06, "loss": 2.6768, "step": 2356 }, { "epoch": 2.2469312358479323, "grad_norm": 21.141963943650538, "learning_rate": 1.8045110871768373e-06, "loss": 2.7972, "step": 2357 }, { "epoch": 2.247884638302944, "grad_norm": 20.832737222621784, "learning_rate": 1.8002470249128557e-06, "loss": 2.5249, "step": 2358 }, { "epoch": 2.248838040757955, "grad_norm": 19.41689728216971, "learning_rate": 1.7959869002312668e-06, "loss": 2.7097, "step": 2359 }, { "epoch": 2.2497914432129664, "grad_norm": 20.482210474759228, "learning_rate": 1.7917307183745353e-06, "loss": 2.7135, "step": 2360 }, { "epoch": 2.2507448456679775, "grad_norm": 27.681399511013794, "learning_rate": 1.787478484580275e-06, "loss": 2.7582, "step": 2361 }, { "epoch": 2.251698248122989, "grad_norm": 21.1265606689176, "learning_rate": 1.7832302040812394e-06, "loss": 2.5244, "step": 2362 }, { "epoch": 2.252651650578, "grad_norm": 19.056466947676874, "learning_rate": 1.7789858821053153e-06, "loss": 2.684, "step": 2363 }, { "epoch": 2.2536050530330116, "grad_norm": 18.009659575171163, "learning_rate": 1.7747455238755223e-06, "loss": 2.7203, "step": 2364 }, { "epoch": 2.2545584554880227, "grad_norm": 19.96938180104416, "learning_rate": 1.7705091346100017e-06, "loss": 2.6489, "step": 2365 }, { "epoch": 2.255511857943034, "grad_norm": 20.103208981728145, "learning_rate": 1.7662767195220104e-06, "loss": 2.9255, "step": 2366 }, { "epoch": 2.2564652603980457, "grad_norm": 21.616507183196045, "learning_rate": 1.762048283819911e-06, "loss": 2.821, "step": 2367 }, { "epoch": 2.257418662853057, "grad_norm": 20.953932961805847, "learning_rate": 1.757823832707175e-06, "loss": 2.6202, "step": 2368 }, { "epoch": 2.2583720653080683, "grad_norm": 21.347993720211566, "learning_rate": 1.7536033713823647e-06, "loss": 2.473, "step": 2369 }, { "epoch": 2.2593254677630794, "grad_norm": 17.38994287799826, "learning_rate": 1.7493869050391371e-06, "loss": 2.5308, "step": 2370 }, { "epoch": 2.260278870218091, "grad_norm": 19.228442437266477, "learning_rate": 1.745174438866233e-06, "loss": 2.9844, "step": 2371 }, { "epoch": 2.261232272673102, "grad_norm": 23.186246900351446, "learning_rate": 1.7409659780474652e-06, "loss": 2.6196, "step": 2372 }, { "epoch": 2.2621856751281135, "grad_norm": 17.878663549343674, "learning_rate": 1.7367615277617233e-06, "loss": 2.4276, "step": 2373 }, { "epoch": 2.2631390775831246, "grad_norm": 17.771403816126693, "learning_rate": 1.7325610931829618e-06, "loss": 2.6771, "step": 2374 }, { "epoch": 2.264092480038136, "grad_norm": 20.92023395851323, "learning_rate": 1.7283646794801874e-06, "loss": 2.4441, "step": 2375 }, { "epoch": 2.2650458824931476, "grad_norm": 21.494546320676584, "learning_rate": 1.7241722918174642e-06, "loss": 2.6261, "step": 2376 }, { "epoch": 2.2659992849481587, "grad_norm": 18.998704745751933, "learning_rate": 1.7199839353539032e-06, "loss": 2.5482, "step": 2377 }, { "epoch": 2.2669526874031702, "grad_norm": 18.224458349539134, "learning_rate": 1.7157996152436473e-06, "loss": 2.4262, "step": 2378 }, { "epoch": 2.2679060898581813, "grad_norm": 21.649660043679148, "learning_rate": 1.7116193366358796e-06, "loss": 2.7337, "step": 2379 }, { "epoch": 2.268859492313193, "grad_norm": 19.226217379171118, "learning_rate": 1.7074431046748075e-06, "loss": 2.775, "step": 2380 }, { "epoch": 2.269812894768204, "grad_norm": 18.467717290140236, "learning_rate": 1.7032709244996559e-06, "loss": 2.8517, "step": 2381 }, { "epoch": 2.2707662972232154, "grad_norm": 19.1053675730068, "learning_rate": 1.6991028012446687e-06, "loss": 2.7353, "step": 2382 }, { "epoch": 2.2717196996782265, "grad_norm": 18.0823033448567, "learning_rate": 1.6949387400390916e-06, "loss": 2.6869, "step": 2383 }, { "epoch": 2.272673102133238, "grad_norm": 21.58478086065851, "learning_rate": 1.6907787460071756e-06, "loss": 2.9198, "step": 2384 }, { "epoch": 2.2736265045882496, "grad_norm": 22.11680466594718, "learning_rate": 1.6866228242681658e-06, "loss": 2.5819, "step": 2385 }, { "epoch": 2.2745799070432606, "grad_norm": 19.856717656341328, "learning_rate": 1.6824709799362982e-06, "loss": 2.4017, "step": 2386 }, { "epoch": 2.275533309498272, "grad_norm": 27.335072834858487, "learning_rate": 1.6783232181207865e-06, "loss": 2.6401, "step": 2387 }, { "epoch": 2.2764867119532832, "grad_norm": 23.097811349082587, "learning_rate": 1.6741795439258218e-06, "loss": 2.6634, "step": 2388 }, { "epoch": 2.2774401144082947, "grad_norm": 20.733717099387736, "learning_rate": 1.6700399624505665e-06, "loss": 2.3889, "step": 2389 }, { "epoch": 2.278393516863306, "grad_norm": 19.348217034621474, "learning_rate": 1.665904478789147e-06, "loss": 2.6332, "step": 2390 }, { "epoch": 2.2793469193183173, "grad_norm": 21.608020031202702, "learning_rate": 1.661773098030648e-06, "loss": 2.8822, "step": 2391 }, { "epoch": 2.2803003217733284, "grad_norm": 20.61300170734263, "learning_rate": 1.6576458252590988e-06, "loss": 2.9369, "step": 2392 }, { "epoch": 2.28125372422834, "grad_norm": 22.112497259187126, "learning_rate": 1.653522665553482e-06, "loss": 2.794, "step": 2393 }, { "epoch": 2.282207126683351, "grad_norm": 22.137985228271237, "learning_rate": 1.6494036239877115e-06, "loss": 2.6591, "step": 2394 }, { "epoch": 2.2831605291383625, "grad_norm": 22.39952268374578, "learning_rate": 1.6452887056306378e-06, "loss": 2.8072, "step": 2395 }, { "epoch": 2.2841139315933736, "grad_norm": 18.97364030131112, "learning_rate": 1.641177915546036e-06, "loss": 2.5469, "step": 2396 }, { "epoch": 2.285067334048385, "grad_norm": 25.37711354060764, "learning_rate": 1.637071258792603e-06, "loss": 2.5556, "step": 2397 }, { "epoch": 2.2860207365033967, "grad_norm": 25.188709474500566, "learning_rate": 1.6329687404239446e-06, "loss": 2.8303, "step": 2398 }, { "epoch": 2.2869741389584077, "grad_norm": 24.71828089600324, "learning_rate": 1.6288703654885796e-06, "loss": 2.6505, "step": 2399 }, { "epoch": 2.2879275414134193, "grad_norm": 23.04436016685809, "learning_rate": 1.6247761390299221e-06, "loss": 2.5145, "step": 2400 }, { "epoch": 2.2888809438684303, "grad_norm": 17.776787702188347, "learning_rate": 1.620686066086286e-06, "loss": 2.8556, "step": 2401 }, { "epoch": 2.289834346323442, "grad_norm": 25.20548809017554, "learning_rate": 1.616600151690873e-06, "loss": 2.7195, "step": 2402 }, { "epoch": 2.290787748778453, "grad_norm": 22.61192812857611, "learning_rate": 1.6125184008717636e-06, "loss": 2.7282, "step": 2403 }, { "epoch": 2.2917411512334644, "grad_norm": 16.735090047331386, "learning_rate": 1.6084408186519195e-06, "loss": 2.5349, "step": 2404 }, { "epoch": 2.2926945536884755, "grad_norm": 20.304869063321593, "learning_rate": 1.6043674100491703e-06, "loss": 2.7325, "step": 2405 }, { "epoch": 2.293647956143487, "grad_norm": 21.343205642403888, "learning_rate": 1.6002981800762106e-06, "loss": 2.6389, "step": 2406 }, { "epoch": 2.2946013585984986, "grad_norm": 18.88180650647813, "learning_rate": 1.5962331337405917e-06, "loss": 2.4535, "step": 2407 }, { "epoch": 2.2955547610535096, "grad_norm": 18.340482983244147, "learning_rate": 1.5921722760447144e-06, "loss": 2.5508, "step": 2408 }, { "epoch": 2.296508163508521, "grad_norm": 19.76780023341477, "learning_rate": 1.5881156119858293e-06, "loss": 2.5554, "step": 2409 }, { "epoch": 2.2974615659635322, "grad_norm": 21.080835913851143, "learning_rate": 1.5840631465560252e-06, "loss": 2.6055, "step": 2410 }, { "epoch": 2.2984149684185438, "grad_norm": 20.04858032772496, "learning_rate": 1.5800148847422225e-06, "loss": 2.3365, "step": 2411 }, { "epoch": 2.299368370873555, "grad_norm": 23.60074629666726, "learning_rate": 1.5759708315261724e-06, "loss": 2.6593, "step": 2412 }, { "epoch": 2.3003217733285664, "grad_norm": 18.05096603707983, "learning_rate": 1.5719309918844417e-06, "loss": 2.5517, "step": 2413 }, { "epoch": 2.3012751757835774, "grad_norm": 22.44604478239135, "learning_rate": 1.5678953707884132e-06, "loss": 2.873, "step": 2414 }, { "epoch": 2.302228578238589, "grad_norm": 20.60218222815873, "learning_rate": 1.5638639732042822e-06, "loss": 2.4833, "step": 2415 }, { "epoch": 2.3031819806936005, "grad_norm": 20.619500866050394, "learning_rate": 1.5598368040930427e-06, "loss": 2.6937, "step": 2416 }, { "epoch": 2.3041353831486115, "grad_norm": 19.52381914905056, "learning_rate": 1.5558138684104894e-06, "loss": 2.7111, "step": 2417 }, { "epoch": 2.305088785603623, "grad_norm": 19.065024872128976, "learning_rate": 1.5517951711072037e-06, "loss": 2.8395, "step": 2418 }, { "epoch": 2.306042188058634, "grad_norm": 19.973305310012204, "learning_rate": 1.5477807171285492e-06, "loss": 2.7055, "step": 2419 }, { "epoch": 2.3069955905136457, "grad_norm": 23.24517293750766, "learning_rate": 1.5437705114146735e-06, "loss": 2.8479, "step": 2420 }, { "epoch": 2.3079489929686567, "grad_norm": 21.088558174592727, "learning_rate": 1.5397645589004928e-06, "loss": 2.8196, "step": 2421 }, { "epoch": 2.3089023954236683, "grad_norm": 25.277007977541647, "learning_rate": 1.5357628645156918e-06, "loss": 2.7196, "step": 2422 }, { "epoch": 2.3098557978786793, "grad_norm": 25.24837968869965, "learning_rate": 1.5317654331847116e-06, "loss": 2.5205, "step": 2423 }, { "epoch": 2.310809200333691, "grad_norm": 22.287554103635504, "learning_rate": 1.527772269826749e-06, "loss": 2.5871, "step": 2424 }, { "epoch": 2.3117626027887024, "grad_norm": 23.058912033496426, "learning_rate": 1.5237833793557516e-06, "loss": 2.6299, "step": 2425 }, { "epoch": 2.3127160052437135, "grad_norm": 21.512122033341996, "learning_rate": 1.5197987666804032e-06, "loss": 2.6715, "step": 2426 }, { "epoch": 2.313669407698725, "grad_norm": 21.51414998040258, "learning_rate": 1.5158184367041268e-06, "loss": 2.7255, "step": 2427 }, { "epoch": 2.314622810153736, "grad_norm": 21.27431057607038, "learning_rate": 1.511842394325077e-06, "loss": 2.6439, "step": 2428 }, { "epoch": 2.3155762126087476, "grad_norm": 25.122150933206367, "learning_rate": 1.507870644436127e-06, "loss": 2.8794, "step": 2429 }, { "epoch": 2.3165296150637587, "grad_norm": 28.30459407646667, "learning_rate": 1.503903191924871e-06, "loss": 2.611, "step": 2430 }, { "epoch": 2.31748301751877, "grad_norm": 19.286391172507404, "learning_rate": 1.499940041673616e-06, "loss": 2.7753, "step": 2431 }, { "epoch": 2.3184364199737812, "grad_norm": 15.92153642435608, "learning_rate": 1.4959811985593707e-06, "loss": 2.4462, "step": 2432 }, { "epoch": 2.3193898224287928, "grad_norm": 23.48480286417481, "learning_rate": 1.492026667453849e-06, "loss": 2.9832, "step": 2433 }, { "epoch": 2.3203432248838043, "grad_norm": 20.90522802574181, "learning_rate": 1.4880764532234515e-06, "loss": 2.7819, "step": 2434 }, { "epoch": 2.3212966273388154, "grad_norm": 21.922078684746456, "learning_rate": 1.4841305607292723e-06, "loss": 2.4672, "step": 2435 }, { "epoch": 2.322250029793827, "grad_norm": 20.95948091496246, "learning_rate": 1.4801889948270852e-06, "loss": 2.6917, "step": 2436 }, { "epoch": 2.323203432248838, "grad_norm": 18.226433796675188, "learning_rate": 1.476251760367341e-06, "loss": 2.67, "step": 2437 }, { "epoch": 2.3241568347038495, "grad_norm": 22.256532955902554, "learning_rate": 1.4723188621951584e-06, "loss": 2.7058, "step": 2438 }, { "epoch": 2.3251102371588606, "grad_norm": 21.729362873329553, "learning_rate": 1.468390305150318e-06, "loss": 2.7744, "step": 2439 }, { "epoch": 2.326063639613872, "grad_norm": 22.119359174539223, "learning_rate": 1.4644660940672628e-06, "loss": 2.6693, "step": 2440 }, { "epoch": 2.327017042068883, "grad_norm": 23.16684367948479, "learning_rate": 1.460546233775086e-06, "loss": 2.3926, "step": 2441 }, { "epoch": 2.3279704445238947, "grad_norm": 18.582462606283247, "learning_rate": 1.4566307290975256e-06, "loss": 2.5137, "step": 2442 }, { "epoch": 2.328923846978906, "grad_norm": 19.236067993441075, "learning_rate": 1.4527195848529634e-06, "loss": 2.5413, "step": 2443 }, { "epoch": 2.3298772494339173, "grad_norm": 19.644838726933035, "learning_rate": 1.44881280585441e-06, "loss": 2.4603, "step": 2444 }, { "epoch": 2.330830651888929, "grad_norm": 20.848359999451883, "learning_rate": 1.4449103969095052e-06, "loss": 2.7552, "step": 2445 }, { "epoch": 2.33178405434394, "grad_norm": 26.011198928529957, "learning_rate": 1.4410123628205136e-06, "loss": 2.4806, "step": 2446 }, { "epoch": 2.3327374567989514, "grad_norm": 17.60805120568874, "learning_rate": 1.4371187083843159e-06, "loss": 2.7491, "step": 2447 }, { "epoch": 2.3336908592539625, "grad_norm": 18.93402450751124, "learning_rate": 1.4332294383924034e-06, "loss": 2.4651, "step": 2448 }, { "epoch": 2.334644261708974, "grad_norm": 20.6321100556673, "learning_rate": 1.4293445576308674e-06, "loss": 2.4823, "step": 2449 }, { "epoch": 2.335597664163985, "grad_norm": 21.053013207378697, "learning_rate": 1.4254640708804053e-06, "loss": 2.7566, "step": 2450 }, { "epoch": 2.3365510666189966, "grad_norm": 23.391175797316507, "learning_rate": 1.4215879829162994e-06, "loss": 2.7048, "step": 2451 }, { "epoch": 2.337504469074008, "grad_norm": 22.218070878466214, "learning_rate": 1.4177162985084242e-06, "loss": 2.6763, "step": 2452 }, { "epoch": 2.338457871529019, "grad_norm": 19.63681475134317, "learning_rate": 1.4138490224212365e-06, "loss": 2.7316, "step": 2453 }, { "epoch": 2.3394112739840303, "grad_norm": 23.124255017126508, "learning_rate": 1.409986159413762e-06, "loss": 2.638, "step": 2454 }, { "epoch": 2.340364676439042, "grad_norm": 22.682059693421017, "learning_rate": 1.4061277142396008e-06, "loss": 2.4381, "step": 2455 }, { "epoch": 2.3413180788940533, "grad_norm": 18.631771459459355, "learning_rate": 1.4022736916469166e-06, "loss": 2.5716, "step": 2456 }, { "epoch": 2.3422714813490644, "grad_norm": 19.434218297674363, "learning_rate": 1.3984240963784257e-06, "loss": 2.5208, "step": 2457 }, { "epoch": 2.343224883804076, "grad_norm": 23.665626344208345, "learning_rate": 1.3945789331714016e-06, "loss": 2.7374, "step": 2458 }, { "epoch": 2.344178286259087, "grad_norm": 22.377173873165024, "learning_rate": 1.3907382067576631e-06, "loss": 2.6483, "step": 2459 }, { "epoch": 2.3451316887140985, "grad_norm": 20.979529534444353, "learning_rate": 1.3869019218635644e-06, "loss": 2.6769, "step": 2460 }, { "epoch": 2.3460850911691096, "grad_norm": 20.27426508558985, "learning_rate": 1.3830700832099997e-06, "loss": 2.6271, "step": 2461 }, { "epoch": 2.347038493624121, "grad_norm": 19.107812933282933, "learning_rate": 1.3792426955123883e-06, "loss": 2.6098, "step": 2462 }, { "epoch": 2.347991896079132, "grad_norm": 18.30131791534121, "learning_rate": 1.375419763480676e-06, "loss": 2.7433, "step": 2463 }, { "epoch": 2.3489452985341437, "grad_norm": 18.558015119339906, "learning_rate": 1.3716012918193206e-06, "loss": 2.5988, "step": 2464 }, { "epoch": 2.349898700989155, "grad_norm": 20.295094560806735, "learning_rate": 1.3677872852272918e-06, "loss": 2.674, "step": 2465 }, { "epoch": 2.3508521034441663, "grad_norm": 26.86606863973616, "learning_rate": 1.363977748398067e-06, "loss": 2.5146, "step": 2466 }, { "epoch": 2.351805505899178, "grad_norm": 23.465310654530846, "learning_rate": 1.360172686019623e-06, "loss": 2.7093, "step": 2467 }, { "epoch": 2.352758908354189, "grad_norm": 18.906912970961187, "learning_rate": 1.3563721027744309e-06, "loss": 2.6328, "step": 2468 }, { "epoch": 2.3537123108092004, "grad_norm": 17.99183273336957, "learning_rate": 1.352576003339447e-06, "loss": 2.6004, "step": 2469 }, { "epoch": 2.3546657132642115, "grad_norm": 19.451764603419726, "learning_rate": 1.3487843923861098e-06, "loss": 2.6512, "step": 2470 }, { "epoch": 2.355619115719223, "grad_norm": 22.340522502573055, "learning_rate": 1.3449972745803375e-06, "loss": 2.7656, "step": 2471 }, { "epoch": 2.356572518174234, "grad_norm": 21.476662657635842, "learning_rate": 1.3412146545825166e-06, "loss": 2.6089, "step": 2472 }, { "epoch": 2.3575259206292456, "grad_norm": 27.06899117031766, "learning_rate": 1.3374365370475012e-06, "loss": 2.5037, "step": 2473 }, { "epoch": 2.358479323084257, "grad_norm": 23.98843162078651, "learning_rate": 1.3336629266246032e-06, "loss": 2.7857, "step": 2474 }, { "epoch": 2.359432725539268, "grad_norm": 25.600197883231015, "learning_rate": 1.3298938279575874e-06, "loss": 2.7045, "step": 2475 }, { "epoch": 2.3603861279942797, "grad_norm": 24.26890919650992, "learning_rate": 1.3261292456846648e-06, "loss": 2.5604, "step": 2476 }, { "epoch": 2.361339530449291, "grad_norm": 23.71508844595543, "learning_rate": 1.3223691844384923e-06, "loss": 2.4453, "step": 2477 }, { "epoch": 2.3622929329043023, "grad_norm": 20.285061535738908, "learning_rate": 1.3186136488461621e-06, "loss": 2.6411, "step": 2478 }, { "epoch": 2.3632463353593134, "grad_norm": 17.292728019819375, "learning_rate": 1.3148626435291978e-06, "loss": 2.5924, "step": 2479 }, { "epoch": 2.364199737814325, "grad_norm": 19.199388478515147, "learning_rate": 1.3111161731035448e-06, "loss": 2.6642, "step": 2480 }, { "epoch": 2.365153140269336, "grad_norm": 21.496865036024328, "learning_rate": 1.3073742421795715e-06, "loss": 2.7321, "step": 2481 }, { "epoch": 2.3661065427243475, "grad_norm": 21.497956948985752, "learning_rate": 1.3036368553620605e-06, "loss": 2.6454, "step": 2482 }, { "epoch": 2.367059945179359, "grad_norm": 25.25282801219483, "learning_rate": 1.2999040172501975e-06, "loss": 2.906, "step": 2483 }, { "epoch": 2.36801334763437, "grad_norm": 25.9400806158181, "learning_rate": 1.2961757324375768e-06, "loss": 2.4124, "step": 2484 }, { "epoch": 2.3689667500893816, "grad_norm": 18.899759126796543, "learning_rate": 1.2924520055121836e-06, "loss": 2.5275, "step": 2485 }, { "epoch": 2.3699201525443927, "grad_norm": 18.829897422434705, "learning_rate": 1.2887328410563983e-06, "loss": 2.8174, "step": 2486 }, { "epoch": 2.370873554999404, "grad_norm": 22.525006294461306, "learning_rate": 1.285018243646986e-06, "loss": 2.7184, "step": 2487 }, { "epoch": 2.3718269574544153, "grad_norm": 18.201974837427912, "learning_rate": 1.2813082178550929e-06, "loss": 2.8479, "step": 2488 }, { "epoch": 2.372780359909427, "grad_norm": 19.82216765825369, "learning_rate": 1.2776027682462343e-06, "loss": 2.7465, "step": 2489 }, { "epoch": 2.373733762364438, "grad_norm": 18.05322126286332, "learning_rate": 1.2739018993803015e-06, "loss": 2.6318, "step": 2490 }, { "epoch": 2.3746871648194494, "grad_norm": 23.13837960971515, "learning_rate": 1.2702056158115406e-06, "loss": 2.4622, "step": 2491 }, { "epoch": 2.375640567274461, "grad_norm": 20.20944095654104, "learning_rate": 1.2665139220885615e-06, "loss": 2.674, "step": 2492 }, { "epoch": 2.376593969729472, "grad_norm": 21.3321642462137, "learning_rate": 1.2628268227543234e-06, "loss": 2.4382, "step": 2493 }, { "epoch": 2.3775473721844835, "grad_norm": 19.38313503089319, "learning_rate": 1.2591443223461337e-06, "loss": 2.6359, "step": 2494 }, { "epoch": 2.3785007746394946, "grad_norm": 21.70966934348999, "learning_rate": 1.2554664253956368e-06, "loss": 2.6816, "step": 2495 }, { "epoch": 2.379454177094506, "grad_norm": 23.646752085524735, "learning_rate": 1.2517931364288133e-06, "loss": 2.5917, "step": 2496 }, { "epoch": 2.380407579549517, "grad_norm": 18.833443622292087, "learning_rate": 1.2481244599659753e-06, "loss": 2.9128, "step": 2497 }, { "epoch": 2.3813609820045287, "grad_norm": 22.211290455418705, "learning_rate": 1.244460400521757e-06, "loss": 2.774, "step": 2498 }, { "epoch": 2.38231438445954, "grad_norm": 21.85377197586896, "learning_rate": 1.2408009626051137e-06, "loss": 2.8046, "step": 2499 }, { "epoch": 2.3832677869145513, "grad_norm": 20.99625801825818, "learning_rate": 1.2371461507193077e-06, "loss": 2.6645, "step": 2500 }, { "epoch": 2.384221189369563, "grad_norm": 25.216278757241046, "learning_rate": 1.2334959693619154e-06, "loss": 2.5845, "step": 2501 }, { "epoch": 2.385174591824574, "grad_norm": 20.560686086480686, "learning_rate": 1.2298504230248082e-06, "loss": 2.8381, "step": 2502 }, { "epoch": 2.3861279942795854, "grad_norm": 24.838384871932107, "learning_rate": 1.226209516194159e-06, "loss": 2.8561, "step": 2503 }, { "epoch": 2.3870813967345965, "grad_norm": 21.881808082075146, "learning_rate": 1.2225732533504309e-06, "loss": 2.6344, "step": 2504 }, { "epoch": 2.388034799189608, "grad_norm": 18.36603672267138, "learning_rate": 1.2189416389683672e-06, "loss": 2.54, "step": 2505 }, { "epoch": 2.388988201644619, "grad_norm": 23.865410348612272, "learning_rate": 1.2153146775169972e-06, "loss": 2.8142, "step": 2506 }, { "epoch": 2.3899416040996306, "grad_norm": 22.804579466282863, "learning_rate": 1.211692373459622e-06, "loss": 2.4223, "step": 2507 }, { "epoch": 2.3908950065546417, "grad_norm": 22.26922201606944, "learning_rate": 1.2080747312538082e-06, "loss": 2.7637, "step": 2508 }, { "epoch": 2.3918484090096532, "grad_norm": 19.607694543979733, "learning_rate": 1.2044617553513899e-06, "loss": 2.7199, "step": 2509 }, { "epoch": 2.3928018114646648, "grad_norm": 19.6364741531768, "learning_rate": 1.2008534501984587e-06, "loss": 2.482, "step": 2510 }, { "epoch": 2.393755213919676, "grad_norm": 18.473333171691614, "learning_rate": 1.1972498202353545e-06, "loss": 2.6238, "step": 2511 }, { "epoch": 2.3947086163746873, "grad_norm": 23.306572605989484, "learning_rate": 1.1936508698966664e-06, "loss": 2.6138, "step": 2512 }, { "epoch": 2.3956620188296984, "grad_norm": 23.026965815540912, "learning_rate": 1.1900566036112281e-06, "loss": 2.7237, "step": 2513 }, { "epoch": 2.39661542128471, "grad_norm": 20.323192564919847, "learning_rate": 1.1864670258021022e-06, "loss": 2.5948, "step": 2514 }, { "epoch": 2.397568823739721, "grad_norm": 17.9778870798449, "learning_rate": 1.1828821408865887e-06, "loss": 2.5664, "step": 2515 }, { "epoch": 2.3985222261947325, "grad_norm": 19.31234907947918, "learning_rate": 1.1793019532762057e-06, "loss": 2.6668, "step": 2516 }, { "epoch": 2.3994756286497436, "grad_norm": 24.023339864818823, "learning_rate": 1.1757264673766972e-06, "loss": 2.7363, "step": 2517 }, { "epoch": 2.400429031104755, "grad_norm": 21.2727537220244, "learning_rate": 1.172155687588017e-06, "loss": 2.762, "step": 2518 }, { "epoch": 2.4013824335597667, "grad_norm": 20.67309471519212, "learning_rate": 1.1685896183043317e-06, "loss": 2.5786, "step": 2519 }, { "epoch": 2.4023358360147777, "grad_norm": 21.031547675014895, "learning_rate": 1.1650282639140066e-06, "loss": 2.8876, "step": 2520 }, { "epoch": 2.403289238469789, "grad_norm": 21.877192715995893, "learning_rate": 1.1614716287996063e-06, "loss": 2.4447, "step": 2521 }, { "epoch": 2.4042426409248003, "grad_norm": 20.26515337442168, "learning_rate": 1.1579197173378893e-06, "loss": 2.8313, "step": 2522 }, { "epoch": 2.405196043379812, "grad_norm": 21.30128088817918, "learning_rate": 1.1543725338998013e-06, "loss": 2.5791, "step": 2523 }, { "epoch": 2.406149445834823, "grad_norm": 21.084267010914726, "learning_rate": 1.1508300828504682e-06, "loss": 2.7659, "step": 2524 }, { "epoch": 2.4071028482898345, "grad_norm": 22.316628129699634, "learning_rate": 1.1472923685491948e-06, "loss": 2.6613, "step": 2525 }, { "epoch": 2.4080562507448455, "grad_norm": 20.055023102321638, "learning_rate": 1.1437593953494542e-06, "loss": 2.5408, "step": 2526 }, { "epoch": 2.409009653199857, "grad_norm": 21.152227651380596, "learning_rate": 1.1402311675988836e-06, "loss": 2.5945, "step": 2527 }, { "epoch": 2.409963055654868, "grad_norm": 20.588004222654337, "learning_rate": 1.1367076896392853e-06, "loss": 2.6046, "step": 2528 }, { "epoch": 2.4109164581098796, "grad_norm": 21.66122029111323, "learning_rate": 1.1331889658066141e-06, "loss": 2.4933, "step": 2529 }, { "epoch": 2.4118698605648907, "grad_norm": 21.7386325164395, "learning_rate": 1.129675000430976e-06, "loss": 2.6216, "step": 2530 }, { "epoch": 2.4128232630199022, "grad_norm": 19.475217454197743, "learning_rate": 1.1261657978366164e-06, "loss": 2.5925, "step": 2531 }, { "epoch": 2.4137766654749138, "grad_norm": 21.90540249009088, "learning_rate": 1.122661362341927e-06, "loss": 2.6487, "step": 2532 }, { "epoch": 2.414730067929925, "grad_norm": 19.20963971915618, "learning_rate": 1.119161698259426e-06, "loss": 2.6587, "step": 2533 }, { "epoch": 2.4156834703849364, "grad_norm": 21.849180625216277, "learning_rate": 1.1156668098957646e-06, "loss": 2.6934, "step": 2534 }, { "epoch": 2.4166368728399474, "grad_norm": 19.125286754515606, "learning_rate": 1.1121767015517166e-06, "loss": 2.9014, "step": 2535 }, { "epoch": 2.417590275294959, "grad_norm": 20.36013319770378, "learning_rate": 1.1086913775221709e-06, "loss": 2.4971, "step": 2536 }, { "epoch": 2.41854367774997, "grad_norm": 21.36170223514274, "learning_rate": 1.1052108420961316e-06, "loss": 2.8101, "step": 2537 }, { "epoch": 2.4194970802049816, "grad_norm": 20.575897935378002, "learning_rate": 1.10173509955671e-06, "loss": 2.7304, "step": 2538 }, { "epoch": 2.4204504826599926, "grad_norm": 19.35962568325246, "learning_rate": 1.098264154181116e-06, "loss": 2.7272, "step": 2539 }, { "epoch": 2.421403885115004, "grad_norm": 20.41827767197444, "learning_rate": 1.0947980102406597e-06, "loss": 2.5878, "step": 2540 }, { "epoch": 2.4223572875700157, "grad_norm": 21.294002589892187, "learning_rate": 1.0913366720007434e-06, "loss": 2.5088, "step": 2541 }, { "epoch": 2.4233106900250267, "grad_norm": 25.913288737397615, "learning_rate": 1.0878801437208497e-06, "loss": 2.6892, "step": 2542 }, { "epoch": 2.4242640924800383, "grad_norm": 25.030207230714147, "learning_rate": 1.0844284296545488e-06, "loss": 2.822, "step": 2543 }, { "epoch": 2.4252174949350493, "grad_norm": 19.86611252648381, "learning_rate": 1.0809815340494822e-06, "loss": 2.7816, "step": 2544 }, { "epoch": 2.426170897390061, "grad_norm": 22.020624171814728, "learning_rate": 1.0775394611473661e-06, "loss": 2.4991, "step": 2545 }, { "epoch": 2.427124299845072, "grad_norm": 19.11897429824734, "learning_rate": 1.074102215183977e-06, "loss": 2.4699, "step": 2546 }, { "epoch": 2.4280777023000835, "grad_norm": 20.986850769156675, "learning_rate": 1.0706698003891525e-06, "loss": 2.5352, "step": 2547 }, { "epoch": 2.4290311047550945, "grad_norm": 21.366802401952825, "learning_rate": 1.0672422209867879e-06, "loss": 2.7248, "step": 2548 }, { "epoch": 2.429984507210106, "grad_norm": 23.983361165104533, "learning_rate": 1.063819481194826e-06, "loss": 2.5711, "step": 2549 }, { "epoch": 2.4309379096651176, "grad_norm": 24.969876434608686, "learning_rate": 1.0604015852252559e-06, "loss": 2.5103, "step": 2550 }, { "epoch": 2.4318913121201287, "grad_norm": 19.089182410476173, "learning_rate": 1.0569885372841031e-06, "loss": 2.6906, "step": 2551 }, { "epoch": 2.43284471457514, "grad_norm": 21.67040548110633, "learning_rate": 1.053580341571428e-06, "loss": 2.7734, "step": 2552 }, { "epoch": 2.4337981170301513, "grad_norm": 18.349277171515276, "learning_rate": 1.05017700228132e-06, "loss": 2.6389, "step": 2553 }, { "epoch": 2.4347515194851628, "grad_norm": 20.713677060068033, "learning_rate": 1.0467785236018946e-06, "loss": 2.7574, "step": 2554 }, { "epoch": 2.435704921940174, "grad_norm": 19.341392855238194, "learning_rate": 1.0433849097152825e-06, "loss": 2.8591, "step": 2555 }, { "epoch": 2.4366583243951854, "grad_norm": 21.67529319103709, "learning_rate": 1.0399961647976315e-06, "loss": 2.7728, "step": 2556 }, { "epoch": 2.4376117268501964, "grad_norm": 20.251198172754783, "learning_rate": 1.0366122930190936e-06, "loss": 2.6305, "step": 2557 }, { "epoch": 2.438565129305208, "grad_norm": 20.878370765316536, "learning_rate": 1.0332332985438248e-06, "loss": 2.5997, "step": 2558 }, { "epoch": 2.4395185317602195, "grad_norm": 27.421590182856427, "learning_rate": 1.0298591855299817e-06, "loss": 2.6778, "step": 2559 }, { "epoch": 2.4404719342152306, "grad_norm": 21.31057157858821, "learning_rate": 1.0264899581297121e-06, "loss": 2.5709, "step": 2560 }, { "epoch": 2.441425336670242, "grad_norm": 21.21422491602074, "learning_rate": 1.0231256204891532e-06, "loss": 2.6582, "step": 2561 }, { "epoch": 2.442378739125253, "grad_norm": 26.810608632070927, "learning_rate": 1.0197661767484206e-06, "loss": 2.7275, "step": 2562 }, { "epoch": 2.4433321415802647, "grad_norm": 21.881962809606186, "learning_rate": 1.0164116310416127e-06, "loss": 2.698, "step": 2563 }, { "epoch": 2.4442855440352758, "grad_norm": 24.625440347116424, "learning_rate": 1.0130619874967983e-06, "loss": 2.671, "step": 2564 }, { "epoch": 2.4452389464902873, "grad_norm": 20.862087033014614, "learning_rate": 1.009717250236012e-06, "loss": 2.631, "step": 2565 }, { "epoch": 2.4461923489452984, "grad_norm": 21.08764449385942, "learning_rate": 1.0063774233752544e-06, "loss": 2.6685, "step": 2566 }, { "epoch": 2.44714575140031, "grad_norm": 17.092633615996785, "learning_rate": 1.0030425110244785e-06, "loss": 2.3725, "step": 2567 }, { "epoch": 2.4480991538553214, "grad_norm": 19.762342812986006, "learning_rate": 9.997125172875943e-07, "loss": 2.6464, "step": 2568 }, { "epoch": 2.4490525563103325, "grad_norm": 19.756269256260794, "learning_rate": 9.963874462624569e-07, "loss": 2.4947, "step": 2569 }, { "epoch": 2.450005958765344, "grad_norm": 24.64166247859974, "learning_rate": 9.93067302040865e-07, "loss": 2.7675, "step": 2570 }, { "epoch": 2.450959361220355, "grad_norm": 27.155405392989405, "learning_rate": 9.897520887085503e-07, "loss": 2.8572, "step": 2571 }, { "epoch": 2.4519127636753666, "grad_norm": 20.276307706463356, "learning_rate": 9.86441810345183e-07, "loss": 2.4556, "step": 2572 }, { "epoch": 2.4528661661303777, "grad_norm": 22.767430001588885, "learning_rate": 9.831364710243528e-07, "loss": 2.3773, "step": 2573 }, { "epoch": 2.453819568585389, "grad_norm": 20.59720923519802, "learning_rate": 9.798360748135782e-07, "loss": 2.6564, "step": 2574 }, { "epoch": 2.4547729710404003, "grad_norm": 20.160271993462324, "learning_rate": 9.765406257742916e-07, "loss": 2.4959, "step": 2575 }, { "epoch": 2.455726373495412, "grad_norm": 20.674519900878703, "learning_rate": 9.732501279618388e-07, "loss": 2.7111, "step": 2576 }, { "epoch": 2.4566797759504233, "grad_norm": 20.09230738492169, "learning_rate": 9.699645854254718e-07, "loss": 2.7718, "step": 2577 }, { "epoch": 2.4576331784054344, "grad_norm": 24.17557782554948, "learning_rate": 9.666840022083423e-07, "loss": 2.7543, "step": 2578 }, { "epoch": 2.458586580860446, "grad_norm": 21.82865308177498, "learning_rate": 9.634083823475032e-07, "loss": 2.6816, "step": 2579 }, { "epoch": 2.459539983315457, "grad_norm": 20.96306701365859, "learning_rate": 9.60137729873898e-07, "loss": 2.6179, "step": 2580 }, { "epoch": 2.4604933857704685, "grad_norm": 19.666038114686287, "learning_rate": 9.568720488123579e-07, "loss": 2.7043, "step": 2581 }, { "epoch": 2.4614467882254796, "grad_norm": 19.707556839915693, "learning_rate": 9.53611343181594e-07, "loss": 2.6871, "step": 2582 }, { "epoch": 2.462400190680491, "grad_norm": 22.72542839098434, "learning_rate": 9.503556169941985e-07, "loss": 2.7403, "step": 2583 }, { "epoch": 2.463353593135502, "grad_norm": 19.385878231683456, "learning_rate": 9.471048742566313e-07, "loss": 2.5252, "step": 2584 }, { "epoch": 2.4643069955905137, "grad_norm": 19.779934879961406, "learning_rate": 9.438591189692237e-07, "loss": 2.3692, "step": 2585 }, { "epoch": 2.465260398045525, "grad_norm": 17.47218985577599, "learning_rate": 9.406183551261682e-07, "loss": 2.891, "step": 2586 }, { "epoch": 2.4662138005005363, "grad_norm": 22.176222096152944, "learning_rate": 9.373825867155157e-07, "loss": 2.8814, "step": 2587 }, { "epoch": 2.4671672029555474, "grad_norm": 24.648652505528325, "learning_rate": 9.34151817719166e-07, "loss": 2.8329, "step": 2588 }, { "epoch": 2.468120605410559, "grad_norm": 21.654465683753656, "learning_rate": 9.309260521128727e-07, "loss": 2.5243, "step": 2589 }, { "epoch": 2.4690740078655704, "grad_norm": 28.014022565948743, "learning_rate": 9.277052938662262e-07, "loss": 2.9602, "step": 2590 }, { "epoch": 2.4700274103205815, "grad_norm": 22.805038574081706, "learning_rate": 9.24489546942659e-07, "loss": 2.6549, "step": 2591 }, { "epoch": 2.470980812775593, "grad_norm": 18.665206407744446, "learning_rate": 9.212788152994367e-07, "loss": 2.5097, "step": 2592 }, { "epoch": 2.471934215230604, "grad_norm": 17.710136688647573, "learning_rate": 9.180731028876494e-07, "loss": 2.4175, "step": 2593 }, { "epoch": 2.4728876176856156, "grad_norm": 17.001233193750892, "learning_rate": 9.148724136522147e-07, "loss": 2.6541, "step": 2594 }, { "epoch": 2.4738410201406267, "grad_norm": 20.394894840603918, "learning_rate": 9.116767515318681e-07, "loss": 2.5988, "step": 2595 }, { "epoch": 2.474794422595638, "grad_norm": 25.42613072311109, "learning_rate": 9.08486120459155e-07, "loss": 2.5889, "step": 2596 }, { "epoch": 2.4757478250506493, "grad_norm": 19.028649235067075, "learning_rate": 9.053005243604357e-07, "loss": 2.7663, "step": 2597 }, { "epoch": 2.476701227505661, "grad_norm": 20.16995999715842, "learning_rate": 9.021199671558684e-07, "loss": 2.7152, "step": 2598 }, { "epoch": 2.4776546299606723, "grad_norm": 20.372390880518104, "learning_rate": 8.989444527594144e-07, "loss": 2.9081, "step": 2599 }, { "epoch": 2.4786080324156834, "grad_norm": 24.18652866699553, "learning_rate": 8.957739850788288e-07, "loss": 2.7901, "step": 2600 }, { "epoch": 2.479561434870695, "grad_norm": 23.20337313370936, "learning_rate": 8.926085680156554e-07, "loss": 2.4551, "step": 2601 }, { "epoch": 2.480514837325706, "grad_norm": 21.560808632358412, "learning_rate": 8.894482054652248e-07, "loss": 2.6134, "step": 2602 }, { "epoch": 2.4814682397807175, "grad_norm": 25.95677117935624, "learning_rate": 8.862929013166449e-07, "loss": 2.3623, "step": 2603 }, { "epoch": 2.4824216422357286, "grad_norm": 20.717359536767766, "learning_rate": 8.831426594527976e-07, "loss": 2.5724, "step": 2604 }, { "epoch": 2.48337504469074, "grad_norm": 17.19253360535925, "learning_rate": 8.7999748375034e-07, "loss": 2.7019, "step": 2605 }, { "epoch": 2.484328447145751, "grad_norm": 18.636627650736692, "learning_rate": 8.768573780796913e-07, "loss": 2.7381, "step": 2606 }, { "epoch": 2.4852818496007627, "grad_norm": 20.546201067852124, "learning_rate": 8.737223463050337e-07, "loss": 2.744, "step": 2607 }, { "epoch": 2.4862352520557742, "grad_norm": 16.8732351014512, "learning_rate": 8.705923922843041e-07, "loss": 2.6441, "step": 2608 }, { "epoch": 2.4871886545107853, "grad_norm": 18.408490348947545, "learning_rate": 8.67467519869189e-07, "loss": 2.5298, "step": 2609 }, { "epoch": 2.488142056965797, "grad_norm": 21.616199574413358, "learning_rate": 8.643477329051248e-07, "loss": 2.6976, "step": 2610 }, { "epoch": 2.489095459420808, "grad_norm": 19.702159613959303, "learning_rate": 8.612330352312892e-07, "loss": 2.7569, "step": 2611 }, { "epoch": 2.4900488618758194, "grad_norm": 21.881592347195184, "learning_rate": 8.581234306805969e-07, "loss": 2.7842, "step": 2612 }, { "epoch": 2.4910022643308305, "grad_norm": 20.480336710557452, "learning_rate": 8.550189230796924e-07, "loss": 2.5546, "step": 2613 }, { "epoch": 2.491955666785842, "grad_norm": 20.536742964757376, "learning_rate": 8.51919516248953e-07, "loss": 2.6105, "step": 2614 }, { "epoch": 2.492909069240853, "grad_norm": 23.928284891083166, "learning_rate": 8.488252140024734e-07, "loss": 2.5345, "step": 2615 }, { "epoch": 2.4938624716958646, "grad_norm": 19.228502577036362, "learning_rate": 8.457360201480702e-07, "loss": 2.4102, "step": 2616 }, { "epoch": 2.494815874150876, "grad_norm": 19.67149828350983, "learning_rate": 8.426519384872733e-07, "loss": 2.8943, "step": 2617 }, { "epoch": 2.495769276605887, "grad_norm": 23.2596812346357, "learning_rate": 8.395729728153229e-07, "loss": 2.7661, "step": 2618 }, { "epoch": 2.4967226790608987, "grad_norm": 19.69370779695915, "learning_rate": 8.364991269211587e-07, "loss": 2.6458, "step": 2619 }, { "epoch": 2.49767608151591, "grad_norm": 22.470835315179862, "learning_rate": 8.334304045874248e-07, "loss": 2.5192, "step": 2620 }, { "epoch": 2.4986294839709213, "grad_norm": 19.74587398248634, "learning_rate": 8.303668095904594e-07, "loss": 2.4, "step": 2621 }, { "epoch": 2.4995828864259324, "grad_norm": 23.359959816905437, "learning_rate": 8.273083457002884e-07, "loss": 2.4815, "step": 2622 }, { "epoch": 2.500536288880944, "grad_norm": 18.756095336965213, "learning_rate": 8.242550166806274e-07, "loss": 2.7062, "step": 2623 }, { "epoch": 2.501489691335955, "grad_norm": 19.065832537188214, "learning_rate": 8.212068262888684e-07, "loss": 2.528, "step": 2624 }, { "epoch": 2.5024430937909665, "grad_norm": 17.014351463518373, "learning_rate": 8.181637782760843e-07, "loss": 2.6918, "step": 2625 }, { "epoch": 2.503396496245978, "grad_norm": 18.003840305007294, "learning_rate": 8.151258763870179e-07, "loss": 2.8238, "step": 2626 }, { "epoch": 2.504349898700989, "grad_norm": 19.94295298042881, "learning_rate": 8.120931243600799e-07, "loss": 2.749, "step": 2627 }, { "epoch": 2.5053033011560006, "grad_norm": 24.107016512947187, "learning_rate": 8.090655259273428e-07, "loss": 2.6119, "step": 2628 }, { "epoch": 2.5062567036110117, "grad_norm": 22.123032260468126, "learning_rate": 8.060430848145357e-07, "loss": 2.6374, "step": 2629 }, { "epoch": 2.5072101060660232, "grad_norm": 22.804031664869804, "learning_rate": 8.030258047410438e-07, "loss": 2.6895, "step": 2630 }, { "epoch": 2.5081635085210343, "grad_norm": 21.66666451495734, "learning_rate": 8.000136894199007e-07, "loss": 2.6451, "step": 2631 }, { "epoch": 2.509116910976046, "grad_norm": 21.468655522814725, "learning_rate": 7.970067425577849e-07, "loss": 2.7952, "step": 2632 }, { "epoch": 2.510070313431057, "grad_norm": 21.442052726930395, "learning_rate": 7.940049678550127e-07, "loss": 2.8378, "step": 2633 }, { "epoch": 2.5110237158860684, "grad_norm": 22.26276744740255, "learning_rate": 7.910083690055348e-07, "loss": 2.5885, "step": 2634 }, { "epoch": 2.51197711834108, "grad_norm": 21.91609808292946, "learning_rate": 7.880169496969358e-07, "loss": 2.5791, "step": 2635 }, { "epoch": 2.512930520796091, "grad_norm": 20.165056043641837, "learning_rate": 7.850307136104246e-07, "loss": 2.6368, "step": 2636 }, { "epoch": 2.513883923251102, "grad_norm": 19.18924287862411, "learning_rate": 7.820496644208325e-07, "loss": 2.5234, "step": 2637 }, { "epoch": 2.5148373257061136, "grad_norm": 20.476481335969186, "learning_rate": 7.79073805796608e-07, "loss": 2.845, "step": 2638 }, { "epoch": 2.515790728161125, "grad_norm": 21.78310023964299, "learning_rate": 7.761031413998093e-07, "loss": 2.5286, "step": 2639 }, { "epoch": 2.516744130616136, "grad_norm": 22.52142420948594, "learning_rate": 7.731376748861069e-07, "loss": 2.6405, "step": 2640 }, { "epoch": 2.5176975330711477, "grad_norm": 19.51391631001971, "learning_rate": 7.70177409904771e-07, "loss": 2.6625, "step": 2641 }, { "epoch": 2.518650935526159, "grad_norm": 22.213574370732093, "learning_rate": 7.672223500986731e-07, "loss": 2.8133, "step": 2642 }, { "epoch": 2.5196043379811703, "grad_norm": 25.955219888778117, "learning_rate": 7.642724991042805e-07, "loss": 2.8016, "step": 2643 }, { "epoch": 2.520557740436182, "grad_norm": 23.22678884614049, "learning_rate": 7.613278605516455e-07, "loss": 2.8482, "step": 2644 }, { "epoch": 2.521511142891193, "grad_norm": 19.63184605058014, "learning_rate": 7.583884380644119e-07, "loss": 2.7049, "step": 2645 }, { "epoch": 2.522464545346204, "grad_norm": 22.823179189295065, "learning_rate": 7.554542352598021e-07, "loss": 2.8208, "step": 2646 }, { "epoch": 2.5234179478012155, "grad_norm": 18.72037712460119, "learning_rate": 7.525252557486135e-07, "loss": 2.556, "step": 2647 }, { "epoch": 2.524371350256227, "grad_norm": 20.23061604622819, "learning_rate": 7.4960150313522e-07, "loss": 2.5532, "step": 2648 }, { "epoch": 2.525324752711238, "grad_norm": 22.996568591381095, "learning_rate": 7.46682981017558e-07, "loss": 2.6028, "step": 2649 }, { "epoch": 2.5262781551662497, "grad_norm": 20.125569177322596, "learning_rate": 7.437696929871313e-07, "loss": 2.5862, "step": 2650 }, { "epoch": 2.5272315576212607, "grad_norm": 25.77019119300742, "learning_rate": 7.408616426290022e-07, "loss": 2.5486, "step": 2651 }, { "epoch": 2.5281849600762722, "grad_norm": 21.305950502406684, "learning_rate": 7.379588335217875e-07, "loss": 2.455, "step": 2652 }, { "epoch": 2.5291383625312838, "grad_norm": 18.932592708103726, "learning_rate": 7.350612692376519e-07, "loss": 2.8503, "step": 2653 }, { "epoch": 2.530091764986295, "grad_norm": 24.274216574156693, "learning_rate": 7.321689533423093e-07, "loss": 2.8206, "step": 2654 }, { "epoch": 2.531045167441306, "grad_norm": 23.42593531312003, "learning_rate": 7.292818893950104e-07, "loss": 2.6814, "step": 2655 }, { "epoch": 2.5319985698963174, "grad_norm": 25.287572435834694, "learning_rate": 7.264000809485483e-07, "loss": 2.6105, "step": 2656 }, { "epoch": 2.532951972351329, "grad_norm": 20.597624985980776, "learning_rate": 7.235235315492456e-07, "loss": 2.513, "step": 2657 }, { "epoch": 2.53390537480634, "grad_norm": 25.819489240608288, "learning_rate": 7.206522447369546e-07, "loss": 2.8666, "step": 2658 }, { "epoch": 2.5348587772613516, "grad_norm": 24.67474052296654, "learning_rate": 7.177862240450495e-07, "loss": 2.6411, "step": 2659 }, { "epoch": 2.5358121797163626, "grad_norm": 20.788822625704345, "learning_rate": 7.149254730004246e-07, "loss": 2.6611, "step": 2660 }, { "epoch": 2.536765582171374, "grad_norm": 20.376001745139266, "learning_rate": 7.120699951234911e-07, "loss": 2.5474, "step": 2661 }, { "epoch": 2.5377189846263857, "grad_norm": 21.23950636587564, "learning_rate": 7.092197939281697e-07, "loss": 2.7074, "step": 2662 }, { "epoch": 2.5386723870813968, "grad_norm": 18.671652542660247, "learning_rate": 7.0637487292189e-07, "loss": 2.6609, "step": 2663 }, { "epoch": 2.539625789536408, "grad_norm": 19.085282195986878, "learning_rate": 7.035352356055786e-07, "loss": 2.6225, "step": 2664 }, { "epoch": 2.5405791919914193, "grad_norm": 21.507008663093703, "learning_rate": 7.007008854736657e-07, "loss": 2.7118, "step": 2665 }, { "epoch": 2.541532594446431, "grad_norm": 18.645100225265136, "learning_rate": 6.978718260140715e-07, "loss": 2.5647, "step": 2666 }, { "epoch": 2.542485996901442, "grad_norm": 16.51391442711072, "learning_rate": 6.950480607082072e-07, "loss": 2.6582, "step": 2667 }, { "epoch": 2.5434393993564535, "grad_norm": 21.08748892144379, "learning_rate": 6.922295930309691e-07, "loss": 2.7173, "step": 2668 }, { "epoch": 2.5443928018114645, "grad_norm": 20.99183209907391, "learning_rate": 6.894164264507347e-07, "loss": 2.7894, "step": 2669 }, { "epoch": 2.545346204266476, "grad_norm": 25.528457298306996, "learning_rate": 6.866085644293552e-07, "loss": 2.635, "step": 2670 }, { "epoch": 2.5462996067214876, "grad_norm": 22.560609549670684, "learning_rate": 6.838060104221584e-07, "loss": 2.5186, "step": 2671 }, { "epoch": 2.5472530091764987, "grad_norm": 18.347314646037358, "learning_rate": 6.810087678779353e-07, "loss": 2.8264, "step": 2672 }, { "epoch": 2.5482064116315097, "grad_norm": 18.23110528437611, "learning_rate": 6.782168402389444e-07, "loss": 2.708, "step": 2673 }, { "epoch": 2.5491598140865213, "grad_norm": 22.235350092874008, "learning_rate": 6.754302309409034e-07, "loss": 2.681, "step": 2674 }, { "epoch": 2.550113216541533, "grad_norm": 20.378602728681233, "learning_rate": 6.726489434129829e-07, "loss": 2.8465, "step": 2675 }, { "epoch": 2.551066618996544, "grad_norm": 24.771587755240166, "learning_rate": 6.698729810778065e-07, "loss": 2.8626, "step": 2676 }, { "epoch": 2.5520200214515554, "grad_norm": 24.772548438057015, "learning_rate": 6.671023473514449e-07, "loss": 2.5942, "step": 2677 }, { "epoch": 2.5529734239065665, "grad_norm": 21.25499133394311, "learning_rate": 6.643370456434117e-07, "loss": 2.6856, "step": 2678 }, { "epoch": 2.553926826361578, "grad_norm": 24.813503805238856, "learning_rate": 6.615770793566567e-07, "loss": 2.715, "step": 2679 }, { "epoch": 2.554880228816589, "grad_norm": 27.803087064739273, "learning_rate": 6.588224518875647e-07, "loss": 2.5337, "step": 2680 }, { "epoch": 2.5558336312716006, "grad_norm": 24.803090492900544, "learning_rate": 6.560731666259523e-07, "loss": 2.571, "step": 2681 }, { "epoch": 2.5567870337266116, "grad_norm": 18.970279939279415, "learning_rate": 6.533292269550612e-07, "loss": 2.7096, "step": 2682 }, { "epoch": 2.557740436181623, "grad_norm": 25.537266113038662, "learning_rate": 6.505906362515546e-07, "loss": 2.5897, "step": 2683 }, { "epoch": 2.5586938386366347, "grad_norm": 19.108906810148774, "learning_rate": 6.478573978855146e-07, "loss": 2.4852, "step": 2684 }, { "epoch": 2.5596472410916458, "grad_norm": 17.875531386761246, "learning_rate": 6.45129515220434e-07, "loss": 2.481, "step": 2685 }, { "epoch": 2.5606006435466573, "grad_norm": 22.067753174744972, "learning_rate": 6.424069916132164e-07, "loss": 2.4478, "step": 2686 }, { "epoch": 2.5615540460016684, "grad_norm": 22.441642575198372, "learning_rate": 6.396898304141713e-07, "loss": 2.6474, "step": 2687 }, { "epoch": 2.56250744845668, "grad_norm": 24.243792510507195, "learning_rate": 6.369780349670085e-07, "loss": 2.567, "step": 2688 }, { "epoch": 2.563460850911691, "grad_norm": 18.25344885758936, "learning_rate": 6.342716086088363e-07, "loss": 2.7228, "step": 2689 }, { "epoch": 2.5644142533667025, "grad_norm": 18.954454584766527, "learning_rate": 6.31570554670154e-07, "loss": 2.4822, "step": 2690 }, { "epoch": 2.5653676558217136, "grad_norm": 19.758638252476352, "learning_rate": 6.288748764748481e-07, "loss": 2.5898, "step": 2691 }, { "epoch": 2.566321058276725, "grad_norm": 22.69204177655792, "learning_rate": 6.261845773401936e-07, "loss": 2.6423, "step": 2692 }, { "epoch": 2.5672744607317366, "grad_norm": 26.984508937366428, "learning_rate": 6.234996605768446e-07, "loss": 2.7902, "step": 2693 }, { "epoch": 2.5682278631867477, "grad_norm": 19.63508934691394, "learning_rate": 6.208201294888316e-07, "loss": 2.5057, "step": 2694 }, { "epoch": 2.569181265641759, "grad_norm": 20.44613011276063, "learning_rate": 6.181459873735563e-07, "loss": 2.4527, "step": 2695 }, { "epoch": 2.5701346680967703, "grad_norm": 19.693741966553368, "learning_rate": 6.154772375217905e-07, "loss": 2.6387, "step": 2696 }, { "epoch": 2.571088070551782, "grad_norm": 21.966673310715674, "learning_rate": 6.128138832176706e-07, "loss": 2.8088, "step": 2697 }, { "epoch": 2.572041473006793, "grad_norm": 23.325054843250555, "learning_rate": 6.101559277386903e-07, "loss": 2.6162, "step": 2698 }, { "epoch": 2.5729948754618044, "grad_norm": 18.69249686203901, "learning_rate": 6.07503374355703e-07, "loss": 2.4883, "step": 2699 }, { "epoch": 2.5739482779168155, "grad_norm": 20.295692965578148, "learning_rate": 6.048562263329139e-07, "loss": 2.7753, "step": 2700 }, { "epoch": 2.574901680371827, "grad_norm": 19.19158166029469, "learning_rate": 6.022144869278734e-07, "loss": 2.5823, "step": 2701 }, { "epoch": 2.5758550828268385, "grad_norm": 18.755025785246904, "learning_rate": 5.995781593914796e-07, "loss": 2.6313, "step": 2702 }, { "epoch": 2.5768084852818496, "grad_norm": 22.58423053397766, "learning_rate": 5.969472469679704e-07, "loss": 2.7135, "step": 2703 }, { "epoch": 2.5777618877368607, "grad_norm": 20.773892224169366, "learning_rate": 5.943217528949169e-07, "loss": 2.6011, "step": 2704 }, { "epoch": 2.578715290191872, "grad_norm": 22.55739917184861, "learning_rate": 5.917016804032266e-07, "loss": 2.7486, "step": 2705 }, { "epoch": 2.5796686926468837, "grad_norm": 21.429767984794527, "learning_rate": 5.890870327171311e-07, "loss": 2.7302, "step": 2706 }, { "epoch": 2.5806220951018948, "grad_norm": 18.809890479064954, "learning_rate": 5.864778130541893e-07, "loss": 2.6706, "step": 2707 }, { "epoch": 2.5815754975569063, "grad_norm": 22.77066038761372, "learning_rate": 5.838740246252794e-07, "loss": 2.6586, "step": 2708 }, { "epoch": 2.5825289000119174, "grad_norm": 20.596081146916912, "learning_rate": 5.812756706345973e-07, "loss": 2.5434, "step": 2709 }, { "epoch": 2.583482302466929, "grad_norm": 21.710302146398963, "learning_rate": 5.786827542796492e-07, "loss": 2.5783, "step": 2710 }, { "epoch": 2.5844357049219404, "grad_norm": 21.966307313291143, "learning_rate": 5.760952787512492e-07, "loss": 2.6454, "step": 2711 }, { "epoch": 2.5853891073769515, "grad_norm": 17.839306341935366, "learning_rate": 5.735132472335192e-07, "loss": 2.5258, "step": 2712 }, { "epoch": 2.5863425098319626, "grad_norm": 21.012456218634135, "learning_rate": 5.709366629038799e-07, "loss": 2.6884, "step": 2713 }, { "epoch": 2.587295912286974, "grad_norm": 21.96700683581432, "learning_rate": 5.683655289330481e-07, "loss": 2.4331, "step": 2714 }, { "epoch": 2.5882493147419856, "grad_norm": 21.173442422405607, "learning_rate": 5.657998484850369e-07, "loss": 2.8481, "step": 2715 }, { "epoch": 2.5892027171969967, "grad_norm": 24.889801823244863, "learning_rate": 5.632396247171429e-07, "loss": 2.3234, "step": 2716 }, { "epoch": 2.590156119652008, "grad_norm": 19.98335356427011, "learning_rate": 5.606848607799509e-07, "loss": 2.7817, "step": 2717 }, { "epoch": 2.5911095221070193, "grad_norm": 22.269297591202193, "learning_rate": 5.581355598173266e-07, "loss": 2.7288, "step": 2718 }, { "epoch": 2.592062924562031, "grad_norm": 24.45856542270923, "learning_rate": 5.555917249664133e-07, "loss": 2.7394, "step": 2719 }, { "epoch": 2.5930163270170423, "grad_norm": 21.296995403871573, "learning_rate": 5.530533593576292e-07, "loss": 2.7727, "step": 2720 }, { "epoch": 2.5939697294720534, "grad_norm": 21.293397986900988, "learning_rate": 5.505204661146573e-07, "loss": 2.8211, "step": 2721 }, { "epoch": 2.5949231319270645, "grad_norm": 20.507893701818105, "learning_rate": 5.479930483544521e-07, "loss": 2.8104, "step": 2722 }, { "epoch": 2.595876534382076, "grad_norm": 22.062579278959102, "learning_rate": 5.454711091872245e-07, "loss": 2.6587, "step": 2723 }, { "epoch": 2.5968299368370875, "grad_norm": 21.583632979042132, "learning_rate": 5.429546517164486e-07, "loss": 2.7213, "step": 2724 }, { "epoch": 2.5977833392920986, "grad_norm": 23.38827528774213, "learning_rate": 5.404436790388501e-07, "loss": 2.7227, "step": 2725 }, { "epoch": 2.59873674174711, "grad_norm": 22.668915836077083, "learning_rate": 5.37938194244405e-07, "loss": 2.5873, "step": 2726 }, { "epoch": 2.599690144202121, "grad_norm": 21.589042746789872, "learning_rate": 5.354382004163367e-07, "loss": 2.3107, "step": 2727 }, { "epoch": 2.6006435466571327, "grad_norm": 21.66502824694637, "learning_rate": 5.329437006311122e-07, "loss": 2.663, "step": 2728 }, { "epoch": 2.6015969491121442, "grad_norm": 21.942795989832685, "learning_rate": 5.304546979584352e-07, "loss": 2.5138, "step": 2729 }, { "epoch": 2.6025503515671553, "grad_norm": 21.972137907594607, "learning_rate": 5.279711954612471e-07, "loss": 2.6638, "step": 2730 }, { "epoch": 2.6035037540221664, "grad_norm": 24.23157106039767, "learning_rate": 5.25493196195721e-07, "loss": 2.8022, "step": 2731 }, { "epoch": 2.604457156477178, "grad_norm": 22.270166177409212, "learning_rate": 5.230207032112549e-07, "loss": 2.6163, "step": 2732 }, { "epoch": 2.6054105589321894, "grad_norm": 19.20359288488419, "learning_rate": 5.205537195504735e-07, "loss": 2.6224, "step": 2733 }, { "epoch": 2.6063639613872005, "grad_norm": 18.175376559394337, "learning_rate": 5.180922482492218e-07, "loss": 2.3713, "step": 2734 }, { "epoch": 2.607317363842212, "grad_norm": 24.109461412753483, "learning_rate": 5.156362923365587e-07, "loss": 2.5831, "step": 2735 }, { "epoch": 2.608270766297223, "grad_norm": 21.61748381098763, "learning_rate": 5.131858548347596e-07, "loss": 2.6112, "step": 2736 }, { "epoch": 2.6092241687522346, "grad_norm": 20.811268905914798, "learning_rate": 5.107409387593054e-07, "loss": 2.6976, "step": 2737 }, { "epoch": 2.6101775712072457, "grad_norm": 24.523302472072515, "learning_rate": 5.083015471188852e-07, "loss": 2.6735, "step": 2738 }, { "epoch": 2.611130973662257, "grad_norm": 21.838885531035043, "learning_rate": 5.058676829153886e-07, "loss": 2.7235, "step": 2739 }, { "epoch": 2.6120843761172683, "grad_norm": 19.225019848356084, "learning_rate": 5.034393491439044e-07, "loss": 2.7111, "step": 2740 }, { "epoch": 2.61303777857228, "grad_norm": 24.515801928853787, "learning_rate": 5.010165487927132e-07, "loss": 2.8457, "step": 2741 }, { "epoch": 2.6139911810272913, "grad_norm": 23.164455933403925, "learning_rate": 4.985992848432869e-07, "loss": 2.4738, "step": 2742 }, { "epoch": 2.6149445834823024, "grad_norm": 18.77968891279513, "learning_rate": 4.961875602702865e-07, "loss": 2.6281, "step": 2743 }, { "epoch": 2.615897985937314, "grad_norm": 24.711224792120298, "learning_rate": 4.93781378041554e-07, "loss": 2.8029, "step": 2744 }, { "epoch": 2.616851388392325, "grad_norm": 28.0987930636082, "learning_rate": 4.913807411181143e-07, "loss": 2.7577, "step": 2745 }, { "epoch": 2.6178047908473365, "grad_norm": 22.711353317819643, "learning_rate": 4.889856524541625e-07, "loss": 2.7632, "step": 2746 }, { "epoch": 2.6187581933023476, "grad_norm": 23.939327814649328, "learning_rate": 4.865961149970727e-07, "loss": 2.4156, "step": 2747 }, { "epoch": 2.619711595757359, "grad_norm": 22.894231499621295, "learning_rate": 4.842121316873821e-07, "loss": 2.8148, "step": 2748 }, { "epoch": 2.62066499821237, "grad_norm": 22.196169806442015, "learning_rate": 4.818337054587968e-07, "loss": 2.7376, "step": 2749 }, { "epoch": 2.6216184006673817, "grad_norm": 21.277998516003045, "learning_rate": 4.794608392381828e-07, "loss": 2.4367, "step": 2750 }, { "epoch": 2.6225718031223932, "grad_norm": 19.345848135073147, "learning_rate": 4.770935359455653e-07, "loss": 2.5933, "step": 2751 }, { "epoch": 2.6235252055774043, "grad_norm": 19.870659604319137, "learning_rate": 4.747317984941213e-07, "loss": 2.775, "step": 2752 }, { "epoch": 2.624478608032416, "grad_norm": 21.48343234718971, "learning_rate": 4.723756297901816e-07, "loss": 2.6409, "step": 2753 }, { "epoch": 2.625432010487427, "grad_norm": 20.220595091901863, "learning_rate": 4.7002503273322064e-07, "loss": 2.8215, "step": 2754 }, { "epoch": 2.6263854129424384, "grad_norm": 22.78087935243385, "learning_rate": 4.676800102158602e-07, "loss": 2.5351, "step": 2755 }, { "epoch": 2.6273388153974495, "grad_norm": 22.20566860984834, "learning_rate": 4.653405651238607e-07, "loss": 2.5236, "step": 2756 }, { "epoch": 2.628292217852461, "grad_norm": 24.372829989753978, "learning_rate": 4.630067003361166e-07, "loss": 2.7017, "step": 2757 }, { "epoch": 2.629245620307472, "grad_norm": 23.650132024913162, "learning_rate": 4.606784187246588e-07, "loss": 2.6296, "step": 2758 }, { "epoch": 2.6301990227624836, "grad_norm": 23.484071791605597, "learning_rate": 4.5835572315464596e-07, "loss": 2.8469, "step": 2759 }, { "epoch": 2.631152425217495, "grad_norm": 22.356621338697554, "learning_rate": 4.560386164843639e-07, "loss": 2.6415, "step": 2760 }, { "epoch": 2.6321058276725062, "grad_norm": 21.368443159742494, "learning_rate": 4.5372710156521836e-07, "loss": 2.6469, "step": 2761 }, { "epoch": 2.6330592301275173, "grad_norm": 20.979987888048026, "learning_rate": 4.5142118124173515e-07, "loss": 2.5274, "step": 2762 }, { "epoch": 2.634012632582529, "grad_norm": 21.23227338601886, "learning_rate": 4.491208583515561e-07, "loss": 2.5309, "step": 2763 }, { "epoch": 2.6349660350375403, "grad_norm": 20.23205739986225, "learning_rate": 4.468261357254339e-07, "loss": 2.7132, "step": 2764 }, { "epoch": 2.6359194374925514, "grad_norm": 21.88467317896486, "learning_rate": 4.4453701618723087e-07, "loss": 2.5951, "step": 2765 }, { "epoch": 2.636872839947563, "grad_norm": 23.00880770796486, "learning_rate": 4.422535025539143e-07, "loss": 2.5304, "step": 2766 }, { "epoch": 2.637826242402574, "grad_norm": 18.27907587385846, "learning_rate": 4.3997559763555134e-07, "loss": 2.7652, "step": 2767 }, { "epoch": 2.6387796448575855, "grad_norm": 17.502036871843053, "learning_rate": 4.3770330423530626e-07, "loss": 2.5795, "step": 2768 }, { "epoch": 2.639733047312597, "grad_norm": 19.87550274545107, "learning_rate": 4.354366251494402e-07, "loss": 2.731, "step": 2769 }, { "epoch": 2.640686449767608, "grad_norm": 21.13108867325016, "learning_rate": 4.331755631673057e-07, "loss": 2.7527, "step": 2770 }, { "epoch": 2.641639852222619, "grad_norm": 21.134317529287358, "learning_rate": 4.3092012107134205e-07, "loss": 2.676, "step": 2771 }, { "epoch": 2.6425932546776307, "grad_norm": 19.007655733128367, "learning_rate": 4.286703016370719e-07, "loss": 2.8171, "step": 2772 }, { "epoch": 2.6435466571326423, "grad_norm": 19.155501310426942, "learning_rate": 4.264261076330983e-07, "loss": 2.6049, "step": 2773 }, { "epoch": 2.6445000595876533, "grad_norm": 18.60535388207361, "learning_rate": 4.241875418211039e-07, "loss": 2.4757, "step": 2774 }, { "epoch": 2.645453462042665, "grad_norm": 17.22749300546411, "learning_rate": 4.219546069558439e-07, "loss": 2.7318, "step": 2775 }, { "epoch": 2.646406864497676, "grad_norm": 17.414810476781, "learning_rate": 4.197273057851464e-07, "loss": 2.5497, "step": 2776 }, { "epoch": 2.6473602669526874, "grad_norm": 19.211104963127198, "learning_rate": 4.175056410499018e-07, "loss": 2.7112, "step": 2777 }, { "epoch": 2.648313669407699, "grad_norm": 20.241299427034217, "learning_rate": 4.152896154840691e-07, "loss": 2.6484, "step": 2778 }, { "epoch": 2.64926707186271, "grad_norm": 22.056804148644357, "learning_rate": 4.1307923181466645e-07, "loss": 2.5679, "step": 2779 }, { "epoch": 2.650220474317721, "grad_norm": 22.730570274281778, "learning_rate": 4.108744927617669e-07, "loss": 2.4325, "step": 2780 }, { "epoch": 2.6511738767727326, "grad_norm": 22.302123713676316, "learning_rate": 4.086754010385008e-07, "loss": 2.5459, "step": 2781 }, { "epoch": 2.652127279227744, "grad_norm": 26.99677358540133, "learning_rate": 4.064819593510477e-07, "loss": 2.613, "step": 2782 }, { "epoch": 2.6530806816827552, "grad_norm": 22.120029326333515, "learning_rate": 4.042941703986325e-07, "loss": 2.428, "step": 2783 }, { "epoch": 2.6540340841377668, "grad_norm": 21.812896906885314, "learning_rate": 4.021120368735254e-07, "loss": 2.6544, "step": 2784 }, { "epoch": 2.654987486592778, "grad_norm": 21.770216981276477, "learning_rate": 3.9993556146103893e-07, "loss": 2.4672, "step": 2785 }, { "epoch": 2.6559408890477894, "grad_norm": 21.213318980188703, "learning_rate": 3.9776474683951796e-07, "loss": 2.4131, "step": 2786 }, { "epoch": 2.656894291502801, "grad_norm": 19.818771326081762, "learning_rate": 3.955995956803466e-07, "loss": 2.6903, "step": 2787 }, { "epoch": 2.657847693957812, "grad_norm": 20.640735298652157, "learning_rate": 3.934401106479352e-07, "loss": 2.7048, "step": 2788 }, { "epoch": 2.658801096412823, "grad_norm": 22.541997457109357, "learning_rate": 3.9128629439972476e-07, "loss": 2.5859, "step": 2789 }, { "epoch": 2.6597544988678345, "grad_norm": 26.173235495250573, "learning_rate": 3.891381495861779e-07, "loss": 2.8748, "step": 2790 }, { "epoch": 2.660707901322846, "grad_norm": 19.410881309795766, "learning_rate": 3.869956788507806e-07, "loss": 2.6584, "step": 2791 }, { "epoch": 2.661661303777857, "grad_norm": 18.38847325614397, "learning_rate": 3.8485888483003384e-07, "loss": 2.5924, "step": 2792 }, { "epoch": 2.6626147062328687, "grad_norm": 17.622176719835792, "learning_rate": 3.827277701534532e-07, "loss": 2.561, "step": 2793 }, { "epoch": 2.6635681086878797, "grad_norm": 20.232947977223716, "learning_rate": 3.8060233744356634e-07, "loss": 2.4684, "step": 2794 }, { "epoch": 2.6645215111428913, "grad_norm": 23.723387619270724, "learning_rate": 3.784825893159089e-07, "loss": 2.6408, "step": 2795 }, { "epoch": 2.665474913597903, "grad_norm": 21.784431539620705, "learning_rate": 3.763685283790208e-07, "loss": 2.3838, "step": 2796 }, { "epoch": 2.666428316052914, "grad_norm": 20.262024723369542, "learning_rate": 3.742601572344434e-07, "loss": 2.7156, "step": 2797 }, { "epoch": 2.667381718507925, "grad_norm": 21.698582180919505, "learning_rate": 3.7215747847671626e-07, "loss": 2.4895, "step": 2798 }, { "epoch": 2.6683351209629365, "grad_norm": 22.153207117264888, "learning_rate": 3.700604946933717e-07, "loss": 2.97, "step": 2799 }, { "epoch": 2.669288523417948, "grad_norm": 18.683746992243833, "learning_rate": 3.679692084649372e-07, "loss": 2.7382, "step": 2800 }, { "epoch": 2.670241925872959, "grad_norm": 18.248539411965364, "learning_rate": 3.6588362236492816e-07, "loss": 2.5163, "step": 2801 }, { "epoch": 2.6711953283279706, "grad_norm": 20.889942722872124, "learning_rate": 3.638037389598453e-07, "loss": 2.5254, "step": 2802 }, { "epoch": 2.6721487307829817, "grad_norm": 20.38644648645321, "learning_rate": 3.6172956080916977e-07, "loss": 3.0152, "step": 2803 }, { "epoch": 2.673102133237993, "grad_norm": 18.610576838061988, "learning_rate": 3.596610904653652e-07, "loss": 2.3322, "step": 2804 }, { "epoch": 2.6740555356930042, "grad_norm": 20.217495970912747, "learning_rate": 3.575983304738673e-07, "loss": 2.6849, "step": 2805 }, { "epoch": 2.6750089381480158, "grad_norm": 21.36413351738478, "learning_rate": 3.555412833730881e-07, "loss": 2.7607, "step": 2806 }, { "epoch": 2.675962340603027, "grad_norm": 21.93607012872241, "learning_rate": 3.5348995169440905e-07, "loss": 2.6901, "step": 2807 }, { "epoch": 2.6769157430580384, "grad_norm": 22.67298438654768, "learning_rate": 3.5144433796217515e-07, "loss": 2.6797, "step": 2808 }, { "epoch": 2.67786914551305, "grad_norm": 21.647677851556523, "learning_rate": 3.494044446936984e-07, "loss": 2.7085, "step": 2809 }, { "epoch": 2.678822547968061, "grad_norm": 22.650460034000297, "learning_rate": 3.4737027439925073e-07, "loss": 2.6374, "step": 2810 }, { "epoch": 2.6797759504230725, "grad_norm": 19.74674379194145, "learning_rate": 3.453418295820593e-07, "loss": 2.6912, "step": 2811 }, { "epoch": 2.6807293528780836, "grad_norm": 21.59470456664645, "learning_rate": 3.433191127383079e-07, "loss": 2.7723, "step": 2812 }, { "epoch": 2.681682755333095, "grad_norm": 21.994588122054278, "learning_rate": 3.413021263571309e-07, "loss": 2.4794, "step": 2813 }, { "epoch": 2.682636157788106, "grad_norm": 21.599014048021253, "learning_rate": 3.3929087292060904e-07, "loss": 2.6426, "step": 2814 }, { "epoch": 2.6835895602431177, "grad_norm": 21.381054888481238, "learning_rate": 3.3728535490377135e-07, "loss": 2.5972, "step": 2815 }, { "epoch": 2.6845429626981288, "grad_norm": 18.70705110767074, "learning_rate": 3.352855747745859e-07, "loss": 2.8213, "step": 2816 }, { "epoch": 2.6854963651531403, "grad_norm": 24.015097266869454, "learning_rate": 3.3329153499396304e-07, "loss": 2.844, "step": 2817 }, { "epoch": 2.686449767608152, "grad_norm": 23.478539964205158, "learning_rate": 3.313032380157455e-07, "loss": 2.588, "step": 2818 }, { "epoch": 2.687403170063163, "grad_norm": 24.251670781978227, "learning_rate": 3.293206862867104e-07, "loss": 2.7447, "step": 2819 }, { "epoch": 2.6883565725181744, "grad_norm": 16.63902914220044, "learning_rate": 3.2734388224656575e-07, "loss": 2.6479, "step": 2820 }, { "epoch": 2.6893099749731855, "grad_norm": 22.781478924863773, "learning_rate": 3.253728283279456e-07, "loss": 2.5093, "step": 2821 }, { "epoch": 2.690263377428197, "grad_norm": 23.10345183027775, "learning_rate": 3.2340752695640966e-07, "loss": 2.6719, "step": 2822 }, { "epoch": 2.691216779883208, "grad_norm": 23.254179461149242, "learning_rate": 3.2144798055043556e-07, "loss": 2.586, "step": 2823 }, { "epoch": 2.6921701823382196, "grad_norm": 18.26244626074522, "learning_rate": 3.1949419152142e-07, "loss": 2.7685, "step": 2824 }, { "epoch": 2.6931235847932307, "grad_norm": 20.337131849336938, "learning_rate": 3.1754616227367585e-07, "loss": 2.852, "step": 2825 }, { "epoch": 2.694076987248242, "grad_norm": 22.053596329988906, "learning_rate": 3.1560389520442827e-07, "loss": 2.7869, "step": 2826 }, { "epoch": 2.6950303897032537, "grad_norm": 21.722565708502454, "learning_rate": 3.136673927038097e-07, "loss": 2.6142, "step": 2827 }, { "epoch": 2.695983792158265, "grad_norm": 20.195135345482157, "learning_rate": 3.1173665715486076e-07, "loss": 2.7963, "step": 2828 }, { "epoch": 2.696937194613276, "grad_norm": 22.517009162743328, "learning_rate": 3.0981169093352415e-07, "loss": 2.792, "step": 2829 }, { "epoch": 2.6978905970682874, "grad_norm": 24.142156702688983, "learning_rate": 3.078924964086416e-07, "loss": 2.6821, "step": 2830 }, { "epoch": 2.698843999523299, "grad_norm": 21.570075904113228, "learning_rate": 3.059790759419551e-07, "loss": 2.717, "step": 2831 }, { "epoch": 2.69979740197831, "grad_norm": 21.617257571840895, "learning_rate": 3.0407143188809885e-07, "loss": 2.5082, "step": 2832 }, { "epoch": 2.7007508044333215, "grad_norm": 18.88335260701895, "learning_rate": 3.0216956659460175e-07, "loss": 2.6782, "step": 2833 }, { "epoch": 2.7017042068883326, "grad_norm": 19.052263133361823, "learning_rate": 3.002734824018766e-07, "loss": 2.6356, "step": 2834 }, { "epoch": 2.702657609343344, "grad_norm": 19.76458958599874, "learning_rate": 2.9838318164322533e-07, "loss": 2.7607, "step": 2835 }, { "epoch": 2.7036110117983556, "grad_norm": 20.892120654913622, "learning_rate": 2.9649866664483387e-07, "loss": 2.5762, "step": 2836 }, { "epoch": 2.7045644142533667, "grad_norm": 19.839787653338764, "learning_rate": 2.946199397257643e-07, "loss": 2.7821, "step": 2837 }, { "epoch": 2.7055178167083778, "grad_norm": 19.70591231823761, "learning_rate": 2.9274700319796066e-07, "loss": 3.0052, "step": 2838 }, { "epoch": 2.7064712191633893, "grad_norm": 23.689513039626373, "learning_rate": 2.9087985936623596e-07, "loss": 2.6082, "step": 2839 }, { "epoch": 2.707424621618401, "grad_norm": 19.71640538368514, "learning_rate": 2.8901851052828e-07, "loss": 2.6801, "step": 2840 }, { "epoch": 2.708378024073412, "grad_norm": 21.381452869306546, "learning_rate": 2.871629589746483e-07, "loss": 2.6472, "step": 2841 }, { "epoch": 2.7093314265284234, "grad_norm": 20.396998283438943, "learning_rate": 2.853132069887643e-07, "loss": 2.703, "step": 2842 }, { "epoch": 2.7102848289834345, "grad_norm": 18.022707671085694, "learning_rate": 2.8346925684691106e-07, "loss": 2.5901, "step": 2843 }, { "epoch": 2.711238231438446, "grad_norm": 22.8247159238661, "learning_rate": 2.816311108182368e-07, "loss": 2.554, "step": 2844 }, { "epoch": 2.7121916338934575, "grad_norm": 21.16194301704844, "learning_rate": 2.797987711647426e-07, "loss": 2.4889, "step": 2845 }, { "epoch": 2.7131450363484686, "grad_norm": 18.937578299397437, "learning_rate": 2.779722401412871e-07, "loss": 2.5656, "step": 2846 }, { "epoch": 2.7140984388034797, "grad_norm": 22.51950208332591, "learning_rate": 2.7615151999558056e-07, "loss": 2.6768, "step": 2847 }, { "epoch": 2.715051841258491, "grad_norm": 20.164865170094515, "learning_rate": 2.743366129681824e-07, "loss": 2.7703, "step": 2848 }, { "epoch": 2.7160052437135027, "grad_norm": 27.57692097489754, "learning_rate": 2.725275212924977e-07, "loss": 2.6432, "step": 2849 }, { "epoch": 2.716958646168514, "grad_norm": 20.89015185043222, "learning_rate": 2.707242471947746e-07, "loss": 2.6597, "step": 2850 }, { "epoch": 2.7179120486235253, "grad_norm": 17.528856670255724, "learning_rate": 2.689267928941047e-07, "loss": 2.7281, "step": 2851 }, { "epoch": 2.7188654510785364, "grad_norm": 25.191411186878472, "learning_rate": 2.671351606024153e-07, "loss": 2.6384, "step": 2852 }, { "epoch": 2.719818853533548, "grad_norm": 18.52268010763568, "learning_rate": 2.653493525244721e-07, "loss": 2.6111, "step": 2853 }, { "epoch": 2.7207722559885594, "grad_norm": 21.479062804056856, "learning_rate": 2.635693708578696e-07, "loss": 2.7658, "step": 2854 }, { "epoch": 2.7217256584435705, "grad_norm": 23.30678235301262, "learning_rate": 2.6179521779303607e-07, "loss": 2.7606, "step": 2855 }, { "epoch": 2.7226790608985816, "grad_norm": 25.076365762244727, "learning_rate": 2.6002689551322403e-07, "loss": 2.8675, "step": 2856 }, { "epoch": 2.723632463353593, "grad_norm": 24.90874395908941, "learning_rate": 2.582644061945139e-07, "loss": 2.5029, "step": 2857 }, { "epoch": 2.7245858658086046, "grad_norm": 23.7407535340214, "learning_rate": 2.5650775200580626e-07, "loss": 2.6424, "step": 2858 }, { "epoch": 2.7255392682636157, "grad_norm": 23.578886093583794, "learning_rate": 2.5475693510882027e-07, "loss": 2.5784, "step": 2859 }, { "epoch": 2.726492670718627, "grad_norm": 20.55341940431505, "learning_rate": 2.530119576580936e-07, "loss": 2.68, "step": 2860 }, { "epoch": 2.7274460731736383, "grad_norm": 19.827652119208512, "learning_rate": 2.5127282180097745e-07, "loss": 2.5691, "step": 2861 }, { "epoch": 2.72839947562865, "grad_norm": 19.23185949348741, "learning_rate": 2.4953952967763317e-07, "loss": 2.7081, "step": 2862 }, { "epoch": 2.7293528780836613, "grad_norm": 19.506751092667407, "learning_rate": 2.4781208342103237e-07, "loss": 2.8235, "step": 2863 }, { "epoch": 2.7303062805386724, "grad_norm": 21.127615718606137, "learning_rate": 2.460904851569534e-07, "loss": 2.7613, "step": 2864 }, { "epoch": 2.7312596829936835, "grad_norm": 20.483739634253496, "learning_rate": 2.4437473700397453e-07, "loss": 2.5406, "step": 2865 }, { "epoch": 2.732213085448695, "grad_norm": 18.985571116979585, "learning_rate": 2.4266484107347943e-07, "loss": 2.7875, "step": 2866 }, { "epoch": 2.7331664879037065, "grad_norm": 24.891693624971964, "learning_rate": 2.409607994696478e-07, "loss": 2.5491, "step": 2867 }, { "epoch": 2.7341198903587176, "grad_norm": 23.88079172157017, "learning_rate": 2.3926261428945386e-07, "loss": 2.8009, "step": 2868 }, { "epoch": 2.735073292813729, "grad_norm": 21.38505763295656, "learning_rate": 2.3757028762266875e-07, "loss": 2.629, "step": 2869 }, { "epoch": 2.73602669526874, "grad_norm": 20.811227583546014, "learning_rate": 2.3588382155184997e-07, "loss": 2.5122, "step": 2870 }, { "epoch": 2.7369800977237517, "grad_norm": 22.20253922922268, "learning_rate": 2.3420321815234514e-07, "loss": 2.8238, "step": 2871 }, { "epoch": 2.737933500178763, "grad_norm": 24.1801279670913, "learning_rate": 2.325284794922883e-07, "loss": 2.7924, "step": 2872 }, { "epoch": 2.7388869026337743, "grad_norm": 19.97712242193172, "learning_rate": 2.3085960763259475e-07, "loss": 2.7632, "step": 2873 }, { "epoch": 2.7398403050887854, "grad_norm": 19.080514788497283, "learning_rate": 2.2919660462696058e-07, "loss": 2.5617, "step": 2874 }, { "epoch": 2.740793707543797, "grad_norm": 22.225885687797994, "learning_rate": 2.2753947252185938e-07, "loss": 2.5572, "step": 2875 }, { "epoch": 2.7417471099988084, "grad_norm": 20.623419563208714, "learning_rate": 2.2588821335654044e-07, "loss": 2.7009, "step": 2876 }, { "epoch": 2.7427005124538195, "grad_norm": 23.132704517105655, "learning_rate": 2.2424282916302665e-07, "loss": 2.5571, "step": 2877 }, { "epoch": 2.743653914908831, "grad_norm": 21.62663755831446, "learning_rate": 2.2260332196611e-07, "loss": 2.6124, "step": 2878 }, { "epoch": 2.744607317363842, "grad_norm": 22.253125550123965, "learning_rate": 2.2096969378335221e-07, "loss": 2.6537, "step": 2879 }, { "epoch": 2.7455607198188536, "grad_norm": 19.857368620198322, "learning_rate": 2.1934194662507736e-07, "loss": 2.6174, "step": 2880 }, { "epoch": 2.7465141222738647, "grad_norm": 20.263220032684192, "learning_rate": 2.1772008249437427e-07, "loss": 2.5249, "step": 2881 }, { "epoch": 2.7474675247288762, "grad_norm": 19.678833676720142, "learning_rate": 2.1610410338709143e-07, "loss": 2.7284, "step": 2882 }, { "epoch": 2.7484209271838873, "grad_norm": 22.802076790464987, "learning_rate": 2.14494011291837e-07, "loss": 2.4433, "step": 2883 }, { "epoch": 2.749374329638899, "grad_norm": 24.120280185170508, "learning_rate": 2.1288980818997272e-07, "loss": 2.8066, "step": 2884 }, { "epoch": 2.7503277320939103, "grad_norm": 19.19394073371829, "learning_rate": 2.1129149605561394e-07, "loss": 2.556, "step": 2885 }, { "epoch": 2.7512811345489214, "grad_norm": 20.04692409641624, "learning_rate": 2.0969907685562786e-07, "loss": 2.5557, "step": 2886 }, { "epoch": 2.752234537003933, "grad_norm": 21.532219268622818, "learning_rate": 2.0811255254962692e-07, "loss": 2.8631, "step": 2887 }, { "epoch": 2.753187939458944, "grad_norm": 22.02777191811598, "learning_rate": 2.0653192508997222e-07, "loss": 2.8418, "step": 2888 }, { "epoch": 2.7541413419139555, "grad_norm": 21.26665416549597, "learning_rate": 2.0495719642176838e-07, "loss": 2.4156, "step": 2889 }, { "epoch": 2.7550947443689666, "grad_norm": 22.030315832196187, "learning_rate": 2.0338836848285805e-07, "loss": 2.6434, "step": 2890 }, { "epoch": 2.756048146823978, "grad_norm": 20.685724344236657, "learning_rate": 2.0182544320382523e-07, "loss": 2.4986, "step": 2891 }, { "epoch": 2.757001549278989, "grad_norm": 21.21266417912492, "learning_rate": 2.0026842250799038e-07, "loss": 2.5957, "step": 2892 }, { "epoch": 2.7579549517340007, "grad_norm": 22.56120259927208, "learning_rate": 1.9871730831140523e-07, "loss": 2.5861, "step": 2893 }, { "epoch": 2.7589083541890123, "grad_norm": 19.486707670162968, "learning_rate": 1.9717210252285513e-07, "loss": 2.7687, "step": 2894 }, { "epoch": 2.7598617566440233, "grad_norm": 20.333976830646694, "learning_rate": 1.9563280704385458e-07, "loss": 2.7327, "step": 2895 }, { "epoch": 2.7608151590990344, "grad_norm": 25.92323294324921, "learning_rate": 1.9409942376864333e-07, "loss": 2.567, "step": 2896 }, { "epoch": 2.761768561554046, "grad_norm": 21.35974923978891, "learning_rate": 1.9257195458418754e-07, "loss": 2.7498, "step": 2897 }, { "epoch": 2.7627219640090575, "grad_norm": 21.192410256460455, "learning_rate": 1.910504013701747e-07, "loss": 2.384, "step": 2898 }, { "epoch": 2.7636753664640685, "grad_norm": 24.305928905749393, "learning_rate": 1.8953476599901321e-07, "loss": 2.7233, "step": 2899 }, { "epoch": 2.76462876891908, "grad_norm": 23.028087724436862, "learning_rate": 1.8802505033582608e-07, "loss": 2.7745, "step": 2900 }, { "epoch": 2.765582171374091, "grad_norm": 21.62305338985803, "learning_rate": 1.865212562384544e-07, "loss": 2.7463, "step": 2901 }, { "epoch": 2.7665355738291026, "grad_norm": 18.773183933174636, "learning_rate": 1.8502338555745125e-07, "loss": 2.7498, "step": 2902 }, { "epoch": 2.767488976284114, "grad_norm": 27.084009568627298, "learning_rate": 1.8353144013608104e-07, "loss": 2.6989, "step": 2903 }, { "epoch": 2.7684423787391252, "grad_norm": 21.862174276435503, "learning_rate": 1.8204542181031572e-07, "loss": 2.6428, "step": 2904 }, { "epoch": 2.7693957811941363, "grad_norm": 19.209702620586107, "learning_rate": 1.8056533240883468e-07, "loss": 2.501, "step": 2905 }, { "epoch": 2.770349183649148, "grad_norm": 18.719840300768684, "learning_rate": 1.790911737530182e-07, "loss": 2.5417, "step": 2906 }, { "epoch": 2.7713025861041594, "grad_norm": 20.14882991885091, "learning_rate": 1.7762294765695242e-07, "loss": 2.6014, "step": 2907 }, { "epoch": 2.7722559885591704, "grad_norm": 20.708716086045868, "learning_rate": 1.7616065592742038e-07, "loss": 2.8829, "step": 2908 }, { "epoch": 2.773209391014182, "grad_norm": 19.64228283564355, "learning_rate": 1.7470430036390262e-07, "loss": 2.6909, "step": 2909 }, { "epoch": 2.774162793469193, "grad_norm": 20.345638858896155, "learning_rate": 1.7325388275857612e-07, "loss": 2.5724, "step": 2910 }, { "epoch": 2.7751161959242046, "grad_norm": 22.33525154751729, "learning_rate": 1.718094048963087e-07, "loss": 2.7092, "step": 2911 }, { "epoch": 2.776069598379216, "grad_norm": 20.094522020274923, "learning_rate": 1.7037086855465902e-07, "loss": 2.7327, "step": 2912 }, { "epoch": 2.777023000834227, "grad_norm": 20.39841651173616, "learning_rate": 1.6893827550387543e-07, "loss": 2.7451, "step": 2913 }, { "epoch": 2.7779764032892382, "grad_norm": 19.875296044103173, "learning_rate": 1.6751162750689164e-07, "loss": 2.6714, "step": 2914 }, { "epoch": 2.7789298057442497, "grad_norm": 20.917645654699175, "learning_rate": 1.6609092631932665e-07, "loss": 2.608, "step": 2915 }, { "epoch": 2.7798832081992613, "grad_norm": 21.67657032271436, "learning_rate": 1.6467617368947918e-07, "loss": 2.633, "step": 2916 }, { "epoch": 2.7808366106542723, "grad_norm": 21.785511041215752, "learning_rate": 1.6326737135832993e-07, "loss": 2.5522, "step": 2917 }, { "epoch": 2.781790013109284, "grad_norm": 20.62130714839259, "learning_rate": 1.61864521059536e-07, "loss": 2.8249, "step": 2918 }, { "epoch": 2.782743415564295, "grad_norm": 25.533722775176347, "learning_rate": 1.604676245194292e-07, "loss": 2.8825, "step": 2919 }, { "epoch": 2.7836968180193065, "grad_norm": 25.722721663870026, "learning_rate": 1.5907668345701732e-07, "loss": 2.7189, "step": 2920 }, { "epoch": 2.784650220474318, "grad_norm": 18.932114114569245, "learning_rate": 1.5769169958397612e-07, "loss": 2.6117, "step": 2921 }, { "epoch": 2.785603622929329, "grad_norm": 23.73817916145622, "learning_rate": 1.5631267460465393e-07, "loss": 2.6404, "step": 2922 }, { "epoch": 2.78655702538434, "grad_norm": 20.469414282141347, "learning_rate": 1.5493961021606275e-07, "loss": 2.6957, "step": 2923 }, { "epoch": 2.7875104278393517, "grad_norm": 18.025958973133033, "learning_rate": 1.5357250810788316e-07, "loss": 2.4228, "step": 2924 }, { "epoch": 2.788463830294363, "grad_norm": 18.891415031427876, "learning_rate": 1.52211369962455e-07, "loss": 2.7315, "step": 2925 }, { "epoch": 2.7894172327493743, "grad_norm": 22.673568634820963, "learning_rate": 1.5085619745478119e-07, "loss": 2.538, "step": 2926 }, { "epoch": 2.7903706352043858, "grad_norm": 22.31130518817819, "learning_rate": 1.495069922525222e-07, "loss": 2.4504, "step": 2927 }, { "epoch": 2.791324037659397, "grad_norm": 21.783989483847826, "learning_rate": 1.4816375601599653e-07, "loss": 2.7122, "step": 2928 }, { "epoch": 2.7922774401144084, "grad_norm": 21.305072259045957, "learning_rate": 1.4682649039817642e-07, "loss": 3.005, "step": 2929 }, { "epoch": 2.79323084256942, "grad_norm": 17.24229100686165, "learning_rate": 1.4549519704468718e-07, "loss": 2.6202, "step": 2930 }, { "epoch": 2.794184245024431, "grad_norm": 18.2365515291243, "learning_rate": 1.4416987759380385e-07, "loss": 2.8816, "step": 2931 }, { "epoch": 2.795137647479442, "grad_norm": 21.44918847955937, "learning_rate": 1.4285053367645074e-07, "loss": 2.5799, "step": 2932 }, { "epoch": 2.7960910499344536, "grad_norm": 18.581290525703384, "learning_rate": 1.41537166916198e-07, "loss": 2.7134, "step": 2933 }, { "epoch": 2.797044452389465, "grad_norm": 22.748842297811866, "learning_rate": 1.4022977892926226e-07, "loss": 2.5415, "step": 2934 }, { "epoch": 2.797997854844476, "grad_norm": 24.166668620294654, "learning_rate": 1.3892837132450098e-07, "loss": 2.7747, "step": 2935 }, { "epoch": 2.7989512572994877, "grad_norm": 20.697126428935036, "learning_rate": 1.37632945703412e-07, "loss": 2.5431, "step": 2936 }, { "epoch": 2.7999046597544988, "grad_norm": 17.525370616802384, "learning_rate": 1.363435036601335e-07, "loss": 2.6893, "step": 2937 }, { "epoch": 2.8008580622095103, "grad_norm": 21.07584342233188, "learning_rate": 1.3506004678143835e-07, "loss": 2.586, "step": 2938 }, { "epoch": 2.8018114646645214, "grad_norm": 27.65052201988896, "learning_rate": 1.33782576646736e-07, "loss": 2.6879, "step": 2939 }, { "epoch": 2.802764867119533, "grad_norm": 19.55185485045472, "learning_rate": 1.3251109482806667e-07, "loss": 2.8886, "step": 2940 }, { "epoch": 2.803718269574544, "grad_norm": 20.664929636790397, "learning_rate": 1.3124560289010436e-07, "loss": 2.673, "step": 2941 }, { "epoch": 2.8046716720295555, "grad_norm": 21.74912577154918, "learning_rate": 1.2998610239014775e-07, "loss": 2.7244, "step": 2942 }, { "epoch": 2.805625074484567, "grad_norm": 21.041548896927843, "learning_rate": 1.2873259487812705e-07, "loss": 2.5557, "step": 2943 }, { "epoch": 2.806578476939578, "grad_norm": 21.221162920200467, "learning_rate": 1.2748508189659447e-07, "loss": 2.6922, "step": 2944 }, { "epoch": 2.8075318793945896, "grad_norm": 21.900381665651516, "learning_rate": 1.2624356498072588e-07, "loss": 2.6683, "step": 2945 }, { "epoch": 2.8084852818496007, "grad_norm": 23.094431259933085, "learning_rate": 1.2500804565832026e-07, "loss": 2.4747, "step": 2946 }, { "epoch": 2.809438684304612, "grad_norm": 22.89728174358239, "learning_rate": 1.237785254497931e-07, "loss": 2.4685, "step": 2947 }, { "epoch": 2.8103920867596233, "grad_norm": 18.87060794461175, "learning_rate": 1.2255500586818015e-07, "loss": 2.4595, "step": 2948 }, { "epoch": 2.811345489214635, "grad_norm": 19.543128523701746, "learning_rate": 1.2133748841913207e-07, "loss": 2.8341, "step": 2949 }, { "epoch": 2.812298891669646, "grad_norm": 24.23666359442925, "learning_rate": 1.2012597460091202e-07, "loss": 2.7267, "step": 2950 }, { "epoch": 2.8132522941246574, "grad_norm": 22.71609048795378, "learning_rate": 1.1892046590439743e-07, "loss": 2.6842, "step": 2951 }, { "epoch": 2.814205696579669, "grad_norm": 22.05463523387467, "learning_rate": 1.177209638130733e-07, "loss": 2.6891, "step": 2952 }, { "epoch": 2.81515909903468, "grad_norm": 24.17393735710147, "learning_rate": 1.1652746980303442e-07, "loss": 2.5556, "step": 2953 }, { "epoch": 2.8161125014896915, "grad_norm": 21.477341691774093, "learning_rate": 1.1533998534298263e-07, "loss": 2.6484, "step": 2954 }, { "epoch": 2.8170659039447026, "grad_norm": 19.725734956999567, "learning_rate": 1.1415851189422344e-07, "loss": 2.5494, "step": 2955 }, { "epoch": 2.818019306399714, "grad_norm": 20.794484011365572, "learning_rate": 1.1298305091066664e-07, "loss": 2.6497, "step": 2956 }, { "epoch": 2.818972708854725, "grad_norm": 21.361732064668285, "learning_rate": 1.1181360383882123e-07, "loss": 2.5128, "step": 2957 }, { "epoch": 2.8199261113097367, "grad_norm": 18.528046544983496, "learning_rate": 1.1065017211779661e-07, "loss": 2.7177, "step": 2958 }, { "epoch": 2.8208795137647478, "grad_norm": 17.690953054113237, "learning_rate": 1.094927571792992e-07, "loss": 2.5254, "step": 2959 }, { "epoch": 2.8218329162197593, "grad_norm": 18.4796822239951, "learning_rate": 1.0834136044763188e-07, "loss": 2.4718, "step": 2960 }, { "epoch": 2.822786318674771, "grad_norm": 20.840708652825285, "learning_rate": 1.0719598333969239e-07, "loss": 2.4476, "step": 2961 }, { "epoch": 2.823739721129782, "grad_norm": 18.77246915983147, "learning_rate": 1.060566272649688e-07, "loss": 2.644, "step": 2962 }, { "epoch": 2.824693123584793, "grad_norm": 21.806346022863455, "learning_rate": 1.0492329362554066e-07, "loss": 2.6229, "step": 2963 }, { "epoch": 2.8256465260398045, "grad_norm": 20.38165063469708, "learning_rate": 1.0379598381607681e-07, "loss": 2.6895, "step": 2964 }, { "epoch": 2.826599928494816, "grad_norm": 17.85927801748129, "learning_rate": 1.02674699223832e-07, "loss": 2.7857, "step": 2965 }, { "epoch": 2.827553330949827, "grad_norm": 22.30184166051497, "learning_rate": 1.0155944122864913e-07, "loss": 2.7165, "step": 2966 }, { "epoch": 2.8285067334048386, "grad_norm": 18.260601613167225, "learning_rate": 1.004502112029515e-07, "loss": 2.8305, "step": 2967 }, { "epoch": 2.8294601358598497, "grad_norm": 19.939309824467603, "learning_rate": 9.93470105117461e-08, "loss": 2.7284, "step": 2968 }, { "epoch": 2.830413538314861, "grad_norm": 23.597870215024546, "learning_rate": 9.824984051262031e-08, "loss": 2.8334, "step": 2969 }, { "epoch": 2.8313669407698727, "grad_norm": 20.898990027090903, "learning_rate": 9.715870255573967e-08, "loss": 2.5049, "step": 2970 }, { "epoch": 2.832320343224884, "grad_norm": 24.123136476665326, "learning_rate": 9.607359798384785e-08, "loss": 2.6044, "step": 2971 }, { "epoch": 2.833273745679895, "grad_norm": 25.233276207143756, "learning_rate": 9.499452813226284e-08, "loss": 2.5745, "step": 2972 }, { "epoch": 2.8342271481349064, "grad_norm": 23.41723981480023, "learning_rate": 9.392149432887576e-08, "loss": 2.5424, "step": 2973 }, { "epoch": 2.835180550589918, "grad_norm": 21.961185859626653, "learning_rate": 9.285449789415147e-08, "loss": 2.707, "step": 2974 }, { "epoch": 2.836133953044929, "grad_norm": 20.865053470206924, "learning_rate": 9.179354014112574e-08, "loss": 2.5741, "step": 2975 }, { "epoch": 2.8370873554999405, "grad_norm": 22.636084385045706, "learning_rate": 9.073862237539977e-08, "loss": 2.6769, "step": 2976 }, { "epoch": 2.8380407579549516, "grad_norm": 21.275753162187858, "learning_rate": 8.968974589514567e-08, "loss": 2.8034, "step": 2977 }, { "epoch": 2.838994160409963, "grad_norm": 25.356633260140974, "learning_rate": 8.864691199109931e-08, "loss": 2.7845, "step": 2978 }, { "epoch": 2.8399475628649746, "grad_norm": 21.28569425420609, "learning_rate": 8.761012194656193e-08, "loss": 2.6663, "step": 2979 }, { "epoch": 2.8409009653199857, "grad_norm": 19.984433854191025, "learning_rate": 8.657937703739516e-08, "loss": 2.8433, "step": 2980 }, { "epoch": 2.841854367774997, "grad_norm": 19.80004852344742, "learning_rate": 8.555467853202437e-08, "loss": 2.5536, "step": 2981 }, { "epoch": 2.8428077702300083, "grad_norm": 21.727050986061727, "learning_rate": 8.453602769143144e-08, "loss": 2.7033, "step": 2982 }, { "epoch": 2.84376117268502, "grad_norm": 21.19489442522277, "learning_rate": 8.352342576915807e-08, "loss": 2.7409, "step": 2983 }, { "epoch": 2.844714575140031, "grad_norm": 22.081630274071173, "learning_rate": 8.251687401130137e-08, "loss": 2.6397, "step": 2984 }, { "epoch": 2.8456679775950424, "grad_norm": 20.817768885925876, "learning_rate": 8.151637365651332e-08, "loss": 2.8828, "step": 2985 }, { "epoch": 2.8466213800500535, "grad_norm": 22.25747541358279, "learning_rate": 8.052192593599906e-08, "loss": 2.6517, "step": 2986 }, { "epoch": 2.847574782505065, "grad_norm": 17.471257375411664, "learning_rate": 7.953353207351633e-08, "loss": 2.5857, "step": 2987 }, { "epoch": 2.8485281849600765, "grad_norm": 18.567761453289446, "learning_rate": 7.855119328537109e-08, "loss": 2.6097, "step": 2988 }, { "epoch": 2.8494815874150876, "grad_norm": 19.240872337387252, "learning_rate": 7.757491078041968e-08, "loss": 2.568, "step": 2989 }, { "epoch": 2.8504349898700987, "grad_norm": 20.186690790234827, "learning_rate": 7.660468576006441e-08, "loss": 2.5819, "step": 2990 }, { "epoch": 2.85138839232511, "grad_norm": 21.044644103093553, "learning_rate": 7.564051941825523e-08, "loss": 2.7346, "step": 2991 }, { "epoch": 2.8523417947801217, "grad_norm": 22.99389847159462, "learning_rate": 7.468241294148471e-08, "loss": 2.5732, "step": 2992 }, { "epoch": 2.853295197235133, "grad_norm": 19.696780703041682, "learning_rate": 7.373036750878804e-08, "loss": 2.4976, "step": 2993 }, { "epoch": 2.8542485996901443, "grad_norm": 23.79535681566453, "learning_rate": 7.278438429174306e-08, "loss": 2.8614, "step": 2994 }, { "epoch": 2.8552020021451554, "grad_norm": 19.659722183138676, "learning_rate": 7.184446445446581e-08, "loss": 2.8093, "step": 2995 }, { "epoch": 2.856155404600167, "grad_norm": 21.117679329581197, "learning_rate": 7.09106091536127e-08, "loss": 2.769, "step": 2996 }, { "epoch": 2.857108807055178, "grad_norm": 22.421612806951686, "learning_rate": 6.998281953837616e-08, "loss": 2.7894, "step": 2997 }, { "epoch": 2.8580622095101895, "grad_norm": 20.563495105617925, "learning_rate": 6.906109675048345e-08, "loss": 2.6734, "step": 2998 }, { "epoch": 2.8590156119652006, "grad_norm": 21.885243294783542, "learning_rate": 6.814544192419781e-08, "loss": 2.8297, "step": 2999 }, { "epoch": 2.859969014420212, "grad_norm": 19.34238653174178, "learning_rate": 6.723585618631456e-08, "loss": 2.4109, "step": 3000 }, { "epoch": 2.8609224168752236, "grad_norm": 23.384319519088198, "learning_rate": 6.633234065616e-08, "loss": 2.659, "step": 3001 }, { "epoch": 2.8618758193302347, "grad_norm": 20.554897196462, "learning_rate": 6.543489644559086e-08, "loss": 2.7065, "step": 3002 }, { "epoch": 2.8628292217852462, "grad_norm": 21.04592787501544, "learning_rate": 6.454352465899205e-08, "loss": 2.5187, "step": 3003 }, { "epoch": 2.8637826242402573, "grad_norm": 22.828997361836933, "learning_rate": 6.365822639327724e-08, "loss": 2.6832, "step": 3004 }, { "epoch": 2.864736026695269, "grad_norm": 20.864479852752243, "learning_rate": 6.27790027378844e-08, "loss": 2.6585, "step": 3005 }, { "epoch": 2.86568942915028, "grad_norm": 22.029887707184102, "learning_rate": 6.190585477477806e-08, "loss": 2.8288, "step": 3006 }, { "epoch": 2.8666428316052914, "grad_norm": 25.354147154953793, "learning_rate": 6.103878357844372e-08, "loss": 2.5235, "step": 3007 }, { "epoch": 2.8675962340603025, "grad_norm": 23.81786861487126, "learning_rate": 6.017779021589065e-08, "loss": 2.561, "step": 3008 }, { "epoch": 2.868549636515314, "grad_norm": 21.321919019835793, "learning_rate": 5.932287574664797e-08, "loss": 2.6371, "step": 3009 }, { "epoch": 2.8695030389703255, "grad_norm": 19.940531284725154, "learning_rate": 5.847404122276412e-08, "loss": 2.4578, "step": 3010 }, { "epoch": 2.8704564414253366, "grad_norm": 18.286980721353697, "learning_rate": 5.763128768880688e-08, "loss": 2.7912, "step": 3011 }, { "epoch": 2.871409843880348, "grad_norm": 22.654395610161828, "learning_rate": 5.679461618185944e-08, "loss": 2.8403, "step": 3012 }, { "epoch": 2.872363246335359, "grad_norm": 20.555523721801798, "learning_rate": 5.5964027731521545e-08, "loss": 2.6978, "step": 3013 }, { "epoch": 2.8733166487903707, "grad_norm": 19.8725349579056, "learning_rate": 5.513952335990502e-08, "loss": 2.5564, "step": 3014 }, { "epoch": 2.874270051245382, "grad_norm": 23.489674173607316, "learning_rate": 5.432110408163771e-08, "loss": 2.6472, "step": 3015 }, { "epoch": 2.8752234537003933, "grad_norm": 20.688195798044926, "learning_rate": 5.350877090385731e-08, "loss": 2.8705, "step": 3016 }, { "epoch": 2.8761768561554044, "grad_norm": 22.73624832414736, "learning_rate": 5.270252482621252e-08, "loss": 2.7368, "step": 3017 }, { "epoch": 2.877130258610416, "grad_norm": 19.52781505831709, "learning_rate": 5.190236684086136e-08, "loss": 2.8752, "step": 3018 }, { "epoch": 2.8780836610654275, "grad_norm": 21.331078855249565, "learning_rate": 5.110829793247007e-08, "loss": 2.7511, "step": 3019 }, { "epoch": 2.8790370635204385, "grad_norm": 21.267883912276098, "learning_rate": 5.032031907821089e-08, "loss": 2.439, "step": 3020 }, { "epoch": 2.8799904659754496, "grad_norm": 21.95760547822613, "learning_rate": 4.953843124776259e-08, "loss": 2.613, "step": 3021 }, { "epoch": 2.880943868430461, "grad_norm": 21.44383967964471, "learning_rate": 4.8762635403308277e-08, "loss": 2.7831, "step": 3022 }, { "epoch": 2.8818972708854727, "grad_norm": 19.950860783894953, "learning_rate": 4.7992932499534296e-08, "loss": 2.5104, "step": 3023 }, { "epoch": 2.8828506733404837, "grad_norm": 19.96110539184598, "learning_rate": 4.722932348362852e-08, "loss": 2.7037, "step": 3024 }, { "epoch": 2.8838040757954952, "grad_norm": 18.6201361407244, "learning_rate": 4.647180929528094e-08, "loss": 2.6238, "step": 3025 }, { "epoch": 2.8847574782505063, "grad_norm": 18.432252641997447, "learning_rate": 4.5720390866679235e-08, "loss": 2.4544, "step": 3026 }, { "epoch": 2.885710880705518, "grad_norm": 20.574966558922416, "learning_rate": 4.4975069122512036e-08, "loss": 2.6981, "step": 3027 }, { "epoch": 2.8866642831605294, "grad_norm": 23.250482624283574, "learning_rate": 4.423584497996458e-08, "loss": 2.6006, "step": 3028 }, { "epoch": 2.8876176856155404, "grad_norm": 18.81309839617213, "learning_rate": 4.3502719348717524e-08, "loss": 2.5428, "step": 3029 }, { "epoch": 2.8885710880705515, "grad_norm": 24.66662326083493, "learning_rate": 4.2775693130948094e-08, "loss": 2.7922, "step": 3030 }, { "epoch": 2.889524490525563, "grad_norm": 20.86857636937723, "learning_rate": 4.205476722132673e-08, "loss": 2.5818, "step": 3031 }, { "epoch": 2.8904778929805746, "grad_norm": 18.91506597783647, "learning_rate": 4.1339942507018225e-08, "loss": 2.7483, "step": 3032 }, { "epoch": 2.8914312954355856, "grad_norm": 20.94442463435255, "learning_rate": 4.0631219867677816e-08, "loss": 2.5571, "step": 3033 }, { "epoch": 2.892384697890597, "grad_norm": 21.826926691290215, "learning_rate": 3.992860017545119e-08, "loss": 2.5103, "step": 3034 }, { "epoch": 2.8933381003456082, "grad_norm": 19.74337267003032, "learning_rate": 3.923208429497616e-08, "loss": 2.5952, "step": 3035 }, { "epoch": 2.8942915028006198, "grad_norm": 17.06492826711586, "learning_rate": 3.8541673083377086e-08, "loss": 2.6114, "step": 3036 }, { "epoch": 2.8952449052556313, "grad_norm": 18.731123873167277, "learning_rate": 3.785736739026713e-08, "loss": 2.5137, "step": 3037 }, { "epoch": 2.8961983077106424, "grad_norm": 16.413794289986225, "learning_rate": 3.717916805774602e-08, "loss": 2.6812, "step": 3038 }, { "epoch": 2.8971517101656534, "grad_norm": 21.89572886359567, "learning_rate": 3.650707592039782e-08, "loss": 2.7932, "step": 3039 }, { "epoch": 2.898105112620665, "grad_norm": 20.691064744587223, "learning_rate": 3.584109180529205e-08, "loss": 2.44, "step": 3040 }, { "epoch": 2.8990585150756765, "grad_norm": 19.12736309668983, "learning_rate": 3.518121653198259e-08, "loss": 2.6821, "step": 3041 }, { "epoch": 2.9000119175306875, "grad_norm": 20.369332759329108, "learning_rate": 3.452745091250431e-08, "loss": 2.5978, "step": 3042 }, { "epoch": 2.900965319985699, "grad_norm": 23.880952388662752, "learning_rate": 3.3879795751375346e-08, "loss": 2.3595, "step": 3043 }, { "epoch": 2.90191872244071, "grad_norm": 19.440066351163484, "learning_rate": 3.323825184559204e-08, "loss": 2.5892, "step": 3044 }, { "epoch": 2.9028721248957217, "grad_norm": 19.25476404945357, "learning_rate": 3.260281998463233e-08, "loss": 2.5894, "step": 3045 }, { "epoch": 2.903825527350733, "grad_norm": 20.908252975516643, "learning_rate": 3.197350095045126e-08, "loss": 2.6763, "step": 3046 }, { "epoch": 2.9047789298057443, "grad_norm": 23.77369935025236, "learning_rate": 3.1350295517483256e-08, "loss": 2.5202, "step": 3047 }, { "epoch": 2.9057323322607553, "grad_norm": 23.388601356162322, "learning_rate": 3.073320445263817e-08, "loss": 2.7385, "step": 3048 }, { "epoch": 2.906685734715767, "grad_norm": 23.809158772567763, "learning_rate": 3.0122228515300775e-08, "loss": 2.599, "step": 3049 }, { "epoch": 2.9076391371707784, "grad_norm": 23.010277954576875, "learning_rate": 2.9517368457332994e-08, "loss": 2.7148, "step": 3050 }, { "epoch": 2.9085925396257895, "grad_norm": 23.32077243305957, "learning_rate": 2.8918625023068302e-08, "loss": 2.4236, "step": 3051 }, { "epoch": 2.909545942080801, "grad_norm": 24.635507975507455, "learning_rate": 2.8325998949314536e-08, "loss": 2.5642, "step": 3052 }, { "epoch": 2.910499344535812, "grad_norm": 19.522873176760896, "learning_rate": 2.7739490965350558e-08, "loss": 2.6381, "step": 3053 }, { "epoch": 2.9114527469908236, "grad_norm": 21.898066009954466, "learning_rate": 2.715910179292791e-08, "loss": 2.7481, "step": 3054 }, { "epoch": 2.912406149445835, "grad_norm": 19.265626563439543, "learning_rate": 2.6584832146266393e-08, "loss": 2.9119, "step": 3055 }, { "epoch": 2.913359551900846, "grad_norm": 19.4448965650976, "learning_rate": 2.6016682732057375e-08, "loss": 2.5472, "step": 3056 }, { "epoch": 2.9143129543558572, "grad_norm": 20.377580260330376, "learning_rate": 2.5454654249458255e-08, "loss": 2.718, "step": 3057 }, { "epoch": 2.9152663568108688, "grad_norm": 21.81875105988914, "learning_rate": 2.489874739009579e-08, "loss": 2.7481, "step": 3058 }, { "epoch": 2.9162197592658803, "grad_norm": 21.28339133164087, "learning_rate": 2.434896283806387e-08, "loss": 2.5663, "step": 3059 }, { "epoch": 2.9171731617208914, "grad_norm": 21.269845690351417, "learning_rate": 2.3805301269920754e-08, "loss": 2.6142, "step": 3060 }, { "epoch": 2.918126564175903, "grad_norm": 16.421404884213665, "learning_rate": 2.326776335469072e-08, "loss": 2.592, "step": 3061 }, { "epoch": 2.919079966630914, "grad_norm": 20.20846629100004, "learning_rate": 2.2736349753862964e-08, "loss": 2.7791, "step": 3062 }, { "epoch": 2.9200333690859255, "grad_norm": 22.018109778017212, "learning_rate": 2.221106112138882e-08, "loss": 2.5433, "step": 3063 }, { "epoch": 2.9209867715409366, "grad_norm": 22.908606587451494, "learning_rate": 2.1691898103682885e-08, "loss": 2.5173, "step": 3064 }, { "epoch": 2.921940173995948, "grad_norm": 22.663085063160334, "learning_rate": 2.1178861339622435e-08, "loss": 2.8273, "step": 3065 }, { "epoch": 2.922893576450959, "grad_norm": 21.05892007964229, "learning_rate": 2.0671951460544128e-08, "loss": 2.5615, "step": 3066 }, { "epoch": 2.9238469789059707, "grad_norm": 18.989317995064756, "learning_rate": 2.0171169090246745e-08, "loss": 2.6446, "step": 3067 }, { "epoch": 2.924800381360982, "grad_norm": 22.98609844150043, "learning_rate": 1.9676514844987338e-08, "loss": 2.5332, "step": 3068 }, { "epoch": 2.9257537838159933, "grad_norm": 26.09405863015272, "learning_rate": 1.9187989333482317e-08, "loss": 2.7103, "step": 3069 }, { "epoch": 2.926707186271005, "grad_norm": 27.764759091637842, "learning_rate": 1.8705593156906343e-08, "loss": 2.6656, "step": 3070 }, { "epoch": 2.927660588726016, "grad_norm": 18.696881829442926, "learning_rate": 1.8229326908890677e-08, "loss": 2.6132, "step": 3071 }, { "epoch": 2.9286139911810274, "grad_norm": 22.33979115368809, "learning_rate": 1.775919117552427e-08, "loss": 2.5935, "step": 3072 }, { "epoch": 2.9295673936360385, "grad_norm": 20.566175378032046, "learning_rate": 1.7295186535349896e-08, "loss": 2.7156, "step": 3073 }, { "epoch": 2.93052079609105, "grad_norm": 21.376159818472104, "learning_rate": 1.6837313559368572e-08, "loss": 2.7005, "step": 3074 }, { "epoch": 2.931474198546061, "grad_norm": 21.17933810455834, "learning_rate": 1.6385572811032367e-08, "loss": 2.755, "step": 3075 }, { "epoch": 2.9324276010010726, "grad_norm": 20.194388098903953, "learning_rate": 1.593996484624938e-08, "loss": 2.3841, "step": 3076 }, { "epoch": 2.933381003456084, "grad_norm": 20.92688961741611, "learning_rate": 1.550049021337985e-08, "loss": 2.6317, "step": 3077 }, { "epoch": 2.934334405911095, "grad_norm": 25.95276200406615, "learning_rate": 1.5067149453237285e-08, "loss": 2.8903, "step": 3078 }, { "epoch": 2.9352878083661067, "grad_norm": 20.828870399110528, "learning_rate": 1.4639943099085673e-08, "loss": 2.3873, "step": 3079 }, { "epoch": 2.9362412108211178, "grad_norm": 19.174006375145034, "learning_rate": 1.42188716766406e-08, "loss": 2.7318, "step": 3080 }, { "epoch": 2.9371946132761293, "grad_norm": 22.38287834326002, "learning_rate": 1.3803935704068683e-08, "loss": 2.5076, "step": 3081 }, { "epoch": 2.9381480157311404, "grad_norm": 19.804824025654057, "learning_rate": 1.3395135691985361e-08, "loss": 2.6646, "step": 3082 }, { "epoch": 2.939101418186152, "grad_norm": 19.459553701247106, "learning_rate": 1.2992472143455449e-08, "loss": 2.5034, "step": 3083 }, { "epoch": 2.940054820641163, "grad_norm": 21.041062981080525, "learning_rate": 1.2595945553992572e-08, "loss": 2.7467, "step": 3084 }, { "epoch": 2.9410082230961745, "grad_norm": 20.240310885540787, "learning_rate": 1.2205556411558628e-08, "loss": 2.726, "step": 3085 }, { "epoch": 2.941961625551186, "grad_norm": 19.72213639746697, "learning_rate": 1.1821305196562105e-08, "loss": 2.69, "step": 3086 }, { "epoch": 2.942915028006197, "grad_norm": 20.27523350653406, "learning_rate": 1.1443192381858647e-08, "loss": 2.536, "step": 3087 }, { "epoch": 2.943868430461208, "grad_norm": 21.329420476570746, "learning_rate": 1.1071218432749942e-08, "loss": 2.7747, "step": 3088 }, { "epoch": 2.9448218329162197, "grad_norm": 23.934305687926887, "learning_rate": 1.0705383806982606e-08, "loss": 2.3942, "step": 3089 }, { "epoch": 2.945775235371231, "grad_norm": 18.906995071718278, "learning_rate": 1.03456889547493e-08, "loss": 2.6104, "step": 3090 }, { "epoch": 2.9467286378262423, "grad_norm": 24.23519093359619, "learning_rate": 9.992134318687063e-09, "loss": 2.6968, "step": 3091 }, { "epoch": 2.947682040281254, "grad_norm": 22.331342931585596, "learning_rate": 9.6447203338762e-09, "loss": 2.7455, "step": 3092 }, { "epoch": 2.948635442736265, "grad_norm": 25.26789637043759, "learning_rate": 9.303447427840839e-09, "loss": 2.8091, "step": 3093 }, { "epoch": 2.9495888451912764, "grad_norm": 25.36015477012787, "learning_rate": 8.968316020547263e-09, "loss": 2.5015, "step": 3094 }, { "epoch": 2.950542247646288, "grad_norm": 24.20680273626574, "learning_rate": 8.639326524405577e-09, "loss": 2.5738, "step": 3095 }, { "epoch": 2.951495650101299, "grad_norm": 20.72162599951166, "learning_rate": 8.316479344266382e-09, "loss": 2.5859, "step": 3096 }, { "epoch": 2.95244905255631, "grad_norm": 22.132664448625377, "learning_rate": 7.999774877421873e-09, "loss": 2.7231, "step": 3097 }, { "epoch": 2.9534024550113216, "grad_norm": 18.25300186949887, "learning_rate": 7.689213513605298e-09, "loss": 2.5097, "step": 3098 }, { "epoch": 2.954355857466333, "grad_norm": 18.410557804793356, "learning_rate": 7.384795634990394e-09, "loss": 2.5396, "step": 3099 }, { "epoch": 2.955309259921344, "grad_norm": 20.45033797520845, "learning_rate": 7.0865216161902785e-09, "loss": 2.9224, "step": 3100 }, { "epoch": 2.9562626623763557, "grad_norm": 23.567824637703115, "learning_rate": 6.7943918242580065e-09, "loss": 2.3592, "step": 3101 }, { "epoch": 2.957216064831367, "grad_norm": 20.32555459469005, "learning_rate": 6.508406618686014e-09, "loss": 2.9035, "step": 3102 }, { "epoch": 2.9581694672863783, "grad_norm": 25.410350822207892, "learning_rate": 6.228566351403342e-09, "loss": 2.4937, "step": 3103 }, { "epoch": 2.95912286974139, "grad_norm": 22.146730372472195, "learning_rate": 5.954871366779525e-09, "loss": 2.6921, "step": 3104 }, { "epoch": 2.960076272196401, "grad_norm": 21.936472207570848, "learning_rate": 5.687322001620699e-09, "loss": 2.7266, "step": 3105 }, { "epoch": 2.961029674651412, "grad_norm": 21.635104789637612, "learning_rate": 5.425918585170164e-09, "loss": 2.6894, "step": 3106 }, { "epoch": 2.9619830771064235, "grad_norm": 20.588676696005408, "learning_rate": 5.170661439107827e-09, "loss": 2.7392, "step": 3107 }, { "epoch": 2.962936479561435, "grad_norm": 22.988095514121763, "learning_rate": 4.921550877550752e-09, "loss": 2.3585, "step": 3108 }, { "epoch": 2.963889882016446, "grad_norm": 25.528466787083197, "learning_rate": 4.678587207052055e-09, "loss": 2.5958, "step": 3109 }, { "epoch": 2.9648432844714576, "grad_norm": 22.492363942873162, "learning_rate": 4.441770726599792e-09, "loss": 2.3747, "step": 3110 }, { "epoch": 2.9657966869264687, "grad_norm": 23.88857676829528, "learning_rate": 4.211101727618072e-09, "loss": 2.5564, "step": 3111 }, { "epoch": 2.96675008938148, "grad_norm": 22.57921421284056, "learning_rate": 3.9865804939659414e-09, "loss": 2.4457, "step": 3112 }, { "epoch": 2.9677034918364917, "grad_norm": 21.133619516810924, "learning_rate": 3.768207301936278e-09, "loss": 2.6699, "step": 3113 }, { "epoch": 2.968656894291503, "grad_norm": 18.98085219535749, "learning_rate": 3.5559824202574533e-09, "loss": 2.7173, "step": 3114 }, { "epoch": 2.969610296746514, "grad_norm": 23.79024687692563, "learning_rate": 3.3499061100911167e-09, "loss": 2.9072, "step": 3115 }, { "epoch": 2.9705636992015254, "grad_norm": 23.5266036010111, "learning_rate": 3.1499786250321904e-09, "loss": 2.6772, "step": 3116 }, { "epoch": 2.971517101656537, "grad_norm": 22.600982319284427, "learning_rate": 2.956200211109428e-09, "loss": 2.5699, "step": 3117 }, { "epoch": 2.972470504111548, "grad_norm": 22.77014435846837, "learning_rate": 2.7685711067848564e-09, "loss": 2.7385, "step": 3118 }, { "epoch": 2.9734239065665595, "grad_norm": 20.77078138431692, "learning_rate": 2.587091542952669e-09, "loss": 2.5512, "step": 3119 }, { "epoch": 2.9743773090215706, "grad_norm": 22.584819950684302, "learning_rate": 2.411761742939778e-09, "loss": 2.6394, "step": 3120 }, { "epoch": 2.975330711476582, "grad_norm": 21.79053514225879, "learning_rate": 2.242581922504705e-09, "loss": 2.8506, "step": 3121 }, { "epoch": 2.9762841139315936, "grad_norm": 23.20607368798548, "learning_rate": 2.0795522898392486e-09, "loss": 2.5727, "step": 3122 }, { "epoch": 2.9772375163866047, "grad_norm": 22.745400922174184, "learning_rate": 1.922673045565149e-09, "loss": 2.727, "step": 3123 }, { "epoch": 2.978190918841616, "grad_norm": 24.5230457188168, "learning_rate": 1.7719443827368677e-09, "loss": 2.8481, "step": 3124 }, { "epoch": 2.9791443212966273, "grad_norm": 21.35808845663882, "learning_rate": 1.6273664868399209e-09, "loss": 2.6197, "step": 3125 }, { "epoch": 2.980097723751639, "grad_norm": 22.28904453058906, "learning_rate": 1.4889395357892134e-09, "loss": 2.6558, "step": 3126 }, { "epoch": 2.98105112620665, "grad_norm": 19.146439055614437, "learning_rate": 1.356663699931815e-09, "loss": 2.6826, "step": 3127 }, { "epoch": 2.9820045286616614, "grad_norm": 20.609004610301515, "learning_rate": 1.2305391420458502e-09, "loss": 2.5949, "step": 3128 }, { "epoch": 2.9829579311166725, "grad_norm": 18.26248014402715, "learning_rate": 1.110566017337722e-09, "loss": 2.7458, "step": 3129 }, { "epoch": 2.983911333571684, "grad_norm": 22.22042631553787, "learning_rate": 9.967444734459985e-10, "loss": 2.7814, "step": 3130 }, { "epoch": 2.984864736026695, "grad_norm": 22.600901308061406, "learning_rate": 8.890746504375269e-10, "loss": 2.6931, "step": 3131 }, { "epoch": 2.9858181384817066, "grad_norm": 21.2336524853353, "learning_rate": 7.875566808107638e-10, "loss": 2.5847, "step": 3132 }, { "epoch": 2.9867715409367177, "grad_norm": 20.773070069803218, "learning_rate": 6.921906894913344e-10, "loss": 2.491, "step": 3133 }, { "epoch": 2.9877249433917292, "grad_norm": 20.714276605871632, "learning_rate": 6.029767938364739e-10, "loss": 2.8442, "step": 3134 }, { "epoch": 2.9886783458467407, "grad_norm": 18.61027354474083, "learning_rate": 5.199151036311412e-10, "loss": 2.7437, "step": 3135 }, { "epoch": 2.989631748301752, "grad_norm": 19.669732264070618, "learning_rate": 4.4300572109134965e-10, "loss": 2.5945, "step": 3136 }, { "epoch": 2.9905851507567633, "grad_norm": 21.056529869206653, "learning_rate": 3.722487408597264e-10, "loss": 2.672, "step": 3137 }, { "epoch": 2.9915385532117744, "grad_norm": 22.135812177963018, "learning_rate": 3.07644250009953e-10, "loss": 2.6671, "step": 3138 }, { "epoch": 2.992491955666786, "grad_norm": 21.757466152206444, "learning_rate": 2.4919232804287985e-10, "loss": 2.5688, "step": 3139 }, { "epoch": 2.993445358121797, "grad_norm": 22.456983480291118, "learning_rate": 1.9689304688985667e-10, "loss": 2.846, "step": 3140 }, { "epoch": 2.9943987605768085, "grad_norm": 21.83246420443493, "learning_rate": 1.5074647090884688e-10, "loss": 2.6887, "step": 3141 }, { "epoch": 2.9953521630318196, "grad_norm": 19.97310513907745, "learning_rate": 1.1075265688775816e-10, "loss": 2.7623, "step": 3142 }, { "epoch": 2.996305565486831, "grad_norm": 25.934681642188828, "learning_rate": 7.691165404277723e-11, "loss": 2.842, "step": 3143 }, { "epoch": 2.9972589679418427, "grad_norm": 21.933864965766542, "learning_rate": 4.922350401781461e-11, "loss": 2.8244, "step": 3144 }, { "epoch": 2.9982123703968537, "grad_norm": 23.01166089383451, "learning_rate": 2.7688240885614947e-11, "loss": 2.5724, "step": 3145 }, { "epoch": 2.9991657728518653, "grad_norm": 20.441634711697766, "learning_rate": 1.2305891147756932e-11, "loss": 2.8254, "step": 3146 }, { "epoch": 3.0, "grad_norm": 24.832368204794868, "learning_rate": 3.0764737335431394e-12, "loss": 2.8047, "step": 3147 }, { "epoch": 3.0, "step": 3147, "total_flos": 1606685134080.0, "train_loss": 3.1898732178545317, "train_runtime": 9803.7016, "train_samples_per_second": 10.27, "train_steps_per_second": 0.321 } ], "logging_steps": 1, "max_steps": 3147, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1606685134080.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }