Text3DSAM / trainer_state.json
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 30.0,
"eval_steps": 500,
"global_step": 23130,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0311284046692607,
"grad_norm": 11.856830596923828,
"learning_rate": 3.314121037463977e-06,
"loss": 2.0132,
"step": 24
},
{
"epoch": 0.0622568093385214,
"grad_norm": 1.1055241823196411,
"learning_rate": 6.7723342939481265e-06,
"loss": 1.0751,
"step": 48
},
{
"epoch": 0.0933852140077821,
"grad_norm": 1.4284316301345825,
"learning_rate": 1.0230547550432277e-05,
"loss": 1.038,
"step": 72
},
{
"epoch": 0.1245136186770428,
"grad_norm": 0.6491210460662842,
"learning_rate": 1.3688760806916426e-05,
"loss": 0.9988,
"step": 96
},
{
"epoch": 0.1556420233463035,
"grad_norm": 0.39960888028144836,
"learning_rate": 1.7146974063400578e-05,
"loss": 0.9849,
"step": 120
},
{
"epoch": 0.1867704280155642,
"grad_norm": 0.678686797618866,
"learning_rate": 2.060518731988473e-05,
"loss": 0.9767,
"step": 144
},
{
"epoch": 0.2178988326848249,
"grad_norm": 0.5720846056938171,
"learning_rate": 2.406340057636888e-05,
"loss": 0.9684,
"step": 168
},
{
"epoch": 0.2490272373540856,
"grad_norm": 0.6077919602394104,
"learning_rate": 2.7521613832853026e-05,
"loss": 0.967,
"step": 192
},
{
"epoch": 0.2801556420233463,
"grad_norm": 0.7459629774093628,
"learning_rate": 3.097982708933718e-05,
"loss": 0.9553,
"step": 216
},
{
"epoch": 0.311284046692607,
"grad_norm": 0.8133582472801208,
"learning_rate": 3.443804034582133e-05,
"loss": 0.9474,
"step": 240
},
{
"epoch": 0.3424124513618677,
"grad_norm": 0.7748175263404846,
"learning_rate": 3.7896253602305474e-05,
"loss": 0.9404,
"step": 264
},
{
"epoch": 0.3735408560311284,
"grad_norm": 0.9244363903999329,
"learning_rate": 4.135446685878963e-05,
"loss": 0.9326,
"step": 288
},
{
"epoch": 0.4046692607003891,
"grad_norm": 1.0879474878311157,
"learning_rate": 4.4812680115273775e-05,
"loss": 0.9112,
"step": 312
},
{
"epoch": 0.4357976653696498,
"grad_norm": 1.3521575927734375,
"learning_rate": 4.827089337175792e-05,
"loss": 0.9033,
"step": 336
},
{
"epoch": 0.4669260700389105,
"grad_norm": 1.4220598936080933,
"learning_rate": 5.1729106628242076e-05,
"loss": 0.8877,
"step": 360
},
{
"epoch": 0.4980544747081712,
"grad_norm": 1.345544457435608,
"learning_rate": 5.518731988472623e-05,
"loss": 0.8726,
"step": 384
},
{
"epoch": 0.5291828793774319,
"grad_norm": 1.7283053398132324,
"learning_rate": 5.864553314121038e-05,
"loss": 0.8553,
"step": 408
},
{
"epoch": 0.5603112840466926,
"grad_norm": 1.2822779417037964,
"learning_rate": 6.210374639769453e-05,
"loss": 0.841,
"step": 432
},
{
"epoch": 0.5914396887159533,
"grad_norm": 2.8578877449035645,
"learning_rate": 6.556195965417868e-05,
"loss": 0.8262,
"step": 456
},
{
"epoch": 0.622568093385214,
"grad_norm": 1.603874683380127,
"learning_rate": 6.902017291066282e-05,
"loss": 0.7989,
"step": 480
},
{
"epoch": 0.6536964980544747,
"grad_norm": 2.2428903579711914,
"learning_rate": 7.247838616714697e-05,
"loss": 0.7958,
"step": 504
},
{
"epoch": 0.6848249027237354,
"grad_norm": 1.6760625839233398,
"learning_rate": 7.593659942363113e-05,
"loss": 0.7799,
"step": 528
},
{
"epoch": 0.7159533073929961,
"grad_norm": 2.1145055294036865,
"learning_rate": 7.939481268011528e-05,
"loss": 0.7671,
"step": 552
},
{
"epoch": 0.7470817120622568,
"grad_norm": 1.3805097341537476,
"learning_rate": 8.285302593659943e-05,
"loss": 0.7563,
"step": 576
},
{
"epoch": 0.7782101167315175,
"grad_norm": 2.1005349159240723,
"learning_rate": 8.631123919308359e-05,
"loss": 0.7396,
"step": 600
},
{
"epoch": 0.8093385214007782,
"grad_norm": 1.6995466947555542,
"learning_rate": 8.976945244956772e-05,
"loss": 0.738,
"step": 624
},
{
"epoch": 0.8404669260700389,
"grad_norm": 1.5165631771087646,
"learning_rate": 9.322766570605188e-05,
"loss": 0.7208,
"step": 648
},
{
"epoch": 0.8715953307392996,
"grad_norm": 1.4923312664031982,
"learning_rate": 9.668587896253603e-05,
"loss": 0.7126,
"step": 672
},
{
"epoch": 0.9027237354085603,
"grad_norm": 1.7333067655563354,
"learning_rate": 9.999999950982757e-05,
"loss": 0.6998,
"step": 696
},
{
"epoch": 0.933852140077821,
"grad_norm": 1.5790561437606812,
"learning_rate": 9.999969364253642e-05,
"loss": 0.6943,
"step": 720
},
{
"epoch": 0.9649805447470817,
"grad_norm": 2.4895715713500977,
"learning_rate": 9.999882310058304e-05,
"loss": 0.6887,
"step": 744
},
{
"epoch": 0.9961089494163424,
"grad_norm": 1.2938724756240845,
"learning_rate": 9.999738789379896e-05,
"loss": 0.6728,
"step": 768
},
{
"epoch": 1.027237354085603,
"grad_norm": 2.313992738723755,
"learning_rate": 9.999538803839277e-05,
"loss": 0.6704,
"step": 792
},
{
"epoch": 1.0583657587548638,
"grad_norm": 2.3841969966888428,
"learning_rate": 9.999282355694997e-05,
"loss": 0.6683,
"step": 816
},
{
"epoch": 1.0894941634241244,
"grad_norm": 1.7238163948059082,
"learning_rate": 9.998969447843267e-05,
"loss": 0.6598,
"step": 840
},
{
"epoch": 1.1206225680933852,
"grad_norm": 1.889561653137207,
"learning_rate": 9.998600083817934e-05,
"loss": 0.6469,
"step": 864
},
{
"epoch": 1.1517509727626458,
"grad_norm": 2.078350305557251,
"learning_rate": 9.998174267790433e-05,
"loss": 0.6394,
"step": 888
},
{
"epoch": 1.1828793774319066,
"grad_norm": 3.088223934173584,
"learning_rate": 9.99769200456974e-05,
"loss": 0.642,
"step": 912
},
{
"epoch": 1.2140077821011672,
"grad_norm": 1.8867013454437256,
"learning_rate": 9.997153299602332e-05,
"loss": 0.6365,
"step": 936
},
{
"epoch": 1.245136186770428,
"grad_norm": 2.187405586242676,
"learning_rate": 9.9965581589721e-05,
"loss": 0.6216,
"step": 960
},
{
"epoch": 1.2762645914396886,
"grad_norm": 1.5248736143112183,
"learning_rate": 9.995906589400307e-05,
"loss": 0.6208,
"step": 984
},
{
"epoch": 1.3073929961089494,
"grad_norm": 1.3533403873443604,
"learning_rate": 9.995198598245492e-05,
"loss": 0.6143,
"step": 1008
},
{
"epoch": 1.3385214007782102,
"grad_norm": 1.9436872005462646,
"learning_rate": 9.994434193503399e-05,
"loss": 0.6101,
"step": 1032
},
{
"epoch": 1.3696498054474708,
"grad_norm": 1.5890527963638306,
"learning_rate": 9.993613383806879e-05,
"loss": 0.6011,
"step": 1056
},
{
"epoch": 1.4007782101167314,
"grad_norm": 1.6845647096633911,
"learning_rate": 9.9927361784258e-05,
"loss": 0.6022,
"step": 1080
},
{
"epoch": 1.4319066147859922,
"grad_norm": 1.5048511028289795,
"learning_rate": 9.991802587266932e-05,
"loss": 0.6078,
"step": 1104
},
{
"epoch": 1.463035019455253,
"grad_norm": 1.8788032531738281,
"learning_rate": 9.990812620873848e-05,
"loss": 0.6014,
"step": 1128
},
{
"epoch": 1.4941634241245136,
"grad_norm": 2.0226938724517822,
"learning_rate": 9.989766290426795e-05,
"loss": 0.5912,
"step": 1152
},
{
"epoch": 1.5252918287937742,
"grad_norm": 1.9385308027267456,
"learning_rate": 9.98866360774257e-05,
"loss": 0.5812,
"step": 1176
},
{
"epoch": 1.556420233463035,
"grad_norm": 1.2753961086273193,
"learning_rate": 9.98750458527439e-05,
"loss": 0.5825,
"step": 1200
},
{
"epoch": 1.5875486381322959,
"grad_norm": 1.6889104843139648,
"learning_rate": 9.986289236111747e-05,
"loss": 0.58,
"step": 1224
},
{
"epoch": 1.6186770428015564,
"grad_norm": 2.130415916442871,
"learning_rate": 9.985017573980262e-05,
"loss": 0.5853,
"step": 1248
},
{
"epoch": 1.649805447470817,
"grad_norm": 1.8879446983337402,
"learning_rate": 9.983689613241531e-05,
"loss": 0.5806,
"step": 1272
},
{
"epoch": 1.6809338521400778,
"grad_norm": 1.2330986261367798,
"learning_rate": 9.982305368892964e-05,
"loss": 0.574,
"step": 1296
},
{
"epoch": 1.7120622568093387,
"grad_norm": 1.389142632484436,
"learning_rate": 9.980864856567606e-05,
"loss": 0.5743,
"step": 1320
},
{
"epoch": 1.7431906614785992,
"grad_norm": 1.184691309928894,
"learning_rate": 9.979368092533978e-05,
"loss": 0.5691,
"step": 1344
},
{
"epoch": 1.7743190661478598,
"grad_norm": 1.3246943950653076,
"learning_rate": 9.977815093695875e-05,
"loss": 0.5669,
"step": 1368
},
{
"epoch": 1.8054474708171206,
"grad_norm": 1.5033084154129028,
"learning_rate": 9.976205877592189e-05,
"loss": 0.5636,
"step": 1392
},
{
"epoch": 1.8365758754863815,
"grad_norm": 2.675381660461426,
"learning_rate": 9.974540462396697e-05,
"loss": 0.5554,
"step": 1416
},
{
"epoch": 1.867704280155642,
"grad_norm": 1.4676384925842285,
"learning_rate": 9.972818866917877e-05,
"loss": 0.5526,
"step": 1440
},
{
"epoch": 1.8988326848249026,
"grad_norm": 2.269249200820923,
"learning_rate": 9.971041110598669e-05,
"loss": 0.556,
"step": 1464
},
{
"epoch": 1.9299610894941635,
"grad_norm": 1.7739601135253906,
"learning_rate": 9.969207213516279e-05,
"loss": 0.5546,
"step": 1488
},
{
"epoch": 1.9610894941634243,
"grad_norm": 1.2574249505996704,
"learning_rate": 9.967317196381936e-05,
"loss": 0.549,
"step": 1512
},
{
"epoch": 1.9922178988326849,
"grad_norm": 1.65413236618042,
"learning_rate": 9.965371080540666e-05,
"loss": 0.5537,
"step": 1536
},
{
"epoch": 2.0233463035019454,
"grad_norm": 1.6155718564987183,
"learning_rate": 9.96336888797105e-05,
"loss": 0.5424,
"step": 1560
},
{
"epoch": 2.054474708171206,
"grad_norm": 1.556755542755127,
"learning_rate": 9.961310641284977e-05,
"loss": 0.5396,
"step": 1584
},
{
"epoch": 2.085603112840467,
"grad_norm": 1.5641894340515137,
"learning_rate": 9.959196363727383e-05,
"loss": 0.5465,
"step": 1608
},
{
"epoch": 2.1167315175097277,
"grad_norm": 1.4483375549316406,
"learning_rate": 9.957026079175996e-05,
"loss": 0.5401,
"step": 1632
},
{
"epoch": 2.1478599221789882,
"grad_norm": 1.8051731586456299,
"learning_rate": 9.954799812141054e-05,
"loss": 0.541,
"step": 1656
},
{
"epoch": 2.178988326848249,
"grad_norm": 2.337942361831665,
"learning_rate": 9.952517587765049e-05,
"loss": 0.5359,
"step": 1680
},
{
"epoch": 2.21011673151751,
"grad_norm": 1.5796310901641846,
"learning_rate": 9.950179431822421e-05,
"loss": 0.5361,
"step": 1704
},
{
"epoch": 2.2412451361867705,
"grad_norm": 1.3433961868286133,
"learning_rate": 9.947785370719281e-05,
"loss": 0.5254,
"step": 1728
},
{
"epoch": 2.272373540856031,
"grad_norm": 1.8424466848373413,
"learning_rate": 9.945335431493108e-05,
"loss": 0.5278,
"step": 1752
},
{
"epoch": 2.3035019455252916,
"grad_norm": 1.280912160873413,
"learning_rate": 9.942829641812445e-05,
"loss": 0.5314,
"step": 1776
},
{
"epoch": 2.3346303501945527,
"grad_norm": 2.389176368713379,
"learning_rate": 9.94026802997658e-05,
"loss": 0.5272,
"step": 1800
},
{
"epoch": 2.3657587548638133,
"grad_norm": 1.804115653038025,
"learning_rate": 9.93765062491524e-05,
"loss": 0.5214,
"step": 1824
},
{
"epoch": 2.396887159533074,
"grad_norm": 2.4799587726593018,
"learning_rate": 9.934977456188253e-05,
"loss": 0.5228,
"step": 1848
},
{
"epoch": 2.4280155642023344,
"grad_norm": 1.3502540588378906,
"learning_rate": 9.932248553985213e-05,
"loss": 0.5269,
"step": 1872
},
{
"epoch": 2.4591439688715955,
"grad_norm": 1.9639521837234497,
"learning_rate": 9.929463949125151e-05,
"loss": 0.5244,
"step": 1896
},
{
"epoch": 2.490272373540856,
"grad_norm": 1.5300196409225464,
"learning_rate": 9.926623673056173e-05,
"loss": 0.5163,
"step": 1920
},
{
"epoch": 2.5214007782101167,
"grad_norm": 1.3195667266845703,
"learning_rate": 9.923727757855117e-05,
"loss": 0.5155,
"step": 1944
},
{
"epoch": 2.5525291828793772,
"grad_norm": 1.3704023361206055,
"learning_rate": 9.920776236227181e-05,
"loss": 0.5164,
"step": 1968
},
{
"epoch": 2.5836575875486383,
"grad_norm": 1.2443211078643799,
"learning_rate": 9.917769141505557e-05,
"loss": 0.5119,
"step": 1992
},
{
"epoch": 2.614785992217899,
"grad_norm": 1.7005102634429932,
"learning_rate": 9.91470650765106e-05,
"loss": 0.5191,
"step": 2016
},
{
"epoch": 2.6459143968871595,
"grad_norm": 1.742263674736023,
"learning_rate": 9.911588369251736e-05,
"loss": 0.5207,
"step": 2040
},
{
"epoch": 2.6770428015564205,
"grad_norm": 2.342224597930908,
"learning_rate": 9.908414761522473e-05,
"loss": 0.5116,
"step": 2064
},
{
"epoch": 2.708171206225681,
"grad_norm": 1.481919765472412,
"learning_rate": 9.905185720304612e-05,
"loss": 0.5169,
"step": 2088
},
{
"epoch": 2.7392996108949417,
"grad_norm": 2.477743148803711,
"learning_rate": 9.901901282065529e-05,
"loss": 0.5125,
"step": 2112
},
{
"epoch": 2.7704280155642023,
"grad_norm": 1.231108546257019,
"learning_rate": 9.898561483898233e-05,
"loss": 0.5119,
"step": 2136
},
{
"epoch": 2.801556420233463,
"grad_norm": 1.6876734495162964,
"learning_rate": 9.895166363520943e-05,
"loss": 0.5098,
"step": 2160
},
{
"epoch": 2.832684824902724,
"grad_norm": 1.886053442955017,
"learning_rate": 9.891715959276664e-05,
"loss": 0.509,
"step": 2184
},
{
"epoch": 2.8638132295719845,
"grad_norm": 2.044147253036499,
"learning_rate": 9.88821031013275e-05,
"loss": 0.5081,
"step": 2208
},
{
"epoch": 2.894941634241245,
"grad_norm": 1.8339983224868774,
"learning_rate": 9.88464945568047e-05,
"loss": 0.5031,
"step": 2232
},
{
"epoch": 2.926070038910506,
"grad_norm": 2.0237860679626465,
"learning_rate": 9.881033436134555e-05,
"loss": 0.5026,
"step": 2256
},
{
"epoch": 2.9571984435797667,
"grad_norm": 1.222092866897583,
"learning_rate": 9.877362292332749e-05,
"loss": 0.4922,
"step": 2280
},
{
"epoch": 2.9883268482490273,
"grad_norm": 2.8668859004974365,
"learning_rate": 9.873636065735343e-05,
"loss": 0.4978,
"step": 2304
},
{
"epoch": 3.019455252918288,
"grad_norm": 1.8704198598861694,
"learning_rate": 9.869854798424709e-05,
"loss": 0.4999,
"step": 2328
},
{
"epoch": 3.0505836575875485,
"grad_norm": 1.3280694484710693,
"learning_rate": 9.866018533104826e-05,
"loss": 0.4979,
"step": 2352
},
{
"epoch": 3.0817120622568095,
"grad_norm": 1.6099941730499268,
"learning_rate": 9.862127313100795e-05,
"loss": 0.4966,
"step": 2376
},
{
"epoch": 3.11284046692607,
"grad_norm": 1.797253131866455,
"learning_rate": 9.858181182358355e-05,
"loss": 0.4913,
"step": 2400
},
{
"epoch": 3.1439688715953307,
"grad_norm": 1.4523372650146484,
"learning_rate": 9.854180185443378e-05,
"loss": 0.494,
"step": 2424
},
{
"epoch": 3.1750972762645913,
"grad_norm": 1.665285587310791,
"learning_rate": 9.850124367541371e-05,
"loss": 0.495,
"step": 2448
},
{
"epoch": 3.2062256809338523,
"grad_norm": 1.2931227684020996,
"learning_rate": 9.84601377445697e-05,
"loss": 0.4949,
"step": 2472
},
{
"epoch": 3.237354085603113,
"grad_norm": 2.0045413970947266,
"learning_rate": 9.841848452613412e-05,
"loss": 0.4901,
"step": 2496
},
{
"epoch": 3.2684824902723735,
"grad_norm": 1.2784613370895386,
"learning_rate": 9.83762844905202e-05,
"loss": 0.4967,
"step": 2520
},
{
"epoch": 3.299610894941634,
"grad_norm": 1.485795497894287,
"learning_rate": 9.833353811431669e-05,
"loss": 0.4921,
"step": 2544
},
{
"epoch": 3.330739299610895,
"grad_norm": 2.1288626194000244,
"learning_rate": 9.829024588028244e-05,
"loss": 0.4912,
"step": 2568
},
{
"epoch": 3.3618677042801557,
"grad_norm": 1.5102566480636597,
"learning_rate": 9.824640827734102e-05,
"loss": 0.4938,
"step": 2592
},
{
"epoch": 3.3929961089494163,
"grad_norm": 2.126574993133545,
"learning_rate": 9.820202580057512e-05,
"loss": 0.4881,
"step": 2616
},
{
"epoch": 3.424124513618677,
"grad_norm": 1.1427215337753296,
"learning_rate": 9.8157098951221e-05,
"loss": 0.4956,
"step": 2640
},
{
"epoch": 3.455252918287938,
"grad_norm": 1.847524881362915,
"learning_rate": 9.811162823666287e-05,
"loss": 0.4883,
"step": 2664
},
{
"epoch": 3.4863813229571985,
"grad_norm": 1.3941086530685425,
"learning_rate": 9.806561417042706e-05,
"loss": 0.488,
"step": 2688
},
{
"epoch": 3.517509727626459,
"grad_norm": 1.7835474014282227,
"learning_rate": 9.801905727217631e-05,
"loss": 0.4796,
"step": 2712
},
{
"epoch": 3.5486381322957197,
"grad_norm": 2.4145917892456055,
"learning_rate": 9.797195806770387e-05,
"loss": 0.4856,
"step": 2736
},
{
"epoch": 3.5797665369649807,
"grad_norm": 1.6567249298095703,
"learning_rate": 9.792431708892752e-05,
"loss": 0.4799,
"step": 2760
},
{
"epoch": 3.6108949416342413,
"grad_norm": 1.7985295057296753,
"learning_rate": 9.787613487388365e-05,
"loss": 0.4886,
"step": 2784
},
{
"epoch": 3.642023346303502,
"grad_norm": 1.7581013441085815,
"learning_rate": 9.78274119667211e-05,
"loss": 0.4835,
"step": 2808
},
{
"epoch": 3.673151750972763,
"grad_norm": 1.6254545450210571,
"learning_rate": 9.777814891769507e-05,
"loss": 0.4841,
"step": 2832
},
{
"epoch": 3.7042801556420235,
"grad_norm": 1.745969295501709,
"learning_rate": 9.772834628316087e-05,
"loss": 0.4848,
"step": 2856
},
{
"epoch": 3.735408560311284,
"grad_norm": 1.762830138206482,
"learning_rate": 9.767800462556769e-05,
"loss": 0.476,
"step": 2880
},
{
"epoch": 3.7665369649805447,
"grad_norm": 1.6283063888549805,
"learning_rate": 9.762712451345217e-05,
"loss": 0.48,
"step": 2904
},
{
"epoch": 3.7976653696498053,
"grad_norm": 1.7204512357711792,
"learning_rate": 9.757570652143202e-05,
"loss": 0.4746,
"step": 2928
},
{
"epoch": 3.8287937743190663,
"grad_norm": 2.6043598651885986,
"learning_rate": 9.752375123019956e-05,
"loss": 0.4805,
"step": 2952
},
{
"epoch": 3.859922178988327,
"grad_norm": 2.134938955307007,
"learning_rate": 9.74712592265151e-05,
"loss": 0.4776,
"step": 2976
},
{
"epoch": 3.8910505836575875,
"grad_norm": 1.4748331308364868,
"learning_rate": 9.741823110320037e-05,
"loss": 0.4725,
"step": 3000
},
{
"epoch": 3.9221789883268485,
"grad_norm": 2.2188987731933594,
"learning_rate": 9.73646674591318e-05,
"loss": 0.4781,
"step": 3024
},
{
"epoch": 3.953307392996109,
"grad_norm": 1.2936460971832275,
"learning_rate": 9.731056889923374e-05,
"loss": 0.4808,
"step": 3048
},
{
"epoch": 3.9844357976653697,
"grad_norm": 2.5133862495422363,
"learning_rate": 9.725593603447166e-05,
"loss": 0.4839,
"step": 3072
},
{
"epoch": 4.01556420233463,
"grad_norm": 2.2660224437713623,
"learning_rate": 9.720076948184522e-05,
"loss": 0.4709,
"step": 3096
},
{
"epoch": 4.046692607003891,
"grad_norm": 1.573203444480896,
"learning_rate": 9.714506986438134e-05,
"loss": 0.4762,
"step": 3120
},
{
"epoch": 4.0778210116731515,
"grad_norm": 1.9054023027420044,
"learning_rate": 9.70888378111271e-05,
"loss": 0.4796,
"step": 3144
},
{
"epoch": 4.108949416342412,
"grad_norm": 2.2776753902435303,
"learning_rate": 9.703207395714274e-05,
"loss": 0.4705,
"step": 3168
},
{
"epoch": 4.1400778210116735,
"grad_norm": 1.614623785018921,
"learning_rate": 9.697477894349438e-05,
"loss": 0.4713,
"step": 3192
},
{
"epoch": 4.171206225680934,
"grad_norm": 2.478569269180298,
"learning_rate": 9.691695341724681e-05,
"loss": 0.4719,
"step": 3216
},
{
"epoch": 4.202334630350195,
"grad_norm": 1.3797364234924316,
"learning_rate": 9.685859803145625e-05,
"loss": 0.4663,
"step": 3240
},
{
"epoch": 4.233463035019455,
"grad_norm": 2.49601674079895,
"learning_rate": 9.679971344516288e-05,
"loss": 0.4827,
"step": 3264
},
{
"epoch": 4.264591439688716,
"grad_norm": 1.5913656949996948,
"learning_rate": 9.674030032338346e-05,
"loss": 0.4869,
"step": 3288
},
{
"epoch": 4.2957198443579765,
"grad_norm": 1.5114320516586304,
"learning_rate": 9.668035933710378e-05,
"loss": 0.4794,
"step": 3312
},
{
"epoch": 4.326848249027237,
"grad_norm": 1.905714750289917,
"learning_rate": 9.661989116327112e-05,
"loss": 0.4702,
"step": 3336
},
{
"epoch": 4.357976653696498,
"grad_norm": 1.6932348012924194,
"learning_rate": 9.655889648478657e-05,
"loss": 0.4693,
"step": 3360
},
{
"epoch": 4.389105058365759,
"grad_norm": 1.9976513385772705,
"learning_rate": 9.649737599049736e-05,
"loss": 0.4705,
"step": 3384
},
{
"epoch": 4.42023346303502,
"grad_norm": 1.4826905727386475,
"learning_rate": 9.643533037518899e-05,
"loss": 0.4697,
"step": 3408
},
{
"epoch": 4.45136186770428,
"grad_norm": 1.617922306060791,
"learning_rate": 9.637276033957755e-05,
"loss": 0.4684,
"step": 3432
},
{
"epoch": 4.482490272373541,
"grad_norm": 2.4124162197113037,
"learning_rate": 9.630966659030158e-05,
"loss": 0.462,
"step": 3456
},
{
"epoch": 4.5136186770428015,
"grad_norm": 1.8999947309494019,
"learning_rate": 9.624604983991434e-05,
"loss": 0.4614,
"step": 3480
},
{
"epoch": 4.544747081712062,
"grad_norm": 2.2038631439208984,
"learning_rate": 9.618191080687552e-05,
"loss": 0.473,
"step": 3504
},
{
"epoch": 4.575875486381323,
"grad_norm": 1.5659903287887573,
"learning_rate": 9.611725021554333e-05,
"loss": 0.4632,
"step": 3528
},
{
"epoch": 4.607003891050583,
"grad_norm": 2.38783597946167,
"learning_rate": 9.605206879616617e-05,
"loss": 0.4547,
"step": 3552
},
{
"epoch": 4.638132295719844,
"grad_norm": 1.5512051582336426,
"learning_rate": 9.59863672848745e-05,
"loss": 0.4623,
"step": 3576
},
{
"epoch": 4.669260700389105,
"grad_norm": 3.2371737957000732,
"learning_rate": 9.592014642367243e-05,
"loss": 0.4635,
"step": 3600
},
{
"epoch": 4.700389105058366,
"grad_norm": 1.7594435214996338,
"learning_rate": 9.585340696042935e-05,
"loss": 0.4674,
"step": 3624
},
{
"epoch": 4.7315175097276265,
"grad_norm": 1.3836287260055542,
"learning_rate": 9.57861496488716e-05,
"loss": 0.4611,
"step": 3648
},
{
"epoch": 4.762645914396887,
"grad_norm": 1.7907147407531738,
"learning_rate": 9.571837524857384e-05,
"loss": 0.4609,
"step": 3672
},
{
"epoch": 4.793774319066148,
"grad_norm": 1.7246521711349487,
"learning_rate": 9.565008452495046e-05,
"loss": 0.4588,
"step": 3696
},
{
"epoch": 4.824902723735408,
"grad_norm": 2.1095101833343506,
"learning_rate": 9.558127824924701e-05,
"loss": 0.4623,
"step": 3720
},
{
"epoch": 4.856031128404669,
"grad_norm": 1.1277464628219604,
"learning_rate": 9.551195719853147e-05,
"loss": 0.4568,
"step": 3744
},
{
"epoch": 4.88715953307393,
"grad_norm": 1.2232158184051514,
"learning_rate": 9.544212215568547e-05,
"loss": 0.459,
"step": 3768
},
{
"epoch": 4.918287937743191,
"grad_norm": 1.9220589399337769,
"learning_rate": 9.53717739093954e-05,
"loss": 0.4539,
"step": 3792
},
{
"epoch": 4.9494163424124515,
"grad_norm": 1.6076886653900146,
"learning_rate": 9.530091325414359e-05,
"loss": 0.4583,
"step": 3816
},
{
"epoch": 4.980544747081712,
"grad_norm": 1.2246028184890747,
"learning_rate": 9.522954099019927e-05,
"loss": 0.4567,
"step": 3840
},
{
"epoch": 5.011673151750973,
"grad_norm": 1.4004205465316772,
"learning_rate": 9.515765792360955e-05,
"loss": 0.4535,
"step": 3864
},
{
"epoch": 5.042801556420233,
"grad_norm": 1.30203378200531,
"learning_rate": 9.508526486619036e-05,
"loss": 0.452,
"step": 3888
},
{
"epoch": 5.073929961089494,
"grad_norm": 1.538682222366333,
"learning_rate": 9.501236263551719e-05,
"loss": 0.4511,
"step": 3912
},
{
"epoch": 5.1050583657587545,
"grad_norm": 1.3054372072219849,
"learning_rate": 9.493895205491595e-05,
"loss": 0.4489,
"step": 3936
},
{
"epoch": 5.136186770428016,
"grad_norm": 1.4001922607421875,
"learning_rate": 9.486503395345358e-05,
"loss": 0.4577,
"step": 3960
},
{
"epoch": 5.167315175097277,
"grad_norm": 1.3580487966537476,
"learning_rate": 9.47906091659288e-05,
"loss": 0.4519,
"step": 3984
},
{
"epoch": 5.198443579766537,
"grad_norm": 1.929853081703186,
"learning_rate": 9.47156785328626e-05,
"loss": 0.4562,
"step": 4008
},
{
"epoch": 5.229571984435798,
"grad_norm": 1.9883568286895752,
"learning_rate": 9.464024290048879e-05,
"loss": 0.4573,
"step": 4032
},
{
"epoch": 5.260700389105058,
"grad_norm": 1.7795013189315796,
"learning_rate": 9.456430312074432e-05,
"loss": 0.4513,
"step": 4056
},
{
"epoch": 5.291828793774319,
"grad_norm": 1.1019718647003174,
"learning_rate": 9.44878600512599e-05,
"loss": 0.4475,
"step": 4080
},
{
"epoch": 5.3229571984435795,
"grad_norm": 1.604556918144226,
"learning_rate": 9.441091455535007e-05,
"loss": 0.4466,
"step": 4104
},
{
"epoch": 5.35408560311284,
"grad_norm": 1.8707294464111328,
"learning_rate": 9.433346750200363e-05,
"loss": 0.4501,
"step": 4128
},
{
"epoch": 5.385214007782102,
"grad_norm": 1.6021867990493774,
"learning_rate": 9.425551976587366e-05,
"loss": 0.4443,
"step": 4152
},
{
"epoch": 5.416342412451362,
"grad_norm": 1.7186486721038818,
"learning_rate": 9.417707222726784e-05,
"loss": 0.4374,
"step": 4176
},
{
"epoch": 5.447470817120623,
"grad_norm": 2.0640745162963867,
"learning_rate": 9.409812577213833e-05,
"loss": 0.4468,
"step": 4200
},
{
"epoch": 5.478599221789883,
"grad_norm": 2.1669087409973145,
"learning_rate": 9.401868129207181e-05,
"loss": 0.4501,
"step": 4224
},
{
"epoch": 5.509727626459144,
"grad_norm": 2.237527847290039,
"learning_rate": 9.393873968427953e-05,
"loss": 0.4469,
"step": 4248
},
{
"epoch": 5.5408560311284045,
"grad_norm": 1.5120989084243774,
"learning_rate": 9.385830185158701e-05,
"loss": 0.4425,
"step": 4272
},
{
"epoch": 5.571984435797665,
"grad_norm": 2.029425621032715,
"learning_rate": 9.377736870242393e-05,
"loss": 0.4509,
"step": 4296
},
{
"epoch": 5.603112840466926,
"grad_norm": 1.609480857849121,
"learning_rate": 9.369594115081386e-05,
"loss": 0.4528,
"step": 4320
},
{
"epoch": 5.634241245136186,
"grad_norm": 1.126060128211975,
"learning_rate": 9.361402011636395e-05,
"loss": 0.4435,
"step": 4344
},
{
"epoch": 5.665369649805448,
"grad_norm": 3.637361526489258,
"learning_rate": 9.353160652425452e-05,
"loss": 0.4466,
"step": 4368
},
{
"epoch": 5.696498054474708,
"grad_norm": 3.3293521404266357,
"learning_rate": 9.344870130522863e-05,
"loss": 0.4495,
"step": 4392
},
{
"epoch": 5.727626459143969,
"grad_norm": 1.1623419523239136,
"learning_rate": 9.33653053955815e-05,
"loss": 0.4362,
"step": 4416
},
{
"epoch": 5.7587548638132295,
"grad_norm": 1.3908815383911133,
"learning_rate": 9.328141973715008e-05,
"loss": 0.445,
"step": 4440
},
{
"epoch": 5.78988326848249,
"grad_norm": 1.1905103921890259,
"learning_rate": 9.31970452773023e-05,
"loss": 0.4399,
"step": 4464
},
{
"epoch": 5.821011673151751,
"grad_norm": 1.7141236066818237,
"learning_rate": 9.311218296892636e-05,
"loss": 0.4396,
"step": 4488
},
{
"epoch": 5.852140077821011,
"grad_norm": 1.5528429746627808,
"learning_rate": 9.302683377042007e-05,
"loss": 0.4369,
"step": 4512
},
{
"epoch": 5.883268482490273,
"grad_norm": 1.206060528755188,
"learning_rate": 9.29409986456799e-05,
"loss": 0.443,
"step": 4536
},
{
"epoch": 5.914396887159533,
"grad_norm": 1.2948627471923828,
"learning_rate": 9.285467856409023e-05,
"loss": 0.4421,
"step": 4560
},
{
"epoch": 5.945525291828794,
"grad_norm": 1.6690411567687988,
"learning_rate": 9.276787450051225e-05,
"loss": 0.4393,
"step": 4584
},
{
"epoch": 5.976653696498055,
"grad_norm": 1.4727965593338013,
"learning_rate": 9.26805874352731e-05,
"loss": 0.443,
"step": 4608
},
{
"epoch": 6.007782101167315,
"grad_norm": 2.1878299713134766,
"learning_rate": 9.25928183541547e-05,
"loss": 0.4359,
"step": 4632
},
{
"epoch": 6.038910505836576,
"grad_norm": 1.5079774856567383,
"learning_rate": 9.250456824838263e-05,
"loss": 0.438,
"step": 4656
},
{
"epoch": 6.070038910505836,
"grad_norm": 1.5700092315673828,
"learning_rate": 9.241583811461498e-05,
"loss": 0.4355,
"step": 4680
},
{
"epoch": 6.101167315175097,
"grad_norm": 1.0717116594314575,
"learning_rate": 9.232662895493107e-05,
"loss": 0.4337,
"step": 4704
},
{
"epoch": 6.132295719844358,
"grad_norm": 1.775414228439331,
"learning_rate": 9.223694177682009e-05,
"loss": 0.4398,
"step": 4728
},
{
"epoch": 6.163424124513619,
"grad_norm": 3.057781457901001,
"learning_rate": 9.214677759316982e-05,
"loss": 0.4367,
"step": 4752
},
{
"epoch": 6.19455252918288,
"grad_norm": 1.2848880290985107,
"learning_rate": 9.205613742225507e-05,
"loss": 0.433,
"step": 4776
},
{
"epoch": 6.22568093385214,
"grad_norm": 1.5465294122695923,
"learning_rate": 9.196502228772626e-05,
"loss": 0.442,
"step": 4800
},
{
"epoch": 6.256809338521401,
"grad_norm": 1.1864486932754517,
"learning_rate": 9.18734332185979e-05,
"loss": 0.4356,
"step": 4824
},
{
"epoch": 6.287937743190661,
"grad_norm": 1.6817840337753296,
"learning_rate": 9.17813712492368e-05,
"loss": 0.4386,
"step": 4848
},
{
"epoch": 6.319066147859922,
"grad_norm": 1.285474181175232,
"learning_rate": 9.16888374193506e-05,
"loss": 0.4306,
"step": 4872
},
{
"epoch": 6.3501945525291825,
"grad_norm": 1.5364230871200562,
"learning_rate": 9.159583277397587e-05,
"loss": 0.4333,
"step": 4896
},
{
"epoch": 6.381322957198444,
"grad_norm": 1.8164541721343994,
"learning_rate": 9.150235836346639e-05,
"loss": 0.4285,
"step": 4920
},
{
"epoch": 6.412451361867705,
"grad_norm": 1.5146026611328125,
"learning_rate": 9.140841524348125e-05,
"loss": 0.4354,
"step": 4944
},
{
"epoch": 6.443579766536965,
"grad_norm": 1.238393783569336,
"learning_rate": 9.131400447497294e-05,
"loss": 0.4257,
"step": 4968
},
{
"epoch": 6.474708171206226,
"grad_norm": 1.4109466075897217,
"learning_rate": 9.121912712417536e-05,
"loss": 0.43,
"step": 4992
},
{
"epoch": 6.505836575875486,
"grad_norm": 1.8265984058380127,
"learning_rate": 9.11237842625918e-05,
"loss": 0.4373,
"step": 5016
},
{
"epoch": 6.536964980544747,
"grad_norm": 1.5519527196884155,
"learning_rate": 9.102797696698284e-05,
"loss": 0.4347,
"step": 5040
},
{
"epoch": 6.5680933852140075,
"grad_norm": 1.314172387123108,
"learning_rate": 9.093170631935412e-05,
"loss": 0.4348,
"step": 5064
},
{
"epoch": 6.599221789883268,
"grad_norm": 1.7968671321868896,
"learning_rate": 9.083497340694425e-05,
"loss": 0.4379,
"step": 5088
},
{
"epoch": 6.630350194552529,
"grad_norm": 1.166242003440857,
"learning_rate": 9.073777932221239e-05,
"loss": 0.4313,
"step": 5112
},
{
"epoch": 6.66147859922179,
"grad_norm": 1.9698489904403687,
"learning_rate": 9.064012516282601e-05,
"loss": 0.441,
"step": 5136
},
{
"epoch": 6.692607003891051,
"grad_norm": 1.2938389778137207,
"learning_rate": 9.054201203164845e-05,
"loss": 0.4301,
"step": 5160
},
{
"epoch": 6.723735408560311,
"grad_norm": 5.220723628997803,
"learning_rate": 9.044344103672651e-05,
"loss": 0.4232,
"step": 5184
},
{
"epoch": 6.754863813229572,
"grad_norm": 1.7442070245742798,
"learning_rate": 9.034441329127783e-05,
"loss": 0.4343,
"step": 5208
},
{
"epoch": 6.785992217898833,
"grad_norm": 4.927098274230957,
"learning_rate": 9.024492991367848e-05,
"loss": 0.4279,
"step": 5232
},
{
"epoch": 6.817120622568093,
"grad_norm": 1.1979647874832153,
"learning_rate": 9.014499202745019e-05,
"loss": 0.4312,
"step": 5256
},
{
"epoch": 6.848249027237354,
"grad_norm": 1.6905076503753662,
"learning_rate": 9.004460076124768e-05,
"loss": 0.432,
"step": 5280
},
{
"epoch": 6.879377431906615,
"grad_norm": 1.388134241104126,
"learning_rate": 8.994375724884604e-05,
"loss": 0.4314,
"step": 5304
},
{
"epoch": 6.910505836575876,
"grad_norm": 2.4431025981903076,
"learning_rate": 8.984246262912774e-05,
"loss": 0.4341,
"step": 5328
},
{
"epoch": 6.941634241245136,
"grad_norm": 2.5521421432495117,
"learning_rate": 8.974071804606989e-05,
"loss": 0.4251,
"step": 5352
},
{
"epoch": 6.972762645914397,
"grad_norm": 1.6180981397628784,
"learning_rate": 8.96385246487313e-05,
"loss": 0.4332,
"step": 5376
},
{
"epoch": 7.003891050583658,
"grad_norm": 1.673168659210205,
"learning_rate": 8.95358835912395e-05,
"loss": 0.4258,
"step": 5400
},
{
"epoch": 7.035019455252918,
"grad_norm": 2.032773733139038,
"learning_rate": 8.943279603277767e-05,
"loss": 0.4337,
"step": 5424
},
{
"epoch": 7.066147859922179,
"grad_norm": 1.7290483713150024,
"learning_rate": 8.932926313757157e-05,
"loss": 0.4312,
"step": 5448
},
{
"epoch": 7.097276264591439,
"grad_norm": 4.685028076171875,
"learning_rate": 8.922528607487645e-05,
"loss": 0.4416,
"step": 5472
},
{
"epoch": 7.1284046692607,
"grad_norm": 1.5580335855484009,
"learning_rate": 8.912086601896372e-05,
"loss": 0.4358,
"step": 5496
},
{
"epoch": 7.159533073929961,
"grad_norm": 1.332607388496399,
"learning_rate": 8.901600414910785e-05,
"loss": 0.4288,
"step": 5520
},
{
"epoch": 7.190661478599222,
"grad_norm": 1.2149999141693115,
"learning_rate": 8.891070164957288e-05,
"loss": 0.4238,
"step": 5544
},
{
"epoch": 7.221789883268483,
"grad_norm": 1.4633874893188477,
"learning_rate": 8.880495970959917e-05,
"loss": 0.4278,
"step": 5568
},
{
"epoch": 7.252918287937743,
"grad_norm": 1.4801607131958008,
"learning_rate": 8.869877952338991e-05,
"loss": 0.4227,
"step": 5592
},
{
"epoch": 7.284046692607004,
"grad_norm": 1.8194708824157715,
"learning_rate": 8.85921622900977e-05,
"loss": 0.4192,
"step": 5616
},
{
"epoch": 7.315175097276264,
"grad_norm": 1.111076831817627,
"learning_rate": 8.848510921381089e-05,
"loss": 0.4231,
"step": 5640
},
{
"epoch": 7.346303501945525,
"grad_norm": 1.4320513010025024,
"learning_rate": 8.83776215035401e-05,
"loss": 0.4224,
"step": 5664
},
{
"epoch": 7.377431906614786,
"grad_norm": 1.80966317653656,
"learning_rate": 8.826970037320448e-05,
"loss": 0.4183,
"step": 5688
},
{
"epoch": 7.408560311284047,
"grad_norm": 1.843509554862976,
"learning_rate": 8.816134704161807e-05,
"loss": 0.417,
"step": 5712
},
{
"epoch": 7.439688715953308,
"grad_norm": 1.2015341520309448,
"learning_rate": 8.805256273247598e-05,
"loss": 0.4177,
"step": 5736
},
{
"epoch": 7.470817120622568,
"grad_norm": 1.6432462930679321,
"learning_rate": 8.794334867434059e-05,
"loss": 0.4236,
"step": 5760
},
{
"epoch": 7.501945525291829,
"grad_norm": 1.354224443435669,
"learning_rate": 8.783370610062769e-05,
"loss": 0.4142,
"step": 5784
},
{
"epoch": 7.533073929961089,
"grad_norm": 1.6838608980178833,
"learning_rate": 8.772363624959255e-05,
"loss": 0.4173,
"step": 5808
},
{
"epoch": 7.56420233463035,
"grad_norm": 1.8743314743041992,
"learning_rate": 8.761314036431588e-05,
"loss": 0.4248,
"step": 5832
},
{
"epoch": 7.595330739299611,
"grad_norm": 1.4311802387237549,
"learning_rate": 8.750221969268985e-05,
"loss": 0.4204,
"step": 5856
},
{
"epoch": 7.626459143968871,
"grad_norm": 1.4219359159469604,
"learning_rate": 8.739087548740404e-05,
"loss": 0.4201,
"step": 5880
},
{
"epoch": 7.657587548638133,
"grad_norm": 2.070533275604248,
"learning_rate": 8.727910900593114e-05,
"loss": 0.4229,
"step": 5904
},
{
"epoch": 7.688715953307393,
"grad_norm": 1.4531338214874268,
"learning_rate": 8.716692151051293e-05,
"loss": 0.42,
"step": 5928
},
{
"epoch": 7.719844357976654,
"grad_norm": 2.2621729373931885,
"learning_rate": 8.705431426814585e-05,
"loss": 0.4171,
"step": 5952
},
{
"epoch": 7.750972762645914,
"grad_norm": 1.242394208908081,
"learning_rate": 8.694128855056683e-05,
"loss": 0.4133,
"step": 5976
},
{
"epoch": 7.782101167315175,
"grad_norm": 1.2939616441726685,
"learning_rate": 8.68278456342389e-05,
"loss": 0.4185,
"step": 6000
},
{
"epoch": 7.813229571984436,
"grad_norm": 2.0788450241088867,
"learning_rate": 8.671398680033668e-05,
"loss": 0.4183,
"step": 6024
},
{
"epoch": 7.844357976653696,
"grad_norm": 2.538680076599121,
"learning_rate": 8.659971333473206e-05,
"loss": 0.4246,
"step": 6048
},
{
"epoch": 7.875486381322958,
"grad_norm": 2.1128950119018555,
"learning_rate": 8.648502652797954e-05,
"loss": 0.4156,
"step": 6072
},
{
"epoch": 7.906614785992218,
"grad_norm": 2.2612478733062744,
"learning_rate": 8.636992767530171e-05,
"loss": 0.409,
"step": 6096
},
{
"epoch": 7.937743190661479,
"grad_norm": 2.0751936435699463,
"learning_rate": 8.625441807657471e-05,
"loss": 0.4264,
"step": 6120
},
{
"epoch": 7.968871595330739,
"grad_norm": 2.009459972381592,
"learning_rate": 8.613849903631334e-05,
"loss": 0.4255,
"step": 6144
},
{
"epoch": 8.0,
"grad_norm": 1.8576405048370361,
"learning_rate": 8.602217186365655e-05,
"loss": 0.4211,
"step": 6168
},
{
"epoch": 8.03112840466926,
"grad_norm": 2.817073345184326,
"learning_rate": 8.590543787235252e-05,
"loss": 0.4156,
"step": 6192
},
{
"epoch": 8.062256809338521,
"grad_norm": 1.5011825561523438,
"learning_rate": 8.578829838074389e-05,
"loss": 0.41,
"step": 6216
},
{
"epoch": 8.093385214007782,
"grad_norm": 1.2293556928634644,
"learning_rate": 8.567075471175281e-05,
"loss": 0.417,
"step": 6240
},
{
"epoch": 8.124513618677042,
"grad_norm": 1.415345549583435,
"learning_rate": 8.555280819286603e-05,
"loss": 0.4148,
"step": 6264
},
{
"epoch": 8.155642023346303,
"grad_norm": 2.2379307746887207,
"learning_rate": 8.543446015611995e-05,
"loss": 0.4104,
"step": 6288
},
{
"epoch": 8.186770428015564,
"grad_norm": 1.0670602321624756,
"learning_rate": 8.531571193808549e-05,
"loss": 0.4131,
"step": 6312
},
{
"epoch": 8.217898832684824,
"grad_norm": 1.0915449857711792,
"learning_rate": 8.519656487985309e-05,
"loss": 0.4073,
"step": 6336
},
{
"epoch": 8.249027237354085,
"grad_norm": 1.4844944477081299,
"learning_rate": 8.507702032701748e-05,
"loss": 0.4109,
"step": 6360
},
{
"epoch": 8.280155642023347,
"grad_norm": 1.1173604726791382,
"learning_rate": 8.495707962966253e-05,
"loss": 0.4145,
"step": 6384
},
{
"epoch": 8.311284046692608,
"grad_norm": 1.5978012084960938,
"learning_rate": 8.4836744142346e-05,
"loss": 0.4108,
"step": 6408
},
{
"epoch": 8.342412451361868,
"grad_norm": 1.7912710905075073,
"learning_rate": 8.471601522408422e-05,
"loss": 0.4155,
"step": 6432
},
{
"epoch": 8.373540856031129,
"grad_norm": 2.182061195373535,
"learning_rate": 8.459489423833678e-05,
"loss": 0.4117,
"step": 6456
},
{
"epoch": 8.40466926070039,
"grad_norm": 1.8379067182540894,
"learning_rate": 8.447338255299106e-05,
"loss": 0.4104,
"step": 6480
},
{
"epoch": 8.43579766536965,
"grad_norm": 1.4474197626113892,
"learning_rate": 8.435148154034694e-05,
"loss": 0.4142,
"step": 6504
},
{
"epoch": 8.46692607003891,
"grad_norm": 2.309518575668335,
"learning_rate": 8.422919257710104e-05,
"loss": 0.4079,
"step": 6528
},
{
"epoch": 8.498054474708171,
"grad_norm": 1.2606794834136963,
"learning_rate": 8.410651704433146e-05,
"loss": 0.4125,
"step": 6552
},
{
"epoch": 8.529182879377432,
"grad_norm": 1.683693766593933,
"learning_rate": 8.398345632748194e-05,
"loss": 0.4132,
"step": 6576
},
{
"epoch": 8.560311284046692,
"grad_norm": 2.342796802520752,
"learning_rate": 8.386001181634642e-05,
"loss": 0.4125,
"step": 6600
},
{
"epoch": 8.591439688715953,
"grad_norm": 0.9687896370887756,
"learning_rate": 8.373618490505315e-05,
"loss": 0.4082,
"step": 6624
},
{
"epoch": 8.622568093385214,
"grad_norm": 1.2769346237182617,
"learning_rate": 8.361197699204911e-05,
"loss": 0.413,
"step": 6648
},
{
"epoch": 8.653696498054474,
"grad_norm": 1.4064596891403198,
"learning_rate": 8.348738948008413e-05,
"loss": 0.4172,
"step": 6672
},
{
"epoch": 8.684824902723735,
"grad_norm": 1.0059700012207031,
"learning_rate": 8.336242377619501e-05,
"loss": 0.4132,
"step": 6696
},
{
"epoch": 8.715953307392995,
"grad_norm": 1.5852705240249634,
"learning_rate": 8.323708129168979e-05,
"loss": 0.4129,
"step": 6720
},
{
"epoch": 8.747081712062258,
"grad_norm": 1.879469394683838,
"learning_rate": 8.31113634421316e-05,
"loss": 0.4104,
"step": 6744
},
{
"epoch": 8.778210116731518,
"grad_norm": 1.1461695432662964,
"learning_rate": 8.298527164732283e-05,
"loss": 0.4068,
"step": 6768
},
{
"epoch": 8.809338521400779,
"grad_norm": 1.1254854202270508,
"learning_rate": 8.285880733128907e-05,
"loss": 0.4118,
"step": 6792
},
{
"epoch": 8.84046692607004,
"grad_norm": 1.7840899229049683,
"learning_rate": 8.273197192226294e-05,
"loss": 0.4113,
"step": 6816
},
{
"epoch": 8.8715953307393,
"grad_norm": 1.618880271911621,
"learning_rate": 8.260476685266807e-05,
"loss": 0.4065,
"step": 6840
},
{
"epoch": 8.90272373540856,
"grad_norm": 1.2630411386489868,
"learning_rate": 8.247719355910284e-05,
"loss": 0.4029,
"step": 6864
},
{
"epoch": 8.933852140077821,
"grad_norm": 1.138664960861206,
"learning_rate": 8.234925348232421e-05,
"loss": 0.4012,
"step": 6888
},
{
"epoch": 8.964980544747082,
"grad_norm": 1.4435471296310425,
"learning_rate": 8.222094806723143e-05,
"loss": 0.4068,
"step": 6912
},
{
"epoch": 8.996108949416342,
"grad_norm": 1.9499974250793457,
"learning_rate": 8.209227876284972e-05,
"loss": 0.4092,
"step": 6936
},
{
"epoch": 9.027237354085603,
"grad_norm": 2.3621513843536377,
"learning_rate": 8.196324702231389e-05,
"loss": 0.4048,
"step": 6960
},
{
"epoch": 9.058365758754864,
"grad_norm": 1.2890691757202148,
"learning_rate": 8.183385430285197e-05,
"loss": 0.3996,
"step": 6984
},
{
"epoch": 9.089494163424124,
"grad_norm": 1.3257933855056763,
"learning_rate": 8.170410206576872e-05,
"loss": 0.3985,
"step": 7008
},
{
"epoch": 9.120622568093385,
"grad_norm": 1.485418677330017,
"learning_rate": 8.157399177642914e-05,
"loss": 0.3994,
"step": 7032
},
{
"epoch": 9.151750972762645,
"grad_norm": 1.115235686302185,
"learning_rate": 8.144352490424187e-05,
"loss": 0.3997,
"step": 7056
},
{
"epoch": 9.182879377431906,
"grad_norm": 1.565184473991394,
"learning_rate": 8.131270292264272e-05,
"loss": 0.4059,
"step": 7080
},
{
"epoch": 9.214007782101167,
"grad_norm": 1.3453902006149292,
"learning_rate": 8.118152730907788e-05,
"loss": 0.406,
"step": 7104
},
{
"epoch": 9.245136186770427,
"grad_norm": 1.4093341827392578,
"learning_rate": 8.104999954498734e-05,
"loss": 0.4029,
"step": 7128
},
{
"epoch": 9.27626459143969,
"grad_norm": 1.1250804662704468,
"learning_rate": 8.091812111578812e-05,
"loss": 0.4097,
"step": 7152
},
{
"epoch": 9.30739299610895,
"grad_norm": 1.6016291379928589,
"learning_rate": 8.07858935108575e-05,
"loss": 0.4078,
"step": 7176
},
{
"epoch": 9.33852140077821,
"grad_norm": 1.8599820137023926,
"learning_rate": 8.065331822351618e-05,
"loss": 0.4029,
"step": 7200
},
{
"epoch": 9.369649805447471,
"grad_norm": 1.2994579076766968,
"learning_rate": 8.052039675101143e-05,
"loss": 0.4079,
"step": 7224
},
{
"epoch": 9.400778210116732,
"grad_norm": 1.200239896774292,
"learning_rate": 8.038713059450026e-05,
"loss": 0.4017,
"step": 7248
},
{
"epoch": 9.431906614785992,
"grad_norm": 3.8246068954467773,
"learning_rate": 8.025352125903227e-05,
"loss": 0.4006,
"step": 7272
},
{
"epoch": 9.463035019455253,
"grad_norm": 1.4172035455703735,
"learning_rate": 8.011957025353287e-05,
"loss": 0.4028,
"step": 7296
},
{
"epoch": 9.494163424124514,
"grad_norm": 2.0654618740081787,
"learning_rate": 7.998527909078607e-05,
"loss": 0.4014,
"step": 7320
},
{
"epoch": 9.525291828793774,
"grad_norm": 1.3547816276550293,
"learning_rate": 7.985064928741754e-05,
"loss": 0.3981,
"step": 7344
},
{
"epoch": 9.556420233463035,
"grad_norm": 1.3812025785446167,
"learning_rate": 7.971568236387734e-05,
"loss": 0.406,
"step": 7368
},
{
"epoch": 9.587548638132295,
"grad_norm": 1.438240885734558,
"learning_rate": 7.958037984442285e-05,
"loss": 0.4011,
"step": 7392
},
{
"epoch": 9.618677042801556,
"grad_norm": 1.7840272188186646,
"learning_rate": 7.944474325710154e-05,
"loss": 0.401,
"step": 7416
},
{
"epoch": 9.649805447470817,
"grad_norm": 1.251658320426941,
"learning_rate": 7.930877413373367e-05,
"loss": 0.3969,
"step": 7440
},
{
"epoch": 9.680933852140077,
"grad_norm": 2.252761125564575,
"learning_rate": 7.917247400989505e-05,
"loss": 0.4049,
"step": 7464
},
{
"epoch": 9.712062256809338,
"grad_norm": 1.476012110710144,
"learning_rate": 7.903584442489958e-05,
"loss": 0.401,
"step": 7488
},
{
"epoch": 9.7431906614786,
"grad_norm": 2.692723035812378,
"learning_rate": 7.889888692178207e-05,
"loss": 0.4017,
"step": 7512
},
{
"epoch": 9.77431906614786,
"grad_norm": 3.0412638187408447,
"learning_rate": 7.87616030472806e-05,
"loss": 0.4093,
"step": 7536
},
{
"epoch": 9.805447470817121,
"grad_norm": 1.527076244354248,
"learning_rate": 7.862399435181917e-05,
"loss": 0.3988,
"step": 7560
},
{
"epoch": 9.836575875486382,
"grad_norm": 1.2038588523864746,
"learning_rate": 7.848606238949021e-05,
"loss": 0.4058,
"step": 7584
},
{
"epoch": 9.867704280155642,
"grad_norm": 1.9050565958023071,
"learning_rate": 7.834780871803693e-05,
"loss": 0.3943,
"step": 7608
},
{
"epoch": 9.898832684824903,
"grad_norm": 1.483185887336731,
"learning_rate": 7.82092348988358e-05,
"loss": 0.3992,
"step": 7632
},
{
"epoch": 9.929961089494164,
"grad_norm": 1.5043606758117676,
"learning_rate": 7.80703424968789e-05,
"loss": 0.3989,
"step": 7656
},
{
"epoch": 9.961089494163424,
"grad_norm": 1.194094181060791,
"learning_rate": 7.793113308075626e-05,
"loss": 0.4007,
"step": 7680
},
{
"epoch": 9.992217898832685,
"grad_norm": 1.5360095500946045,
"learning_rate": 7.77916082226381e-05,
"loss": 0.395,
"step": 7704
},
{
"epoch": 10.023346303501945,
"grad_norm": 1.1073459386825562,
"learning_rate": 7.76517694982571e-05,
"loss": 0.3989,
"step": 7728
},
{
"epoch": 10.054474708171206,
"grad_norm": 1.4059771299362183,
"learning_rate": 7.751161848689063e-05,
"loss": 0.3964,
"step": 7752
},
{
"epoch": 10.085603112840467,
"grad_norm": 1.8619714975357056,
"learning_rate": 7.737115677134294e-05,
"loss": 0.3964,
"step": 7776
},
{
"epoch": 10.116731517509727,
"grad_norm": 0.8621863722801208,
"learning_rate": 7.723038593792712e-05,
"loss": 0.4019,
"step": 7800
},
{
"epoch": 10.147859922178988,
"grad_norm": 1.542912483215332,
"learning_rate": 7.708930757644739e-05,
"loss": 0.3957,
"step": 7824
},
{
"epoch": 10.178988326848248,
"grad_norm": 1.8078597784042358,
"learning_rate": 7.694792328018106e-05,
"loss": 0.3991,
"step": 7848
},
{
"epoch": 10.210116731517509,
"grad_norm": 1.4210093021392822,
"learning_rate": 7.680623464586048e-05,
"loss": 0.3925,
"step": 7872
},
{
"epoch": 10.24124513618677,
"grad_norm": 1.6985816955566406,
"learning_rate": 7.66642432736551e-05,
"loss": 0.3984,
"step": 7896
},
{
"epoch": 10.272373540856032,
"grad_norm": 1.4291504621505737,
"learning_rate": 7.652195076715332e-05,
"loss": 0.4016,
"step": 7920
},
{
"epoch": 10.303501945525293,
"grad_norm": 1.3934870958328247,
"learning_rate": 7.637935873334448e-05,
"loss": 0.3992,
"step": 7944
},
{
"epoch": 10.334630350194553,
"grad_norm": 1.5841765403747559,
"learning_rate": 7.623646878260062e-05,
"loss": 0.3989,
"step": 7968
},
{
"epoch": 10.365758754863814,
"grad_norm": 1.1344020366668701,
"learning_rate": 7.60932825286583e-05,
"loss": 0.3934,
"step": 7992
},
{
"epoch": 10.396887159533074,
"grad_norm": 1.1252238750457764,
"learning_rate": 7.594980158860043e-05,
"loss": 0.3947,
"step": 8016
},
{
"epoch": 10.428015564202335,
"grad_norm": 1.5455870628356934,
"learning_rate": 7.580602758283796e-05,
"loss": 0.3897,
"step": 8040
},
{
"epoch": 10.459143968871595,
"grad_norm": 2.1351683139801025,
"learning_rate": 7.566196213509163e-05,
"loss": 0.3911,
"step": 8064
},
{
"epoch": 10.490272373540856,
"grad_norm": 1.9759098291397095,
"learning_rate": 7.551760687237351e-05,
"loss": 0.3973,
"step": 8088
},
{
"epoch": 10.521400778210117,
"grad_norm": 1.0132018327713013,
"learning_rate": 7.537296342496884e-05,
"loss": 0.3957,
"step": 8112
},
{
"epoch": 10.552529182879377,
"grad_norm": 2.219759464263916,
"learning_rate": 7.522803342641737e-05,
"loss": 0.3887,
"step": 8136
},
{
"epoch": 10.583657587548638,
"grad_norm": 2.361774206161499,
"learning_rate": 7.508281851349512e-05,
"loss": 0.3975,
"step": 8160
},
{
"epoch": 10.614785992217898,
"grad_norm": 1.4584128856658936,
"learning_rate": 7.493732032619578e-05,
"loss": 0.4,
"step": 8184
},
{
"epoch": 10.645914396887159,
"grad_norm": 1.375190019607544,
"learning_rate": 7.47915405077122e-05,
"loss": 0.4021,
"step": 8208
},
{
"epoch": 10.67704280155642,
"grad_norm": 1.5501540899276733,
"learning_rate": 7.464548070441785e-05,
"loss": 0.3943,
"step": 8232
},
{
"epoch": 10.70817120622568,
"grad_norm": 1.5805977582931519,
"learning_rate": 7.449914256584828e-05,
"loss": 0.3915,
"step": 8256
},
{
"epoch": 10.739299610894943,
"grad_norm": 1.0127402544021606,
"learning_rate": 7.435252774468237e-05,
"loss": 0.3899,
"step": 8280
},
{
"epoch": 10.770428015564203,
"grad_norm": 1.5114730596542358,
"learning_rate": 7.420563789672375e-05,
"loss": 0.3922,
"step": 8304
},
{
"epoch": 10.801556420233464,
"grad_norm": 1.1805211305618286,
"learning_rate": 7.405847468088209e-05,
"loss": 0.3951,
"step": 8328
},
{
"epoch": 10.832684824902724,
"grad_norm": 1.1337734460830688,
"learning_rate": 7.391103975915436e-05,
"loss": 0.3954,
"step": 8352
},
{
"epoch": 10.863813229571985,
"grad_norm": 1.024134874343872,
"learning_rate": 7.376333479660607e-05,
"loss": 0.3829,
"step": 8376
},
{
"epoch": 10.894941634241246,
"grad_norm": 1.2885181903839111,
"learning_rate": 7.361536146135243e-05,
"loss": 0.3904,
"step": 8400
},
{
"epoch": 10.926070038910506,
"grad_norm": 1.2240935564041138,
"learning_rate": 7.346712142453954e-05,
"loss": 0.3904,
"step": 8424
},
{
"epoch": 10.957198443579767,
"grad_norm": 1.2982319593429565,
"learning_rate": 7.33186163603255e-05,
"loss": 0.3944,
"step": 8448
},
{
"epoch": 10.988326848249027,
"grad_norm": 1.0359567403793335,
"learning_rate": 7.316984794586155e-05,
"loss": 0.3989,
"step": 8472
},
{
"epoch": 11.019455252918288,
"grad_norm": 2.0623931884765625,
"learning_rate": 7.302081786127304e-05,
"loss": 0.3853,
"step": 8496
},
{
"epoch": 11.050583657587548,
"grad_norm": 1.2377070188522339,
"learning_rate": 7.287152778964055e-05,
"loss": 0.3913,
"step": 8520
},
{
"epoch": 11.081712062256809,
"grad_norm": 1.016614556312561,
"learning_rate": 7.272197941698084e-05,
"loss": 0.3882,
"step": 8544
},
{
"epoch": 11.11284046692607,
"grad_norm": 1.5649337768554688,
"learning_rate": 7.257217443222777e-05,
"loss": 0.378,
"step": 8568
},
{
"epoch": 11.14396887159533,
"grad_norm": 1.4619653224945068,
"learning_rate": 7.242211452721331e-05,
"loss": 0.3874,
"step": 8592
},
{
"epoch": 11.17509727626459,
"grad_norm": 1.6870439052581787,
"learning_rate": 7.227180139664836e-05,
"loss": 0.3867,
"step": 8616
},
{
"epoch": 11.206225680933851,
"grad_norm": 1.0460180044174194,
"learning_rate": 7.212123673810363e-05,
"loss": 0.394,
"step": 8640
},
{
"epoch": 11.237354085603112,
"grad_norm": 1.0444591045379639,
"learning_rate": 7.19704222519905e-05,
"loss": 0.3877,
"step": 8664
},
{
"epoch": 11.268482490272374,
"grad_norm": 1.3924522399902344,
"learning_rate": 7.181935964154182e-05,
"loss": 0.3836,
"step": 8688
},
{
"epoch": 11.299610894941635,
"grad_norm": 2.0957131385803223,
"learning_rate": 7.166805061279257e-05,
"loss": 0.3879,
"step": 8712
},
{
"epoch": 11.330739299610896,
"grad_norm": 1.5147196054458618,
"learning_rate": 7.151649687456074e-05,
"loss": 0.3888,
"step": 8736
},
{
"epoch": 11.361867704280156,
"grad_norm": 1.5958192348480225,
"learning_rate": 7.136470013842791e-05,
"loss": 0.3883,
"step": 8760
},
{
"epoch": 11.392996108949417,
"grad_norm": 1.494354248046875,
"learning_rate": 7.121266211872004e-05,
"loss": 0.3847,
"step": 8784
},
{
"epoch": 11.424124513618677,
"grad_norm": 1.3116648197174072,
"learning_rate": 7.106038453248794e-05,
"loss": 0.3913,
"step": 8808
},
{
"epoch": 11.455252918287938,
"grad_norm": 2.947636842727661,
"learning_rate": 7.090786909948809e-05,
"loss": 0.3837,
"step": 8832
},
{
"epoch": 11.486381322957198,
"grad_norm": 1.8480781316757202,
"learning_rate": 7.075511754216304e-05,
"loss": 0.3816,
"step": 8856
},
{
"epoch": 11.517509727626459,
"grad_norm": 1.5083237886428833,
"learning_rate": 7.060213158562205e-05,
"loss": 0.3856,
"step": 8880
},
{
"epoch": 11.54863813229572,
"grad_norm": 1.2127504348754883,
"learning_rate": 7.044891295762154e-05,
"loss": 0.3861,
"step": 8904
},
{
"epoch": 11.57976653696498,
"grad_norm": 1.0090476274490356,
"learning_rate": 7.029546338854569e-05,
"loss": 0.3894,
"step": 8928
},
{
"epoch": 11.61089494163424,
"grad_norm": 0.9990460872650146,
"learning_rate": 7.014178461138676e-05,
"loss": 0.388,
"step": 8952
},
{
"epoch": 11.642023346303501,
"grad_norm": 1.7229726314544678,
"learning_rate": 6.998787836172564e-05,
"loss": 0.3883,
"step": 8976
},
{
"epoch": 11.673151750972762,
"grad_norm": 1.0046260356903076,
"learning_rate": 6.983374637771217e-05,
"loss": 0.3853,
"step": 9000
},
{
"epoch": 11.704280155642023,
"grad_norm": 1.4152393341064453,
"learning_rate": 6.967939040004551e-05,
"loss": 0.3829,
"step": 9024
},
{
"epoch": 11.735408560311285,
"grad_norm": 1.2723467350006104,
"learning_rate": 6.952481217195456e-05,
"loss": 0.3879,
"step": 9048
},
{
"epoch": 11.766536964980546,
"grad_norm": 1.7674216032028198,
"learning_rate": 6.937001343917818e-05,
"loss": 0.3909,
"step": 9072
},
{
"epoch": 11.797665369649806,
"grad_norm": 1.4604827165603638,
"learning_rate": 6.92149959499455e-05,
"loss": 0.3878,
"step": 9096
},
{
"epoch": 11.828793774319067,
"grad_norm": 1.5532753467559814,
"learning_rate": 6.905976145495628e-05,
"loss": 0.3884,
"step": 9120
},
{
"epoch": 11.859922178988327,
"grad_norm": 1.1423866748809814,
"learning_rate": 6.890431170736091e-05,
"loss": 0.3861,
"step": 9144
},
{
"epoch": 11.891050583657588,
"grad_norm": 1.350380778312683,
"learning_rate": 6.874864846274087e-05,
"loss": 0.3813,
"step": 9168
},
{
"epoch": 11.922178988326849,
"grad_norm": 1.2758312225341797,
"learning_rate": 6.85927734790887e-05,
"loss": 0.3877,
"step": 9192
},
{
"epoch": 11.95330739299611,
"grad_norm": 1.970986247062683,
"learning_rate": 6.843668851678831e-05,
"loss": 0.3828,
"step": 9216
},
{
"epoch": 11.98443579766537,
"grad_norm": 1.340889811515808,
"learning_rate": 6.828039533859489e-05,
"loss": 0.3875,
"step": 9240
},
{
"epoch": 12.01556420233463,
"grad_norm": 1.2335118055343628,
"learning_rate": 6.812389570961525e-05,
"loss": 0.3809,
"step": 9264
},
{
"epoch": 12.04669260700389,
"grad_norm": 1.2043426036834717,
"learning_rate": 6.796719139728777e-05,
"loss": 0.3835,
"step": 9288
},
{
"epoch": 12.077821011673151,
"grad_norm": 1.197809100151062,
"learning_rate": 6.781028417136231e-05,
"loss": 0.3792,
"step": 9312
},
{
"epoch": 12.108949416342412,
"grad_norm": 1.2524584531784058,
"learning_rate": 6.765317580388046e-05,
"loss": 0.3842,
"step": 9336
},
{
"epoch": 12.140077821011673,
"grad_norm": 1.082410454750061,
"learning_rate": 6.749586806915535e-05,
"loss": 0.3827,
"step": 9360
},
{
"epoch": 12.171206225680933,
"grad_norm": 1.2853772640228271,
"learning_rate": 6.733836274375176e-05,
"loss": 0.3755,
"step": 9384
},
{
"epoch": 12.202334630350194,
"grad_norm": 1.6849515438079834,
"learning_rate": 6.718066160646585e-05,
"loss": 0.38,
"step": 9408
},
{
"epoch": 12.233463035019454,
"grad_norm": 2.0715172290802,
"learning_rate": 6.702276643830531e-05,
"loss": 0.3799,
"step": 9432
},
{
"epoch": 12.264591439688717,
"grad_norm": 1.7511128187179565,
"learning_rate": 6.686467902246909e-05,
"loss": 0.3752,
"step": 9456
},
{
"epoch": 12.295719844357977,
"grad_norm": 1.1407638788223267,
"learning_rate": 6.670640114432724e-05,
"loss": 0.3834,
"step": 9480
},
{
"epoch": 12.326848249027238,
"grad_norm": 1.0695194005966187,
"learning_rate": 6.654793459140089e-05,
"loss": 0.3835,
"step": 9504
},
{
"epoch": 12.357976653696499,
"grad_norm": 1.285834789276123,
"learning_rate": 6.638928115334196e-05,
"loss": 0.3904,
"step": 9528
},
{
"epoch": 12.38910505836576,
"grad_norm": 1.508699893951416,
"learning_rate": 6.623044262191293e-05,
"loss": 0.3964,
"step": 9552
},
{
"epoch": 12.42023346303502,
"grad_norm": 1.287642002105713,
"learning_rate": 6.607142079096668e-05,
"loss": 0.3819,
"step": 9576
},
{
"epoch": 12.45136186770428,
"grad_norm": 2.893951892852783,
"learning_rate": 6.591221745642621e-05,
"loss": 0.3805,
"step": 9600
},
{
"epoch": 12.482490272373541,
"grad_norm": 1.4402974843978882,
"learning_rate": 6.575283441626433e-05,
"loss": 0.376,
"step": 9624
},
{
"epoch": 12.513618677042802,
"grad_norm": 1.156258225440979,
"learning_rate": 6.559327347048331e-05,
"loss": 0.3778,
"step": 9648
},
{
"epoch": 12.544747081712062,
"grad_norm": 1.5183446407318115,
"learning_rate": 6.543353642109469e-05,
"loss": 0.382,
"step": 9672
},
{
"epoch": 12.575875486381323,
"grad_norm": 1.611879825592041,
"learning_rate": 6.527362507209879e-05,
"loss": 0.3791,
"step": 9696
},
{
"epoch": 12.607003891050583,
"grad_norm": 1.3625446557998657,
"learning_rate": 6.511354122946443e-05,
"loss": 0.379,
"step": 9720
},
{
"epoch": 12.638132295719844,
"grad_norm": 1.2298206090927124,
"learning_rate": 6.495328670110848e-05,
"loss": 0.3773,
"step": 9744
},
{
"epoch": 12.669260700389104,
"grad_norm": 1.0427093505859375,
"learning_rate": 6.479286329687543e-05,
"loss": 0.3752,
"step": 9768
},
{
"epoch": 12.700389105058365,
"grad_norm": 1.6555167436599731,
"learning_rate": 6.463227282851708e-05,
"loss": 0.3771,
"step": 9792
},
{
"epoch": 12.731517509727626,
"grad_norm": 1.3086024522781372,
"learning_rate": 6.447151710967187e-05,
"loss": 0.377,
"step": 9816
},
{
"epoch": 12.762645914396888,
"grad_norm": 1.3003504276275635,
"learning_rate": 6.431059795584453e-05,
"loss": 0.3812,
"step": 9840
},
{
"epoch": 12.793774319066149,
"grad_norm": 1.4847590923309326,
"learning_rate": 6.414951718438561e-05,
"loss": 0.3778,
"step": 9864
},
{
"epoch": 12.82490272373541,
"grad_norm": 1.3426965475082397,
"learning_rate": 6.398827661447084e-05,
"loss": 0.3794,
"step": 9888
},
{
"epoch": 12.85603112840467,
"grad_norm": 1.2530086040496826,
"learning_rate": 6.382687806708067e-05,
"loss": 0.3728,
"step": 9912
},
{
"epoch": 12.88715953307393,
"grad_norm": 1.8029588460922241,
"learning_rate": 6.366532336497968e-05,
"loss": 0.3795,
"step": 9936
},
{
"epoch": 12.918287937743191,
"grad_norm": 1.9585580825805664,
"learning_rate": 6.350361433269599e-05,
"loss": 0.3769,
"step": 9960
},
{
"epoch": 12.949416342412452,
"grad_norm": 1.7418956756591797,
"learning_rate": 6.334175279650062e-05,
"loss": 0.3778,
"step": 9984
},
{
"epoch": 12.980544747081712,
"grad_norm": 1.6264042854309082,
"learning_rate": 6.317974058438697e-05,
"loss": 0.3821,
"step": 10008
},
{
"epoch": 13.011673151750973,
"grad_norm": 0.9489176869392395,
"learning_rate": 6.301757952605007e-05,
"loss": 0.374,
"step": 10032
},
{
"epoch": 13.042801556420233,
"grad_norm": 2.183706045150757,
"learning_rate": 6.285527145286594e-05,
"loss": 0.3736,
"step": 10056
},
{
"epoch": 13.073929961089494,
"grad_norm": 1.3998112678527832,
"learning_rate": 6.269281819787095e-05,
"loss": 0.3726,
"step": 10080
},
{
"epoch": 13.105058365758754,
"grad_norm": 1.5030006170272827,
"learning_rate": 6.253022159574108e-05,
"loss": 0.3741,
"step": 10104
},
{
"epoch": 13.136186770428015,
"grad_norm": 2.579502820968628,
"learning_rate": 6.23674834827712e-05,
"loss": 0.373,
"step": 10128
},
{
"epoch": 13.167315175097276,
"grad_norm": 1.5349212884902954,
"learning_rate": 6.220460569685437e-05,
"loss": 0.3739,
"step": 10152
},
{
"epoch": 13.198443579766536,
"grad_norm": 1.6323474645614624,
"learning_rate": 6.204159007746103e-05,
"loss": 0.3729,
"step": 10176
},
{
"epoch": 13.229571984435797,
"grad_norm": 1.1729427576065063,
"learning_rate": 6.187843846561824e-05,
"loss": 0.3759,
"step": 10200
},
{
"epoch": 13.26070038910506,
"grad_norm": 2.276395320892334,
"learning_rate": 6.171515270388892e-05,
"loss": 0.3657,
"step": 10224
},
{
"epoch": 13.29182879377432,
"grad_norm": 0.9925207495689392,
"learning_rate": 6.155173463635103e-05,
"loss": 0.3724,
"step": 10248
},
{
"epoch": 13.32295719844358,
"grad_norm": 0.9079545140266418,
"learning_rate": 6.13881861085767e-05,
"loss": 0.3675,
"step": 10272
},
{
"epoch": 13.354085603112841,
"grad_norm": 2.5486135482788086,
"learning_rate": 6.122450896761147e-05,
"loss": 0.3684,
"step": 10296
},
{
"epoch": 13.385214007782102,
"grad_norm": 1.5650309324264526,
"learning_rate": 6.106070506195332e-05,
"loss": 0.3765,
"step": 10320
},
{
"epoch": 13.416342412451362,
"grad_norm": 0.9130122065544128,
"learning_rate": 6.0896776241531916e-05,
"loss": 0.3788,
"step": 10344
},
{
"epoch": 13.447470817120623,
"grad_norm": 1.1227184534072876,
"learning_rate": 6.073272435768761e-05,
"loss": 0.3717,
"step": 10368
},
{
"epoch": 13.478599221789883,
"grad_norm": 2.312488079071045,
"learning_rate": 6.0568551263150606e-05,
"loss": 0.3775,
"step": 10392
},
{
"epoch": 13.509727626459144,
"grad_norm": 1.1797654628753662,
"learning_rate": 6.040425881201998e-05,
"loss": 0.3721,
"step": 10416
},
{
"epoch": 13.540856031128405,
"grad_norm": 3.0446395874023438,
"learning_rate": 6.0239848859742795e-05,
"loss": 0.3698,
"step": 10440
},
{
"epoch": 13.571984435797665,
"grad_norm": 1.0386089086532593,
"learning_rate": 6.007532326309313e-05,
"loss": 0.3724,
"step": 10464
},
{
"epoch": 13.603112840466926,
"grad_norm": 1.4335585832595825,
"learning_rate": 5.9910683880151064e-05,
"loss": 0.3749,
"step": 10488
},
{
"epoch": 13.634241245136186,
"grad_norm": 1.4243568181991577,
"learning_rate": 5.974593257028176e-05,
"loss": 0.3714,
"step": 10512
},
{
"epoch": 13.665369649805447,
"grad_norm": 1.3887135982513428,
"learning_rate": 5.958107119411441e-05,
"loss": 0.3763,
"step": 10536
},
{
"epoch": 13.696498054474707,
"grad_norm": 1.4939093589782715,
"learning_rate": 5.941610161352128e-05,
"loss": 0.3689,
"step": 10560
},
{
"epoch": 13.727626459143968,
"grad_norm": 1.3950523138046265,
"learning_rate": 5.925102569159661e-05,
"loss": 0.3721,
"step": 10584
},
{
"epoch": 13.75875486381323,
"grad_norm": 1.5457286834716797,
"learning_rate": 5.9085845292635645e-05,
"loss": 0.3736,
"step": 10608
},
{
"epoch": 13.789883268482491,
"grad_norm": 1.7134722471237183,
"learning_rate": 5.8920562282113534e-05,
"loss": 0.3705,
"step": 10632
},
{
"epoch": 13.821011673151752,
"grad_norm": 1.9264869689941406,
"learning_rate": 5.875517852666428e-05,
"loss": 0.3731,
"step": 10656
},
{
"epoch": 13.852140077821012,
"grad_norm": 1.9957599639892578,
"learning_rate": 5.8589695894059626e-05,
"loss": 0.3727,
"step": 10680
},
{
"epoch": 13.883268482490273,
"grad_norm": 1.0721269845962524,
"learning_rate": 5.842411625318805e-05,
"loss": 0.3717,
"step": 10704
},
{
"epoch": 13.914396887159533,
"grad_norm": 1.339650273323059,
"learning_rate": 5.825844147403353e-05,
"loss": 0.3781,
"step": 10728
},
{
"epoch": 13.945525291828794,
"grad_norm": 1.0256425142288208,
"learning_rate": 5.809267342765456e-05,
"loss": 0.3743,
"step": 10752
},
{
"epoch": 13.976653696498055,
"grad_norm": 1.1623256206512451,
"learning_rate": 5.792681398616293e-05,
"loss": 0.372,
"step": 10776
},
{
"epoch": 14.007782101167315,
"grad_norm": 2.1772332191467285,
"learning_rate": 5.776086502270258e-05,
"loss": 0.3768,
"step": 10800
},
{
"epoch": 14.038910505836576,
"grad_norm": 1.4126263856887817,
"learning_rate": 5.759482841142848e-05,
"loss": 0.3689,
"step": 10824
},
{
"epoch": 14.070038910505836,
"grad_norm": 1.1903387308120728,
"learning_rate": 5.742870602748547e-05,
"loss": 0.3667,
"step": 10848
},
{
"epoch": 14.101167315175097,
"grad_norm": 1.1915792226791382,
"learning_rate": 5.7262499746987094e-05,
"loss": 0.372,
"step": 10872
},
{
"epoch": 14.132295719844358,
"grad_norm": 1.3118023872375488,
"learning_rate": 5.7096211446994344e-05,
"loss": 0.3673,
"step": 10896
},
{
"epoch": 14.163424124513618,
"grad_norm": 1.0034823417663574,
"learning_rate": 5.692984300549451e-05,
"loss": 0.3743,
"step": 10920
},
{
"epoch": 14.194552529182879,
"grad_norm": 1.1173166036605835,
"learning_rate": 5.6763396301379976e-05,
"loss": 0.3722,
"step": 10944
},
{
"epoch": 14.22568093385214,
"grad_norm": 1.1479343175888062,
"learning_rate": 5.659687321442701e-05,
"loss": 0.3691,
"step": 10968
},
{
"epoch": 14.2568093385214,
"grad_norm": 1.3507132530212402,
"learning_rate": 5.6430275625274456e-05,
"loss": 0.3655,
"step": 10992
},
{
"epoch": 14.287937743190662,
"grad_norm": 1.1012446880340576,
"learning_rate": 5.626360541540261e-05,
"loss": 0.366,
"step": 11016
},
{
"epoch": 14.319066147859923,
"grad_norm": 1.2122224569320679,
"learning_rate": 5.609686446711191e-05,
"loss": 0.3608,
"step": 11040
},
{
"epoch": 14.350194552529183,
"grad_norm": 0.9675916433334351,
"learning_rate": 5.593005466350164e-05,
"loss": 0.3677,
"step": 11064
},
{
"epoch": 14.381322957198444,
"grad_norm": 1.0538902282714844,
"learning_rate": 5.576317788844875e-05,
"loss": 0.369,
"step": 11088
},
{
"epoch": 14.412451361867705,
"grad_norm": 2.077829122543335,
"learning_rate": 5.55962360265865e-05,
"loss": 0.3642,
"step": 11112
},
{
"epoch": 14.443579766536965,
"grad_norm": 1.2885998487472534,
"learning_rate": 5.542923096328325e-05,
"loss": 0.3685,
"step": 11136
},
{
"epoch": 14.474708171206226,
"grad_norm": 2.953463077545166,
"learning_rate": 5.526216458462111e-05,
"loss": 0.3683,
"step": 11160
},
{
"epoch": 14.505836575875486,
"grad_norm": 1.336449384689331,
"learning_rate": 5.509503877737465e-05,
"loss": 0.3627,
"step": 11184
},
{
"epoch": 14.536964980544747,
"grad_norm": 4.623841762542725,
"learning_rate": 5.4927855428989624e-05,
"loss": 0.3738,
"step": 11208
},
{
"epoch": 14.568093385214008,
"grad_norm": 1.4652122259140015,
"learning_rate": 5.476061642756161e-05,
"loss": 0.3722,
"step": 11232
},
{
"epoch": 14.599221789883268,
"grad_norm": 1.3524249792099,
"learning_rate": 5.4593323661814686e-05,
"loss": 0.3586,
"step": 11256
},
{
"epoch": 14.630350194552529,
"grad_norm": 1.833708643913269,
"learning_rate": 5.442597902108019e-05,
"loss": 0.3568,
"step": 11280
},
{
"epoch": 14.66147859922179,
"grad_norm": 1.4893455505371094,
"learning_rate": 5.425858439527525e-05,
"loss": 0.3698,
"step": 11304
},
{
"epoch": 14.69260700389105,
"grad_norm": 1.7463867664337158,
"learning_rate": 5.409114167488152e-05,
"loss": 0.3726,
"step": 11328
},
{
"epoch": 14.72373540856031,
"grad_norm": 1.5364842414855957,
"learning_rate": 5.392365275092383e-05,
"loss": 0.3656,
"step": 11352
},
{
"epoch": 14.754863813229573,
"grad_norm": 1.4161092042922974,
"learning_rate": 5.37561195149488e-05,
"loss": 0.3636,
"step": 11376
},
{
"epoch": 14.785992217898833,
"grad_norm": 1.125667691230774,
"learning_rate": 5.358854385900348e-05,
"loss": 0.3636,
"step": 11400
},
{
"epoch": 14.817120622568094,
"grad_norm": 1.9482998847961426,
"learning_rate": 5.342092767561402e-05,
"loss": 0.3646,
"step": 11424
},
{
"epoch": 14.848249027237355,
"grad_norm": 1.8707369565963745,
"learning_rate": 5.325327285776425e-05,
"loss": 0.3657,
"step": 11448
},
{
"epoch": 14.879377431906615,
"grad_norm": 1.7567267417907715,
"learning_rate": 5.308558129887431e-05,
"loss": 0.3628,
"step": 11472
},
{
"epoch": 14.910505836575876,
"grad_norm": 1.5714308023452759,
"learning_rate": 5.2917854892779304e-05,
"loss": 0.3667,
"step": 11496
},
{
"epoch": 14.941634241245136,
"grad_norm": 2.1905322074890137,
"learning_rate": 5.275009553370788e-05,
"loss": 0.371,
"step": 11520
},
{
"epoch": 14.972762645914397,
"grad_norm": 2.8119211196899414,
"learning_rate": 5.2582305116260835e-05,
"loss": 0.3704,
"step": 11544
},
{
"epoch": 15.003891050583658,
"grad_norm": 1.1872552633285522,
"learning_rate": 5.241448553538968e-05,
"loss": 0.3755,
"step": 11568
},
{
"epoch": 15.035019455252918,
"grad_norm": 1.4244314432144165,
"learning_rate": 5.224663868637538e-05,
"loss": 0.3599,
"step": 11592
},
{
"epoch": 15.066147859922179,
"grad_norm": 1.2808740139007568,
"learning_rate": 5.2078766464806796e-05,
"loss": 0.3683,
"step": 11616
},
{
"epoch": 15.09727626459144,
"grad_norm": 1.0528135299682617,
"learning_rate": 5.191087076655935e-05,
"loss": 0.3598,
"step": 11640
},
{
"epoch": 15.1284046692607,
"grad_norm": 1.8377207517623901,
"learning_rate": 5.174295348777357e-05,
"loss": 0.3553,
"step": 11664
},
{
"epoch": 15.15953307392996,
"grad_norm": 1.7853907346725464,
"learning_rate": 5.1575016524833754e-05,
"loss": 0.3614,
"step": 11688
},
{
"epoch": 15.190661478599221,
"grad_norm": 1.7978260517120361,
"learning_rate": 5.140706177434645e-05,
"loss": 0.3608,
"step": 11712
},
{
"epoch": 15.221789883268482,
"grad_norm": 1.1315481662750244,
"learning_rate": 5.123909113311915e-05,
"loss": 0.3635,
"step": 11736
},
{
"epoch": 15.252918287937742,
"grad_norm": 1.6177383661270142,
"learning_rate": 5.1071106498138764e-05,
"loss": 0.3624,
"step": 11760
},
{
"epoch": 15.284046692607005,
"grad_norm": 1.2278454303741455,
"learning_rate": 5.0903109766550264e-05,
"loss": 0.3658,
"step": 11784
},
{
"epoch": 15.315175097276265,
"grad_norm": 1.3733409643173218,
"learning_rate": 5.073510283563523e-05,
"loss": 0.3612,
"step": 11808
},
{
"epoch": 15.346303501945526,
"grad_norm": 1.3404691219329834,
"learning_rate": 5.05670876027904e-05,
"loss": 0.3629,
"step": 11832
},
{
"epoch": 15.377431906614786,
"grad_norm": 1.2201738357543945,
"learning_rate": 5.039906596550633e-05,
"loss": 0.3666,
"step": 11856
},
{
"epoch": 15.408560311284047,
"grad_norm": 2.0148181915283203,
"learning_rate": 5.023103982134586e-05,
"loss": 0.3665,
"step": 11880
},
{
"epoch": 15.439688715953308,
"grad_norm": 1.249961256980896,
"learning_rate": 5.006301106792274e-05,
"loss": 0.3647,
"step": 11904
},
{
"epoch": 15.470817120622568,
"grad_norm": 1.5822800397872925,
"learning_rate": 4.989498160288019e-05,
"loss": 0.3659,
"step": 11928
},
{
"epoch": 15.501945525291829,
"grad_norm": 1.1686407327651978,
"learning_rate": 4.9726953323869456e-05,
"loss": 0.363,
"step": 11952
},
{
"epoch": 15.53307392996109,
"grad_norm": 1.8801552057266235,
"learning_rate": 4.9558928128528414e-05,
"loss": 0.3623,
"step": 11976
},
{
"epoch": 15.56420233463035,
"grad_norm": 1.2335692644119263,
"learning_rate": 4.9390907914460105e-05,
"loss": 0.3664,
"step": 12000
},
{
"epoch": 15.59533073929961,
"grad_norm": 1.496955156326294,
"learning_rate": 4.9222894579211276e-05,
"loss": 0.3644,
"step": 12024
},
{
"epoch": 15.626459143968871,
"grad_norm": 1.6293377876281738,
"learning_rate": 4.905489002025106e-05,
"loss": 0.3605,
"step": 12048
},
{
"epoch": 15.657587548638132,
"grad_norm": 1.2555320262908936,
"learning_rate": 4.8886896134949415e-05,
"loss": 0.3594,
"step": 12072
},
{
"epoch": 15.688715953307392,
"grad_norm": 1.2741057872772217,
"learning_rate": 4.871891482055575e-05,
"loss": 0.3622,
"step": 12096
},
{
"epoch": 15.719844357976653,
"grad_norm": 2.100410223007202,
"learning_rate": 4.855094797417758e-05,
"loss": 0.3612,
"step": 12120
},
{
"epoch": 15.750972762645915,
"grad_norm": 0.88619464635849,
"learning_rate": 4.8382997492758936e-05,
"loss": 0.3589,
"step": 12144
},
{
"epoch": 15.782101167315176,
"grad_norm": 1.5951071977615356,
"learning_rate": 4.8215065273059085e-05,
"loss": 0.3613,
"step": 12168
},
{
"epoch": 15.813229571984436,
"grad_norm": 1.1034135818481445,
"learning_rate": 4.8047153211631e-05,
"loss": 0.3609,
"step": 12192
},
{
"epoch": 15.844357976653697,
"grad_norm": 1.9069421291351318,
"learning_rate": 4.787926320480009e-05,
"loss": 0.3617,
"step": 12216
},
{
"epoch": 15.875486381322958,
"grad_norm": 2.139292001724243,
"learning_rate": 4.7711397148642583e-05,
"loss": 0.3582,
"step": 12240
},
{
"epoch": 15.906614785992218,
"grad_norm": 1.134293556213379,
"learning_rate": 4.7543556938964275e-05,
"loss": 0.361,
"step": 12264
},
{
"epoch": 15.937743190661479,
"grad_norm": 1.2520484924316406,
"learning_rate": 4.7375744471279084e-05,
"loss": 0.3613,
"step": 12288
},
{
"epoch": 15.96887159533074,
"grad_norm": 1.2001314163208008,
"learning_rate": 4.720796164078755e-05,
"loss": 0.363,
"step": 12312
},
{
"epoch": 16.0,
"grad_norm": 1.0038580894470215,
"learning_rate": 4.7040210342355584e-05,
"loss": 0.3566,
"step": 12336
},
{
"epoch": 16.03112840466926,
"grad_norm": 1.0586698055267334,
"learning_rate": 4.6872492470492914e-05,
"loss": 0.3554,
"step": 12360
},
{
"epoch": 16.06225680933852,
"grad_norm": 1.4238923788070679,
"learning_rate": 4.670480991933182e-05,
"loss": 0.3598,
"step": 12384
},
{
"epoch": 16.09338521400778,
"grad_norm": 1.7448209524154663,
"learning_rate": 4.6537164582605674e-05,
"loss": 0.3523,
"step": 12408
},
{
"epoch": 16.124513618677042,
"grad_norm": 0.9236373901367188,
"learning_rate": 4.6369558353627517e-05,
"loss": 0.3556,
"step": 12432
},
{
"epoch": 16.155642023346303,
"grad_norm": 1.2013592720031738,
"learning_rate": 4.6201993125268804e-05,
"loss": 0.352,
"step": 12456
},
{
"epoch": 16.186770428015564,
"grad_norm": 1.267756700515747,
"learning_rate": 4.603447078993788e-05,
"loss": 0.3578,
"step": 12480
},
{
"epoch": 16.217898832684824,
"grad_norm": 1.0369305610656738,
"learning_rate": 4.586699323955871e-05,
"loss": 0.3476,
"step": 12504
},
{
"epoch": 16.249027237354085,
"grad_norm": 1.4075908660888672,
"learning_rate": 4.569956236554945e-05,
"loss": 0.3544,
"step": 12528
},
{
"epoch": 16.280155642023345,
"grad_norm": 1.3998584747314453,
"learning_rate": 4.5532180058801145e-05,
"loss": 0.3596,
"step": 12552
},
{
"epoch": 16.311284046692606,
"grad_norm": 1.5231702327728271,
"learning_rate": 4.5364848209656336e-05,
"loss": 0.3542,
"step": 12576
},
{
"epoch": 16.342412451361866,
"grad_norm": 1.283345103263855,
"learning_rate": 4.5197568707887675e-05,
"loss": 0.3526,
"step": 12600
},
{
"epoch": 16.373540856031127,
"grad_norm": 1.3944894075393677,
"learning_rate": 4.503034344267671e-05,
"loss": 0.357,
"step": 12624
},
{
"epoch": 16.404669260700388,
"grad_norm": 1.9900680780410767,
"learning_rate": 4.486317430259238e-05,
"loss": 0.3603,
"step": 12648
},
{
"epoch": 16.43579766536965,
"grad_norm": 0.9823328852653503,
"learning_rate": 4.4696063175569804e-05,
"loss": 0.3545,
"step": 12672
},
{
"epoch": 16.46692607003891,
"grad_norm": 1.634529709815979,
"learning_rate": 4.452901194888897e-05,
"loss": 0.3543,
"step": 12696
},
{
"epoch": 16.49805447470817,
"grad_norm": 1.4010380506515503,
"learning_rate": 4.436202250915329e-05,
"loss": 0.3524,
"step": 12720
},
{
"epoch": 16.529182879377434,
"grad_norm": 1.239943504333496,
"learning_rate": 4.419509674226846e-05,
"loss": 0.3648,
"step": 12744
},
{
"epoch": 16.560311284046694,
"grad_norm": 3.315246820449829,
"learning_rate": 4.4028236533421016e-05,
"loss": 0.3624,
"step": 12768
},
{
"epoch": 16.591439688715955,
"grad_norm": 1.0445722341537476,
"learning_rate": 4.3861443767057205e-05,
"loss": 0.3536,
"step": 12792
},
{
"epoch": 16.622568093385215,
"grad_norm": 1.154893398284912,
"learning_rate": 4.369472032686149e-05,
"loss": 0.3608,
"step": 12816
},
{
"epoch": 16.653696498054476,
"grad_norm": 2.0033769607543945,
"learning_rate": 4.352806809573547e-05,
"loss": 0.3511,
"step": 12840
},
{
"epoch": 16.684824902723737,
"grad_norm": 1.4693876504898071,
"learning_rate": 4.336148895577656e-05,
"loss": 0.3531,
"step": 12864
},
{
"epoch": 16.715953307392997,
"grad_norm": 1.8765549659729004,
"learning_rate": 4.319498478825663e-05,
"loss": 0.3563,
"step": 12888
},
{
"epoch": 16.747081712062258,
"grad_norm": 1.6893914937973022,
"learning_rate": 4.302855747360092e-05,
"loss": 0.3579,
"step": 12912
},
{
"epoch": 16.77821011673152,
"grad_norm": 1.183452844619751,
"learning_rate": 4.286220889136668e-05,
"loss": 0.3637,
"step": 12936
},
{
"epoch": 16.80933852140078,
"grad_norm": 1.102815866470337,
"learning_rate": 4.269594092022203e-05,
"loss": 0.3561,
"step": 12960
},
{
"epoch": 16.84046692607004,
"grad_norm": 0.9764434695243835,
"learning_rate": 4.252975543792468e-05,
"loss": 0.3581,
"step": 12984
},
{
"epoch": 16.8715953307393,
"grad_norm": 2.3779425621032715,
"learning_rate": 4.2363654321300735e-05,
"loss": 0.3531,
"step": 13008
},
{
"epoch": 16.90272373540856,
"grad_norm": 1.463118076324463,
"learning_rate": 4.219763944622356e-05,
"loss": 0.3562,
"step": 13032
},
{
"epoch": 16.93385214007782,
"grad_norm": 1.756101369857788,
"learning_rate": 4.203171268759248e-05,
"loss": 0.3566,
"step": 13056
},
{
"epoch": 16.964980544747082,
"grad_norm": 1.5917153358459473,
"learning_rate": 4.1865875919311726e-05,
"loss": 0.3504,
"step": 13080
},
{
"epoch": 16.996108949416342,
"grad_norm": 2.404031753540039,
"learning_rate": 4.170013101426917e-05,
"loss": 0.3581,
"step": 13104
},
{
"epoch": 17.027237354085603,
"grad_norm": 1.3285900354385376,
"learning_rate": 4.153447984431527e-05,
"loss": 0.3499,
"step": 13128
},
{
"epoch": 17.058365758754864,
"grad_norm": 1.0520793199539185,
"learning_rate": 4.136892428024187e-05,
"loss": 0.3547,
"step": 13152
},
{
"epoch": 17.089494163424124,
"grad_norm": 1.0784560441970825,
"learning_rate": 4.120346619176102e-05,
"loss": 0.3525,
"step": 13176
},
{
"epoch": 17.120622568093385,
"grad_norm": 1.9099761247634888,
"learning_rate": 4.103810744748403e-05,
"loss": 0.3531,
"step": 13200
},
{
"epoch": 17.151750972762645,
"grad_norm": 1.4144366979599,
"learning_rate": 4.0872849914900175e-05,
"loss": 0.3431,
"step": 13224
},
{
"epoch": 17.182879377431906,
"grad_norm": 1.078682541847229,
"learning_rate": 4.070769546035571e-05,
"loss": 0.3563,
"step": 13248
},
{
"epoch": 17.214007782101167,
"grad_norm": 2.5183982849121094,
"learning_rate": 4.054264594903281e-05,
"loss": 0.3534,
"step": 13272
},
{
"epoch": 17.245136186770427,
"grad_norm": 1.3110893964767456,
"learning_rate": 4.037770324492841e-05,
"loss": 0.351,
"step": 13296
},
{
"epoch": 17.276264591439688,
"grad_norm": 1.4684545993804932,
"learning_rate": 4.021286921083326e-05,
"loss": 0.3525,
"step": 13320
},
{
"epoch": 17.30739299610895,
"grad_norm": 1.3898323774337769,
"learning_rate": 4.004814570831078e-05,
"loss": 0.353,
"step": 13344
},
{
"epoch": 17.33852140077821,
"grad_norm": 1.7565838098526,
"learning_rate": 3.9883534597676177e-05,
"loss": 0.3566,
"step": 13368
},
{
"epoch": 17.36964980544747,
"grad_norm": 1.3672667741775513,
"learning_rate": 3.971903773797528e-05,
"loss": 0.3502,
"step": 13392
},
{
"epoch": 17.40077821011673,
"grad_norm": 1.2242878675460815,
"learning_rate": 3.955465698696363e-05,
"loss": 0.3518,
"step": 13416
},
{
"epoch": 17.43190661478599,
"grad_norm": 2.410991907119751,
"learning_rate": 3.939039420108556e-05,
"loss": 0.3503,
"step": 13440
},
{
"epoch": 17.46303501945525,
"grad_norm": 1.4282727241516113,
"learning_rate": 3.922625123545305e-05,
"loss": 0.3488,
"step": 13464
},
{
"epoch": 17.494163424124515,
"grad_norm": 1.5992825031280518,
"learning_rate": 3.906222994382495e-05,
"loss": 0.3567,
"step": 13488
},
{
"epoch": 17.525291828793776,
"grad_norm": 2.398169994354248,
"learning_rate": 3.889833217858594e-05,
"loss": 0.3542,
"step": 13512
},
{
"epoch": 17.556420233463037,
"grad_norm": 1.140195608139038,
"learning_rate": 3.873455979072569e-05,
"loss": 0.3493,
"step": 13536
},
{
"epoch": 17.587548638132297,
"grad_norm": 1.305156946182251,
"learning_rate": 3.8570914629817886e-05,
"loss": 0.3504,
"step": 13560
},
{
"epoch": 17.618677042801558,
"grad_norm": 9.382534980773926,
"learning_rate": 3.840739854399934e-05,
"loss": 0.3534,
"step": 13584
},
{
"epoch": 17.64980544747082,
"grad_norm": 1.1403177976608276,
"learning_rate": 3.824401337994923e-05,
"loss": 0.3461,
"step": 13608
},
{
"epoch": 17.68093385214008,
"grad_norm": 2.1274640560150146,
"learning_rate": 3.808076098286806e-05,
"loss": 0.3521,
"step": 13632
},
{
"epoch": 17.71206225680934,
"grad_norm": 1.9969298839569092,
"learning_rate": 3.7917643196457e-05,
"loss": 0.3521,
"step": 13656
},
{
"epoch": 17.7431906614786,
"grad_norm": 1.2433438301086426,
"learning_rate": 3.775466186289693e-05,
"loss": 0.3565,
"step": 13680
},
{
"epoch": 17.77431906614786,
"grad_norm": 1.7864729166030884,
"learning_rate": 3.7591818822827745e-05,
"loss": 0.3508,
"step": 13704
},
{
"epoch": 17.80544747081712,
"grad_norm": 1.7596447467803955,
"learning_rate": 3.7429115915327484e-05,
"loss": 0.3533,
"step": 13728
},
{
"epoch": 17.836575875486382,
"grad_norm": 1.7605047225952148,
"learning_rate": 3.726655497789156e-05,
"loss": 0.3553,
"step": 13752
},
{
"epoch": 17.867704280155642,
"grad_norm": 1.5380836725234985,
"learning_rate": 3.710413784641212e-05,
"loss": 0.3526,
"step": 13776
},
{
"epoch": 17.898832684824903,
"grad_norm": 1.448866844177246,
"learning_rate": 3.694186635515714e-05,
"loss": 0.3516,
"step": 13800
},
{
"epoch": 17.929961089494164,
"grad_norm": 1.527550458908081,
"learning_rate": 3.677974233674983e-05,
"loss": 0.3438,
"step": 13824
},
{
"epoch": 17.961089494163424,
"grad_norm": 1.3250521421432495,
"learning_rate": 3.661776762214797e-05,
"loss": 0.3551,
"step": 13848
},
{
"epoch": 17.992217898832685,
"grad_norm": 1.4741333723068237,
"learning_rate": 3.6455944040623075e-05,
"loss": 0.3529,
"step": 13872
},
{
"epoch": 18.023346303501945,
"grad_norm": 2.2234058380126953,
"learning_rate": 3.6294273419739874e-05,
"loss": 0.3486,
"step": 13896
},
{
"epoch": 18.054474708171206,
"grad_norm": 1.4099419116973877,
"learning_rate": 3.613275758533561e-05,
"loss": 0.3473,
"step": 13920
},
{
"epoch": 18.085603112840467,
"grad_norm": 1.9094316959381104,
"learning_rate": 3.5971398361499466e-05,
"loss": 0.3548,
"step": 13944
},
{
"epoch": 18.116731517509727,
"grad_norm": 1.2845815420150757,
"learning_rate": 3.581019757055188e-05,
"loss": 0.345,
"step": 13968
},
{
"epoch": 18.147859922178988,
"grad_norm": 2.0491998195648193,
"learning_rate": 3.564915703302407e-05,
"loss": 0.3474,
"step": 13992
},
{
"epoch": 18.17898832684825,
"grad_norm": 1.3620078563690186,
"learning_rate": 3.5488278567637426e-05,
"loss": 0.3452,
"step": 14016
},
{
"epoch": 18.21011673151751,
"grad_norm": 4.295355796813965,
"learning_rate": 3.53275639912829e-05,
"loss": 0.3474,
"step": 14040
},
{
"epoch": 18.24124513618677,
"grad_norm": 2.150200366973877,
"learning_rate": 3.516701511900062e-05,
"loss": 0.3465,
"step": 14064
},
{
"epoch": 18.27237354085603,
"grad_norm": 1.407614827156067,
"learning_rate": 3.500663376395927e-05,
"loss": 0.3453,
"step": 14088
},
{
"epoch": 18.30350194552529,
"grad_norm": 1.2066164016723633,
"learning_rate": 3.484642173743575e-05,
"loss": 0.3477,
"step": 14112
},
{
"epoch": 18.33463035019455,
"grad_norm": 1.1473839282989502,
"learning_rate": 3.4686380848794544e-05,
"loss": 0.3448,
"step": 14136
},
{
"epoch": 18.365758754863812,
"grad_norm": 2.0838565826416016,
"learning_rate": 3.452651290546742e-05,
"loss": 0.3451,
"step": 14160
},
{
"epoch": 18.396887159533073,
"grad_norm": 1.3917421102523804,
"learning_rate": 3.436681971293301e-05,
"loss": 0.3442,
"step": 14184
},
{
"epoch": 18.428015564202333,
"grad_norm": 1.2915924787521362,
"learning_rate": 3.420730307469632e-05,
"loss": 0.3409,
"step": 14208
},
{
"epoch": 18.459143968871594,
"grad_norm": 2.337096691131592,
"learning_rate": 3.404796479226852e-05,
"loss": 0.3471,
"step": 14232
},
{
"epoch": 18.490272373540854,
"grad_norm": 1.732359528541565,
"learning_rate": 3.3888806665146374e-05,
"loss": 0.3478,
"step": 14256
},
{
"epoch": 18.52140077821012,
"grad_norm": 1.1314399242401123,
"learning_rate": 3.3729830490792166e-05,
"loss": 0.345,
"step": 14280
},
{
"epoch": 18.55252918287938,
"grad_norm": 1.5127285718917847,
"learning_rate": 3.357103806461328e-05,
"loss": 0.3405,
"step": 14304
},
{
"epoch": 18.58365758754864,
"grad_norm": 1.306648850440979,
"learning_rate": 3.3412431179941847e-05,
"loss": 0.3443,
"step": 14328
},
{
"epoch": 18.6147859922179,
"grad_norm": 1.189726710319519,
"learning_rate": 3.3254011628014656e-05,
"loss": 0.3447,
"step": 14352
},
{
"epoch": 18.64591439688716,
"grad_norm": 1.2058913707733154,
"learning_rate": 3.309578119795278e-05,
"loss": 0.347,
"step": 14376
},
{
"epoch": 18.67704280155642,
"grad_norm": 1.702572226524353,
"learning_rate": 3.293774167674149e-05,
"loss": 0.3496,
"step": 14400
},
{
"epoch": 18.708171206225682,
"grad_norm": 1.8515872955322266,
"learning_rate": 3.277989484920996e-05,
"loss": 0.344,
"step": 14424
},
{
"epoch": 18.739299610894943,
"grad_norm": 1.8190243244171143,
"learning_rate": 3.26222424980112e-05,
"loss": 0.3499,
"step": 14448
},
{
"epoch": 18.770428015564203,
"grad_norm": 1.261648416519165,
"learning_rate": 3.246478640360191e-05,
"loss": 0.345,
"step": 14472
},
{
"epoch": 18.801556420233464,
"grad_norm": 1.3052914142608643,
"learning_rate": 3.2307528344222296e-05,
"loss": 0.3505,
"step": 14496
},
{
"epoch": 18.832684824902724,
"grad_norm": 1.5217386484146118,
"learning_rate": 3.215047009587609e-05,
"loss": 0.3507,
"step": 14520
},
{
"epoch": 18.863813229571985,
"grad_norm": 1.2934740781784058,
"learning_rate": 3.1993613432310384e-05,
"loss": 0.3459,
"step": 14544
},
{
"epoch": 18.894941634241246,
"grad_norm": 1.5978559255599976,
"learning_rate": 3.183696012499574e-05,
"loss": 0.3464,
"step": 14568
},
{
"epoch": 18.926070038910506,
"grad_norm": 1.2306820154190063,
"learning_rate": 3.168051194310609e-05,
"loss": 0.3446,
"step": 14592
},
{
"epoch": 18.957198443579767,
"grad_norm": 1.1488240957260132,
"learning_rate": 3.152427065349867e-05,
"loss": 0.3475,
"step": 14616
},
{
"epoch": 18.988326848249027,
"grad_norm": 3.1832704544067383,
"learning_rate": 3.1368238020694316e-05,
"loss": 0.3437,
"step": 14640
},
{
"epoch": 19.019455252918288,
"grad_norm": 2.3371617794036865,
"learning_rate": 3.121241580685727e-05,
"loss": 0.3465,
"step": 14664
},
{
"epoch": 19.05058365758755,
"grad_norm": 2.816099166870117,
"learning_rate": 3.1056805771775436e-05,
"loss": 0.3435,
"step": 14688
},
{
"epoch": 19.08171206225681,
"grad_norm": 1.3421522378921509,
"learning_rate": 3.090140967284046e-05,
"loss": 0.3418,
"step": 14712
},
{
"epoch": 19.11284046692607,
"grad_norm": 1.8488672971725464,
"learning_rate": 3.07462292650279e-05,
"loss": 0.348,
"step": 14736
},
{
"epoch": 19.14396887159533,
"grad_norm": 1.2293037176132202,
"learning_rate": 3.05912663008774e-05,
"loss": 0.342,
"step": 14760
},
{
"epoch": 19.17509727626459,
"grad_norm": 1.7620015144348145,
"learning_rate": 3.043652253047281e-05,
"loss": 0.3454,
"step": 14784
},
{
"epoch": 19.20622568093385,
"grad_norm": 1.6479402780532837,
"learning_rate": 3.0281999701422637e-05,
"loss": 0.3427,
"step": 14808
},
{
"epoch": 19.237354085603112,
"grad_norm": 1.5058902502059937,
"learning_rate": 3.012769955884005e-05,
"loss": 0.3328,
"step": 14832
},
{
"epoch": 19.268482490272373,
"grad_norm": 1.6616445779800415,
"learning_rate": 2.9973623845323347e-05,
"loss": 0.3441,
"step": 14856
},
{
"epoch": 19.299610894941633,
"grad_norm": 1.5390020608901978,
"learning_rate": 2.9819774300936255e-05,
"loss": 0.3434,
"step": 14880
},
{
"epoch": 19.330739299610894,
"grad_norm": 1.7172026634216309,
"learning_rate": 2.9666152663188172e-05,
"loss": 0.3439,
"step": 14904
},
{
"epoch": 19.361867704280154,
"grad_norm": 1.134320855140686,
"learning_rate": 2.9512760667014682e-05,
"loss": 0.3431,
"step": 14928
},
{
"epoch": 19.392996108949415,
"grad_norm": 4.418805122375488,
"learning_rate": 2.935960004475784e-05,
"loss": 0.344,
"step": 14952
},
{
"epoch": 19.424124513618676,
"grad_norm": 1.3951141834259033,
"learning_rate": 2.920667252614674e-05,
"loss": 0.3334,
"step": 14976
},
{
"epoch": 19.455252918287936,
"grad_norm": 2.0081377029418945,
"learning_rate": 2.9053979838277834e-05,
"loss": 0.3413,
"step": 15000
},
{
"epoch": 19.486381322957197,
"grad_norm": 1.0862860679626465,
"learning_rate": 2.890152370559552e-05,
"loss": 0.3406,
"step": 15024
},
{
"epoch": 19.51750972762646,
"grad_norm": 1.3487762212753296,
"learning_rate": 2.8749305849872686e-05,
"loss": 0.3335,
"step": 15048
},
{
"epoch": 19.54863813229572,
"grad_norm": 1.122753381729126,
"learning_rate": 2.8597327990191146e-05,
"loss": 0.3491,
"step": 15072
},
{
"epoch": 19.579766536964982,
"grad_norm": 1.518355131149292,
"learning_rate": 2.844559184292239e-05,
"loss": 0.3405,
"step": 15096
},
{
"epoch": 19.610894941634243,
"grad_norm": 1.0469350814819336,
"learning_rate": 2.829409912170806e-05,
"loss": 0.3395,
"step": 15120
},
{
"epoch": 19.642023346303503,
"grad_norm": 1.915490984916687,
"learning_rate": 2.814285153744064e-05,
"loss": 0.3426,
"step": 15144
},
{
"epoch": 19.673151750972764,
"grad_norm": 1.477184772491455,
"learning_rate": 2.7991850798244197e-05,
"loss": 0.3463,
"step": 15168
},
{
"epoch": 19.704280155642024,
"grad_norm": 1.3598774671554565,
"learning_rate": 2.7841098609454976e-05,
"loss": 0.3454,
"step": 15192
},
{
"epoch": 19.735408560311285,
"grad_norm": 2.6406991481781006,
"learning_rate": 2.769059667360227e-05,
"loss": 0.3422,
"step": 15216
},
{
"epoch": 19.766536964980546,
"grad_norm": 1.2698395252227783,
"learning_rate": 2.754034669038905e-05,
"loss": 0.3473,
"step": 15240
},
{
"epoch": 19.797665369649806,
"grad_norm": 1.3700004816055298,
"learning_rate": 2.7390350356672934e-05,
"loss": 0.3434,
"step": 15264
},
{
"epoch": 19.828793774319067,
"grad_norm": 1.1726247072219849,
"learning_rate": 2.7240609366446845e-05,
"loss": 0.3421,
"step": 15288
},
{
"epoch": 19.859922178988327,
"grad_norm": 1.5183639526367188,
"learning_rate": 2.709112541082e-05,
"loss": 0.3418,
"step": 15312
},
{
"epoch": 19.891050583657588,
"grad_norm": 1.1311919689178467,
"learning_rate": 2.6941900177998824e-05,
"loss": 0.3411,
"step": 15336
},
{
"epoch": 19.92217898832685,
"grad_norm": 1.6014869213104248,
"learning_rate": 2.6792935353267757e-05,
"loss": 0.339,
"step": 15360
},
{
"epoch": 19.95330739299611,
"grad_norm": 1.8378218412399292,
"learning_rate": 2.6644232618970382e-05,
"loss": 0.3464,
"step": 15384
},
{
"epoch": 19.98443579766537,
"grad_norm": 2.1291933059692383,
"learning_rate": 2.6495793654490292e-05,
"loss": 0.3409,
"step": 15408
},
{
"epoch": 20.01556420233463,
"grad_norm": 1.1774524450302124,
"learning_rate": 2.6347620136232232e-05,
"loss": 0.339,
"step": 15432
},
{
"epoch": 20.04669260700389,
"grad_norm": 1.3319616317749023,
"learning_rate": 2.6199713737603055e-05,
"loss": 0.3376,
"step": 15456
},
{
"epoch": 20.07782101167315,
"grad_norm": 1.488239049911499,
"learning_rate": 2.60520761289929e-05,
"loss": 0.3379,
"step": 15480
},
{
"epoch": 20.108949416342412,
"grad_norm": 1.2733827829360962,
"learning_rate": 2.590470897775636e-05,
"loss": 0.3352,
"step": 15504
},
{
"epoch": 20.140077821011673,
"grad_norm": 2.291374921798706,
"learning_rate": 2.575761394819351e-05,
"loss": 0.3395,
"step": 15528
},
{
"epoch": 20.171206225680933,
"grad_norm": 1.3169567584991455,
"learning_rate": 2.5610792701531298e-05,
"loss": 0.3365,
"step": 15552
},
{
"epoch": 20.202334630350194,
"grad_norm": 1.0463300943374634,
"learning_rate": 2.54642468959046e-05,
"loss": 0.337,
"step": 15576
},
{
"epoch": 20.233463035019454,
"grad_norm": 1.5346705913543701,
"learning_rate": 2.5317978186337664e-05,
"loss": 0.3394,
"step": 15600
},
{
"epoch": 20.264591439688715,
"grad_norm": 1.6092703342437744,
"learning_rate": 2.5171988224725267e-05,
"loss": 0.3308,
"step": 15624
},
{
"epoch": 20.295719844357976,
"grad_norm": 1.3011606931686401,
"learning_rate": 2.5026278659814144e-05,
"loss": 0.339,
"step": 15648
},
{
"epoch": 20.326848249027236,
"grad_norm": 1.2459102869033813,
"learning_rate": 2.4880851137184403e-05,
"loss": 0.3308,
"step": 15672
},
{
"epoch": 20.357976653696497,
"grad_norm": 1.4810408353805542,
"learning_rate": 2.4735707299230808e-05,
"loss": 0.3376,
"step": 15696
},
{
"epoch": 20.389105058365757,
"grad_norm": 1.2645267248153687,
"learning_rate": 2.4590848785144386e-05,
"loss": 0.3402,
"step": 15720
},
{
"epoch": 20.420233463035018,
"grad_norm": 2.001779556274414,
"learning_rate": 2.4446277230893823e-05,
"loss": 0.3358,
"step": 15744
},
{
"epoch": 20.45136186770428,
"grad_norm": 3.0970067977905273,
"learning_rate": 2.4301994269206968e-05,
"loss": 0.334,
"step": 15768
},
{
"epoch": 20.48249027237354,
"grad_norm": 1.4983640909194946,
"learning_rate": 2.415800152955247e-05,
"loss": 0.3424,
"step": 15792
},
{
"epoch": 20.5136186770428,
"grad_norm": 1.3392024040222168,
"learning_rate": 2.40143006381213e-05,
"loss": 0.3463,
"step": 15816
},
{
"epoch": 20.544747081712064,
"grad_norm": 1.4383450746536255,
"learning_rate": 2.3870893217808495e-05,
"loss": 0.3354,
"step": 15840
},
{
"epoch": 20.575875486381324,
"grad_norm": 1.4223530292510986,
"learning_rate": 2.3727780888194658e-05,
"loss": 0.333,
"step": 15864
},
{
"epoch": 20.607003891050585,
"grad_norm": 1.5441044569015503,
"learning_rate": 2.3584965265527847e-05,
"loss": 0.3335,
"step": 15888
},
{
"epoch": 20.638132295719846,
"grad_norm": 0.8291170597076416,
"learning_rate": 2.344244796270524e-05,
"loss": 0.3389,
"step": 15912
},
{
"epoch": 20.669260700389106,
"grad_norm": 2.7805609703063965,
"learning_rate": 2.330023058925486e-05,
"loss": 0.3353,
"step": 15936
},
{
"epoch": 20.700389105058367,
"grad_norm": 1.6097582578659058,
"learning_rate": 2.3158314751317513e-05,
"loss": 0.339,
"step": 15960
},
{
"epoch": 20.731517509727627,
"grad_norm": 1.4149878025054932,
"learning_rate": 2.3016702051628547e-05,
"loss": 0.3375,
"step": 15984
},
{
"epoch": 20.762645914396888,
"grad_norm": 1.2236443758010864,
"learning_rate": 2.2875394089499847e-05,
"loss": 0.3358,
"step": 16008
},
{
"epoch": 20.79377431906615,
"grad_norm": 1.0645393133163452,
"learning_rate": 2.2734392460801727e-05,
"loss": 0.3377,
"step": 16032
},
{
"epoch": 20.82490272373541,
"grad_norm": 1.2843340635299683,
"learning_rate": 2.259369875794485e-05,
"loss": 0.3332,
"step": 16056
},
{
"epoch": 20.85603112840467,
"grad_norm": 1.735514760017395,
"learning_rate": 2.2453314569862366e-05,
"loss": 0.3364,
"step": 16080
},
{
"epoch": 20.88715953307393,
"grad_norm": 1.3856208324432373,
"learning_rate": 2.2313241481991855e-05,
"loss": 0.3389,
"step": 16104
},
{
"epoch": 20.91828793774319,
"grad_norm": 1.7546725273132324,
"learning_rate": 2.217348107625748e-05,
"loss": 0.3373,
"step": 16128
},
{
"epoch": 20.94941634241245,
"grad_norm": 1.3664530515670776,
"learning_rate": 2.2034034931052096e-05,
"loss": 0.3398,
"step": 16152
},
{
"epoch": 20.980544747081712,
"grad_norm": 5.165532112121582,
"learning_rate": 2.1894904621219463e-05,
"loss": 0.3372,
"step": 16176
},
{
"epoch": 21.011673151750973,
"grad_norm": 1.3261635303497314,
"learning_rate": 2.175609171803644e-05,
"loss": 0.3381,
"step": 16200
},
{
"epoch": 21.042801556420233,
"grad_norm": 1.8854881525039673,
"learning_rate": 2.1617597789195193e-05,
"loss": 0.3347,
"step": 16224
},
{
"epoch": 21.073929961089494,
"grad_norm": 1.3904035091400146,
"learning_rate": 2.1479424398785573e-05,
"loss": 0.3346,
"step": 16248
},
{
"epoch": 21.105058365758754,
"grad_norm": 1.318601369857788,
"learning_rate": 2.1341573107277392e-05,
"loss": 0.3347,
"step": 16272
},
{
"epoch": 21.136186770428015,
"grad_norm": 1.0564274787902832,
"learning_rate": 2.1204045471502803e-05,
"loss": 0.3295,
"step": 16296
},
{
"epoch": 21.167315175097276,
"grad_norm": 0.9953235387802124,
"learning_rate": 2.106684304463874e-05,
"loss": 0.3339,
"step": 16320
},
{
"epoch": 21.198443579766536,
"grad_norm": 1.0253063440322876,
"learning_rate": 2.092996737618939e-05,
"loss": 0.3271,
"step": 16344
},
{
"epoch": 21.229571984435797,
"grad_norm": 1.5001134872436523,
"learning_rate": 2.079342001196869e-05,
"loss": 0.3359,
"step": 16368
},
{
"epoch": 21.260700389105057,
"grad_norm": 1.1106650829315186,
"learning_rate": 2.0657202494082773e-05,
"loss": 0.327,
"step": 16392
},
{
"epoch": 21.291828793774318,
"grad_norm": 1.0053423643112183,
"learning_rate": 2.052131636091273e-05,
"loss": 0.3398,
"step": 16416
},
{
"epoch": 21.32295719844358,
"grad_norm": 1.3083621263504028,
"learning_rate": 2.038576314709707e-05,
"loss": 0.3306,
"step": 16440
},
{
"epoch": 21.35408560311284,
"grad_norm": 1.4561755657196045,
"learning_rate": 2.0250544383514457e-05,
"loss": 0.3364,
"step": 16464
},
{
"epoch": 21.3852140077821,
"grad_norm": 1.0885835886001587,
"learning_rate": 2.0115661597266476e-05,
"loss": 0.3355,
"step": 16488
},
{
"epoch": 21.41634241245136,
"grad_norm": 1.3506430387496948,
"learning_rate": 1.998111631166027e-05,
"loss": 0.3334,
"step": 16512
},
{
"epoch": 21.44747081712062,
"grad_norm": 1.0331530570983887,
"learning_rate": 1.9846910046191446e-05,
"loss": 0.3303,
"step": 16536
},
{
"epoch": 21.47859922178988,
"grad_norm": 1.0616254806518555,
"learning_rate": 1.9713044316526813e-05,
"loss": 0.3348,
"step": 16560
},
{
"epoch": 21.509727626459146,
"grad_norm": 2.5577657222747803,
"learning_rate": 1.9579520634487386e-05,
"loss": 0.335,
"step": 16584
},
{
"epoch": 21.540856031128406,
"grad_norm": 1.5290476083755493,
"learning_rate": 1.9446340508031185e-05,
"loss": 0.3382,
"step": 16608
},
{
"epoch": 21.571984435797667,
"grad_norm": 0.8804724216461182,
"learning_rate": 1.931350544123627e-05,
"loss": 0.3257,
"step": 16632
},
{
"epoch": 21.603112840466927,
"grad_norm": 1.1799284219741821,
"learning_rate": 1.918101693428379e-05,
"loss": 0.3298,
"step": 16656
},
{
"epoch": 21.634241245136188,
"grad_norm": 1.3328742980957031,
"learning_rate": 1.9048876483440942e-05,
"loss": 0.3373,
"step": 16680
},
{
"epoch": 21.66536964980545,
"grad_norm": 0.9985073208808899,
"learning_rate": 1.8917085581044193e-05,
"loss": 0.3313,
"step": 16704
},
{
"epoch": 21.69649805447471,
"grad_norm": 1.498244047164917,
"learning_rate": 1.8785645715482285e-05,
"loss": 0.3303,
"step": 16728
},
{
"epoch": 21.72762645914397,
"grad_norm": 1.6468580961227417,
"learning_rate": 1.8654558371179583e-05,
"loss": 0.3252,
"step": 16752
},
{
"epoch": 21.75875486381323,
"grad_norm": 1.6541725397109985,
"learning_rate": 1.8523825028579212e-05,
"loss": 0.3299,
"step": 16776
},
{
"epoch": 21.78988326848249,
"grad_norm": 0.9805202484130859,
"learning_rate": 1.8393447164126282e-05,
"loss": 0.3342,
"step": 16800
},
{
"epoch": 21.82101167315175,
"grad_norm": 0.9097315073013306,
"learning_rate": 1.8263426250251388e-05,
"loss": 0.3309,
"step": 16824
},
{
"epoch": 21.852140077821012,
"grad_norm": 1.2603996992111206,
"learning_rate": 1.8133763755353816e-05,
"loss": 0.3387,
"step": 16848
},
{
"epoch": 21.883268482490273,
"grad_norm": 1.0283710956573486,
"learning_rate": 1.800446114378508e-05,
"loss": 0.3325,
"step": 16872
},
{
"epoch": 21.914396887159533,
"grad_norm": 2.601137399673462,
"learning_rate": 1.7875519875832254e-05,
"loss": 0.3356,
"step": 16896
},
{
"epoch": 21.945525291828794,
"grad_norm": 1.0405902862548828,
"learning_rate": 1.774694140770163e-05,
"loss": 0.3339,
"step": 16920
},
{
"epoch": 21.976653696498055,
"grad_norm": 1.504928708076477,
"learning_rate": 1.7618727191502188e-05,
"loss": 0.3329,
"step": 16944
},
{
"epoch": 22.007782101167315,
"grad_norm": 1.1356394290924072,
"learning_rate": 1.749087867522912e-05,
"loss": 0.331,
"step": 16968
},
{
"epoch": 22.038910505836576,
"grad_norm": 1.3053059577941895,
"learning_rate": 1.7363397302747687e-05,
"loss": 0.3316,
"step": 16992
},
{
"epoch": 22.070038910505836,
"grad_norm": 1.8512986898422241,
"learning_rate": 1.723628451377669e-05,
"loss": 0.3286,
"step": 17016
},
{
"epoch": 22.101167315175097,
"grad_norm": 1.1379419565200806,
"learning_rate": 1.7109541743872366e-05,
"loss": 0.3311,
"step": 17040
},
{
"epoch": 22.132295719844358,
"grad_norm": 1.0137568712234497,
"learning_rate": 1.698317042441211e-05,
"loss": 0.3294,
"step": 17064
},
{
"epoch": 22.163424124513618,
"grad_norm": 1.1163158416748047,
"learning_rate": 1.6857171982578286e-05,
"loss": 0.3247,
"step": 17088
},
{
"epoch": 22.19455252918288,
"grad_norm": 0.992064893245697,
"learning_rate": 1.6731547841342193e-05,
"loss": 0.3331,
"step": 17112
},
{
"epoch": 22.22568093385214,
"grad_norm": 1.2021843194961548,
"learning_rate": 1.6606299419447894e-05,
"loss": 0.3284,
"step": 17136
},
{
"epoch": 22.2568093385214,
"grad_norm": 2.352348566055298,
"learning_rate": 1.6481428131396275e-05,
"loss": 0.3315,
"step": 17160
},
{
"epoch": 22.28793774319066,
"grad_norm": 1.283078908920288,
"learning_rate": 1.6356935387428996e-05,
"loss": 0.3262,
"step": 17184
},
{
"epoch": 22.31906614785992,
"grad_norm": 1.2125391960144043,
"learning_rate": 1.6232822593512654e-05,
"loss": 0.3312,
"step": 17208
},
{
"epoch": 22.35019455252918,
"grad_norm": 1.2397364377975464,
"learning_rate": 1.610909115132286e-05,
"loss": 0.3268,
"step": 17232
},
{
"epoch": 22.381322957198442,
"grad_norm": 1.4817135334014893,
"learning_rate": 1.5985742458228338e-05,
"loss": 0.3283,
"step": 17256
},
{
"epoch": 22.412451361867703,
"grad_norm": 2.0548017024993896,
"learning_rate": 1.58627779072753e-05,
"loss": 0.3249,
"step": 17280
},
{
"epoch": 22.443579766536963,
"grad_norm": 1.4913387298583984,
"learning_rate": 1.574019888717155e-05,
"loss": 0.3277,
"step": 17304
},
{
"epoch": 22.474708171206224,
"grad_norm": 1.2476876974105835,
"learning_rate": 1.5618006782270904e-05,
"loss": 0.3298,
"step": 17328
},
{
"epoch": 22.505836575875485,
"grad_norm": 1.2181342840194702,
"learning_rate": 1.5496202972557556e-05,
"loss": 0.329,
"step": 17352
},
{
"epoch": 22.53696498054475,
"grad_norm": 1.3082391023635864,
"learning_rate": 1.5374788833630404e-05,
"loss": 0.328,
"step": 17376
},
{
"epoch": 22.56809338521401,
"grad_norm": 1.217458963394165,
"learning_rate": 1.5253765736687636e-05,
"loss": 0.3273,
"step": 17400
},
{
"epoch": 22.59922178988327,
"grad_norm": 1.1426113843917847,
"learning_rate": 1.5133135048511127e-05,
"loss": 0.3314,
"step": 17424
},
{
"epoch": 22.63035019455253,
"grad_norm": 1.8684285879135132,
"learning_rate": 1.5012898131451114e-05,
"loss": 0.3301,
"step": 17448
},
{
"epoch": 22.66147859922179,
"grad_norm": 1.1370235681533813,
"learning_rate": 1.489305634341071e-05,
"loss": 0.3315,
"step": 17472
},
{
"epoch": 22.69260700389105,
"grad_norm": 1.1359672546386719,
"learning_rate": 1.4773611037830626e-05,
"loss": 0.3283,
"step": 17496
},
{
"epoch": 22.723735408560312,
"grad_norm": 1.3090800046920776,
"learning_rate": 1.4654563563673901e-05,
"loss": 0.3282,
"step": 17520
},
{
"epoch": 22.754863813229573,
"grad_norm": 1.2736905813217163,
"learning_rate": 1.4535915265410593e-05,
"loss": 0.33,
"step": 17544
},
{
"epoch": 22.785992217898833,
"grad_norm": 1.189782977104187,
"learning_rate": 1.4417667483002688e-05,
"loss": 0.3267,
"step": 17568
},
{
"epoch": 22.817120622568094,
"grad_norm": 2.092562437057495,
"learning_rate": 1.4299821551888881e-05,
"loss": 0.3276,
"step": 17592
},
{
"epoch": 22.848249027237355,
"grad_norm": 1.8085280656814575,
"learning_rate": 1.4182378802969582e-05,
"loss": 0.3267,
"step": 17616
},
{
"epoch": 22.879377431906615,
"grad_norm": 1.2389247417449951,
"learning_rate": 1.4065340562591784e-05,
"loss": 0.3322,
"step": 17640
},
{
"epoch": 22.910505836575876,
"grad_norm": 2.3639073371887207,
"learning_rate": 1.3948708152534162e-05,
"loss": 0.3286,
"step": 17664
},
{
"epoch": 22.941634241245136,
"grad_norm": 1.4584684371948242,
"learning_rate": 1.3832482889992138e-05,
"loss": 0.3275,
"step": 17688
},
{
"epoch": 22.972762645914397,
"grad_norm": 1.2135454416275024,
"learning_rate": 1.3716666087562951e-05,
"loss": 0.3331,
"step": 17712
},
{
"epoch": 23.003891050583658,
"grad_norm": 1.1459728479385376,
"learning_rate": 1.3601259053230924e-05,
"loss": 0.3259,
"step": 17736
},
{
"epoch": 23.035019455252918,
"grad_norm": 1.1459057331085205,
"learning_rate": 1.3486263090352563e-05,
"loss": 0.3229,
"step": 17760
},
{
"epoch": 23.06614785992218,
"grad_norm": 1.3186362981796265,
"learning_rate": 1.3371679497641997e-05,
"loss": 0.3242,
"step": 17784
},
{
"epoch": 23.09727626459144,
"grad_norm": 0.9882354736328125,
"learning_rate": 1.3257509569156162e-05,
"loss": 0.3263,
"step": 17808
},
{
"epoch": 23.1284046692607,
"grad_norm": 1.146543264389038,
"learning_rate": 1.3143754594280266e-05,
"loss": 0.3239,
"step": 17832
},
{
"epoch": 23.15953307392996,
"grad_norm": 1.5829049348831177,
"learning_rate": 1.3030415857713246e-05,
"loss": 0.3274,
"step": 17856
},
{
"epoch": 23.19066147859922,
"grad_norm": 1.1690993309020996,
"learning_rate": 1.2917494639453171e-05,
"loss": 0.3266,
"step": 17880
},
{
"epoch": 23.22178988326848,
"grad_norm": 2.0189902782440186,
"learning_rate": 1.280499221478289e-05,
"loss": 0.3277,
"step": 17904
},
{
"epoch": 23.252918287937742,
"grad_norm": 2.8502254486083984,
"learning_rate": 1.269290985425557e-05,
"loss": 0.3309,
"step": 17928
},
{
"epoch": 23.284046692607003,
"grad_norm": 1.144399881362915,
"learning_rate": 1.2581248823680336e-05,
"loss": 0.3302,
"step": 17952
},
{
"epoch": 23.315175097276263,
"grad_norm": 1.0023480653762817,
"learning_rate": 1.2470010384108012e-05,
"loss": 0.3259,
"step": 17976
},
{
"epoch": 23.346303501945524,
"grad_norm": 1.0780220031738281,
"learning_rate": 1.2359195791816841e-05,
"loss": 0.3274,
"step": 18000
},
{
"epoch": 23.377431906614785,
"grad_norm": 1.4481017589569092,
"learning_rate": 1.2248806298298372e-05,
"loss": 0.3191,
"step": 18024
},
{
"epoch": 23.408560311284045,
"grad_norm": 0.9282727837562561,
"learning_rate": 1.2138843150243212e-05,
"loss": 0.326,
"step": 18048
},
{
"epoch": 23.439688715953306,
"grad_norm": 1.2329308986663818,
"learning_rate": 1.2029307589527062e-05,
"loss": 0.3245,
"step": 18072
},
{
"epoch": 23.470817120622566,
"grad_norm": 1.535043478012085,
"learning_rate": 1.1920200853196623e-05,
"loss": 0.3273,
"step": 18096
},
{
"epoch": 23.50194552529183,
"grad_norm": 1.5993396043777466,
"learning_rate": 1.1811524173455618e-05,
"loss": 0.3242,
"step": 18120
},
{
"epoch": 23.53307392996109,
"grad_norm": 2.646594762802124,
"learning_rate": 1.1703278777650929e-05,
"loss": 0.3323,
"step": 18144
},
{
"epoch": 23.56420233463035,
"grad_norm": 1.254061222076416,
"learning_rate": 1.1595465888258661e-05,
"loss": 0.3238,
"step": 18168
},
{
"epoch": 23.595330739299612,
"grad_norm": 1.3275645971298218,
"learning_rate": 1.1488086722870439e-05,
"loss": 0.328,
"step": 18192
},
{
"epoch": 23.626459143968873,
"grad_norm": 1.366665244102478,
"learning_rate": 1.1381142494179586e-05,
"loss": 0.3275,
"step": 18216
},
{
"epoch": 23.657587548638134,
"grad_norm": 1.2128342390060425,
"learning_rate": 1.1274634409967389e-05,
"loss": 0.3247,
"step": 18240
},
{
"epoch": 23.688715953307394,
"grad_norm": 1.168764591217041,
"learning_rate": 1.1168563673089589e-05,
"loss": 0.3239,
"step": 18264
},
{
"epoch": 23.719844357976655,
"grad_norm": 1.2446372509002686,
"learning_rate": 1.1062931481462647e-05,
"loss": 0.32,
"step": 18288
},
{
"epoch": 23.750972762645915,
"grad_norm": 1.4571527242660522,
"learning_rate": 1.095773902805033e-05,
"loss": 0.3272,
"step": 18312
},
{
"epoch": 23.782101167315176,
"grad_norm": 1.1576392650604248,
"learning_rate": 1.0852987500850148e-05,
"loss": 0.3251,
"step": 18336
},
{
"epoch": 23.813229571984436,
"grad_norm": 1.3691147565841675,
"learning_rate": 1.0748678082880049e-05,
"loss": 0.3253,
"step": 18360
},
{
"epoch": 23.844357976653697,
"grad_norm": 1.859039068222046,
"learning_rate": 1.0644811952164957e-05,
"loss": 0.3293,
"step": 18384
},
{
"epoch": 23.875486381322958,
"grad_norm": 1.2036535739898682,
"learning_rate": 1.0541390281723478e-05,
"loss": 0.3269,
"step": 18408
},
{
"epoch": 23.90661478599222,
"grad_norm": 1.459100365638733,
"learning_rate": 1.043841423955474e-05,
"loss": 0.3276,
"step": 18432
},
{
"epoch": 23.93774319066148,
"grad_norm": 1.2927861213684082,
"learning_rate": 1.0335884988625084e-05,
"loss": 0.3263,
"step": 18456
},
{
"epoch": 23.96887159533074,
"grad_norm": 1.4151058197021484,
"learning_rate": 1.0233803686855014e-05,
"loss": 0.321,
"step": 18480
},
{
"epoch": 24.0,
"grad_norm": 1.434226393699646,
"learning_rate": 1.0132171487106068e-05,
"loss": 0.3202,
"step": 18504
},
{
"epoch": 24.03112840466926,
"grad_norm": 1.2331753969192505,
"learning_rate": 1.0030989537167857e-05,
"loss": 0.3242,
"step": 18528
},
{
"epoch": 24.06225680933852,
"grad_norm": 1.6305173635482788,
"learning_rate": 9.930258979745055e-06,
"loss": 0.3221,
"step": 18552
},
{
"epoch": 24.09338521400778,
"grad_norm": 1.1515713930130005,
"learning_rate": 9.82998095244449e-06,
"loss": 0.3217,
"step": 18576
},
{
"epoch": 24.124513618677042,
"grad_norm": 1.1086283922195435,
"learning_rate": 9.730156587762335e-06,
"loss": 0.3225,
"step": 18600
},
{
"epoch": 24.155642023346303,
"grad_norm": 1.256364107131958,
"learning_rate": 9.630787013071286e-06,
"loss": 0.3218,
"step": 18624
},
{
"epoch": 24.186770428015564,
"grad_norm": 1.2893520593643188,
"learning_rate": 9.531873350607823e-06,
"loss": 0.3285,
"step": 18648
},
{
"epoch": 24.217898832684824,
"grad_norm": 1.1564453840255737,
"learning_rate": 9.433416717459592e-06,
"loss": 0.3234,
"step": 18672
},
{
"epoch": 24.249027237354085,
"grad_norm": 1.6299091577529907,
"learning_rate": 9.3354182255527e-06,
"loss": 0.3237,
"step": 18696
},
{
"epoch": 24.280155642023345,
"grad_norm": 0.9497871994972229,
"learning_rate": 9.237878981639264e-06,
"loss": 0.3226,
"step": 18720
},
{
"epoch": 24.311284046692606,
"grad_norm": 1.3882777690887451,
"learning_rate": 9.140800087284801e-06,
"loss": 0.322,
"step": 18744
},
{
"epoch": 24.342412451361866,
"grad_norm": 1.1506375074386597,
"learning_rate": 9.044182638855891e-06,
"loss": 0.3274,
"step": 18768
},
{
"epoch": 24.373540856031127,
"grad_norm": 0.8968532681465149,
"learning_rate": 8.948027727507708e-06,
"loss": 0.319,
"step": 18792
},
{
"epoch": 24.404669260700388,
"grad_norm": 1.5157815217971802,
"learning_rate": 8.852336439171733e-06,
"loss": 0.3254,
"step": 18816
},
{
"epoch": 24.43579766536965,
"grad_norm": 0.9984537959098816,
"learning_rate": 8.757109854543533e-06,
"loss": 0.3244,
"step": 18840
},
{
"epoch": 24.46692607003891,
"grad_norm": 1.8151588439941406,
"learning_rate": 8.662349049070463e-06,
"loss": 0.3198,
"step": 18864
},
{
"epoch": 24.49805447470817,
"grad_norm": 1.1167311668395996,
"learning_rate": 8.568055092939615e-06,
"loss": 0.3179,
"step": 18888
},
{
"epoch": 24.529182879377434,
"grad_norm": 1.3895347118377686,
"learning_rate": 8.474229051065657e-06,
"loss": 0.3211,
"step": 18912
},
{
"epoch": 24.560311284046694,
"grad_norm": 1.2524361610412598,
"learning_rate": 8.38087198307887e-06,
"loss": 0.32,
"step": 18936
},
{
"epoch": 24.591439688715955,
"grad_norm": 1.389087200164795,
"learning_rate": 8.287984943313114e-06,
"loss": 0.3251,
"step": 18960
},
{
"epoch": 24.622568093385215,
"grad_norm": 1.6150294542312622,
"learning_rate": 8.195568980793967e-06,
"loss": 0.3275,
"step": 18984
},
{
"epoch": 24.653696498054476,
"grad_norm": 1.6251153945922852,
"learning_rate": 8.103625139226895e-06,
"loss": 0.3225,
"step": 19008
},
{
"epoch": 24.684824902723737,
"grad_norm": 1.5373034477233887,
"learning_rate": 8.012154456985388e-06,
"loss": 0.3253,
"step": 19032
},
{
"epoch": 24.715953307392997,
"grad_norm": 0.9456262588500977,
"learning_rate": 7.921157967099336e-06,
"loss": 0.3151,
"step": 19056
},
{
"epoch": 24.747081712062258,
"grad_norm": 0.9828768372535706,
"learning_rate": 7.830636697243254e-06,
"loss": 0.3252,
"step": 19080
},
{
"epoch": 24.77821011673152,
"grad_norm": 1.8610461950302124,
"learning_rate": 7.740591669724772e-06,
"loss": 0.325,
"step": 19104
},
{
"epoch": 24.80933852140078,
"grad_norm": 1.8049260377883911,
"learning_rate": 7.651023901473032e-06,
"loss": 0.3204,
"step": 19128
},
{
"epoch": 24.84046692607004,
"grad_norm": 1.1601166725158691,
"learning_rate": 7.561934404027193e-06,
"loss": 0.3231,
"step": 19152
},
{
"epoch": 24.8715953307393,
"grad_norm": 1.2389658689498901,
"learning_rate": 7.473324183525088e-06,
"loss": 0.329,
"step": 19176
},
{
"epoch": 24.90272373540856,
"grad_norm": 1.0001511573791504,
"learning_rate": 7.385194240691751e-06,
"loss": 0.319,
"step": 19200
},
{
"epoch": 24.93385214007782,
"grad_norm": 1.7757816314697266,
"learning_rate": 7.297545570828207e-06,
"loss": 0.3267,
"step": 19224
},
{
"epoch": 24.964980544747082,
"grad_norm": 1.1014970541000366,
"learning_rate": 7.210379163800185e-06,
"loss": 0.3223,
"step": 19248
},
{
"epoch": 24.996108949416342,
"grad_norm": 1.6188836097717285,
"learning_rate": 7.123696004026947e-06,
"loss": 0.3227,
"step": 19272
},
{
"epoch": 25.027237354085603,
"grad_norm": 1.2841421365737915,
"learning_rate": 7.037497070470167e-06,
"loss": 0.32,
"step": 19296
},
{
"epoch": 25.058365758754864,
"grad_norm": 1.2222139835357666,
"learning_rate": 6.951783336622864e-06,
"loss": 0.3217,
"step": 19320
},
{
"epoch": 25.089494163424124,
"grad_norm": 1.0179907083511353,
"learning_rate": 6.866555770498473e-06,
"loss": 0.3182,
"step": 19344
},
{
"epoch": 25.120622568093385,
"grad_norm": 0.9595916271209717,
"learning_rate": 6.781815334619812e-06,
"loss": 0.3195,
"step": 19368
},
{
"epoch": 25.151750972762645,
"grad_norm": 1.2857320308685303,
"learning_rate": 6.6975629860082935e-06,
"loss": 0.3177,
"step": 19392
},
{
"epoch": 25.182879377431906,
"grad_norm": 1.7358510494232178,
"learning_rate": 6.613799676173088e-06,
"loss": 0.3208,
"step": 19416
},
{
"epoch": 25.214007782101167,
"grad_norm": 1.8369121551513672,
"learning_rate": 6.530526351100347e-06,
"loss": 0.3196,
"step": 19440
},
{
"epoch": 25.245136186770427,
"grad_norm": 2.4744224548339844,
"learning_rate": 6.447743951242591e-06,
"loss": 0.3239,
"step": 19464
},
{
"epoch": 25.276264591439688,
"grad_norm": 1.2925540208816528,
"learning_rate": 6.3654534115079936e-06,
"loss": 0.3157,
"step": 19488
},
{
"epoch": 25.30739299610895,
"grad_norm": 1.1039607524871826,
"learning_rate": 6.28365566124991e-06,
"loss": 0.3229,
"step": 19512
},
{
"epoch": 25.33852140077821,
"grad_norm": 0.8712733387947083,
"learning_rate": 6.202351624256359e-06,
"loss": 0.3181,
"step": 19536
},
{
"epoch": 25.36964980544747,
"grad_norm": 1.236718773841858,
"learning_rate": 6.1215422187395345e-06,
"loss": 0.3172,
"step": 19560
},
{
"epoch": 25.40077821011673,
"grad_norm": 1.4729557037353516,
"learning_rate": 6.041228357325529e-06,
"loss": 0.3244,
"step": 19584
},
{
"epoch": 25.43190661478599,
"grad_norm": 1.1015067100524902,
"learning_rate": 5.961410947043927e-06,
"loss": 0.3227,
"step": 19608
},
{
"epoch": 25.46303501945525,
"grad_norm": 1.4798215627670288,
"learning_rate": 5.882090889317671e-06,
"loss": 0.3208,
"step": 19632
},
{
"epoch": 25.494163424124515,
"grad_norm": 1.9315009117126465,
"learning_rate": 5.803269079952739e-06,
"loss": 0.3158,
"step": 19656
},
{
"epoch": 25.525291828793776,
"grad_norm": 1.1661323308944702,
"learning_rate": 5.724946409128179e-06,
"loss": 0.3194,
"step": 19680
},
{
"epoch": 25.556420233463037,
"grad_norm": 1.796525239944458,
"learning_rate": 5.647123761385975e-06,
"loss": 0.3236,
"step": 19704
},
{
"epoch": 25.587548638132297,
"grad_norm": 1.251969814300537,
"learning_rate": 5.569802015621039e-06,
"loss": 0.3228,
"step": 19728
},
{
"epoch": 25.618677042801558,
"grad_norm": 1.9998018741607666,
"learning_rate": 5.492982045071355e-06,
"loss": 0.3248,
"step": 19752
},
{
"epoch": 25.64980544747082,
"grad_norm": 1.0044583082199097,
"learning_rate": 5.4166647173080345e-06,
"loss": 0.3246,
"step": 19776
},
{
"epoch": 25.68093385214008,
"grad_norm": 1.0275497436523438,
"learning_rate": 5.340850894225607e-06,
"loss": 0.3253,
"step": 19800
},
{
"epoch": 25.71206225680934,
"grad_norm": 1.0156971216201782,
"learning_rate": 5.265541432032212e-06,
"loss": 0.3171,
"step": 19824
},
{
"epoch": 25.7431906614786,
"grad_norm": 1.4596341848373413,
"learning_rate": 5.190737181239941e-06,
"loss": 0.3212,
"step": 19848
},
{
"epoch": 25.77431906614786,
"grad_norm": 1.2357956171035767,
"learning_rate": 5.116438986655303e-06,
"loss": 0.3268,
"step": 19872
},
{
"epoch": 25.80544747081712,
"grad_norm": 1.335877537727356,
"learning_rate": 5.042647687369573e-06,
"loss": 0.3218,
"step": 19896
},
{
"epoch": 25.836575875486382,
"grad_norm": 1.5729907751083374,
"learning_rate": 4.969364116749414e-06,
"loss": 0.3205,
"step": 19920
},
{
"epoch": 25.867704280155642,
"grad_norm": 1.5255457162857056,
"learning_rate": 4.89658910242739e-06,
"loss": 0.3165,
"step": 19944
},
{
"epoch": 25.898832684824903,
"grad_norm": 1.195453405380249,
"learning_rate": 4.8243234662926905e-06,
"loss": 0.323,
"step": 19968
},
{
"epoch": 25.929961089494164,
"grad_norm": 1.1830676794052124,
"learning_rate": 4.75256802448178e-06,
"loss": 0.3173,
"step": 19992
},
{
"epoch": 25.961089494163424,
"grad_norm": 0.9383173584938049,
"learning_rate": 4.681323587369213e-06,
"loss": 0.3159,
"step": 20016
},
{
"epoch": 25.992217898832685,
"grad_norm": 1.3204113245010376,
"learning_rate": 4.610590959558497e-06,
"loss": 0.3217,
"step": 20040
},
{
"epoch": 26.023346303501945,
"grad_norm": 1.1940529346466064,
"learning_rate": 4.540370939872974e-06,
"loss": 0.3188,
"step": 20064
},
{
"epoch": 26.054474708171206,
"grad_norm": 1.7250840663909912,
"learning_rate": 4.470664321346829e-06,
"loss": 0.3192,
"step": 20088
},
{
"epoch": 26.085603112840467,
"grad_norm": 0.9612188339233398,
"learning_rate": 4.401471891216114e-06,
"loss": 0.3183,
"step": 20112
},
{
"epoch": 26.116731517509727,
"grad_norm": 1.175308108329773,
"learning_rate": 4.332794430909854e-06,
"loss": 0.3162,
"step": 20136
},
{
"epoch": 26.147859922178988,
"grad_norm": 1.3628140687942505,
"learning_rate": 4.264632716041234e-06,
"loss": 0.3173,
"step": 20160
},
{
"epoch": 26.17898832684825,
"grad_norm": 0.9504318237304688,
"learning_rate": 4.196987516398831e-06,
"loss": 0.3259,
"step": 20184
},
{
"epoch": 26.21011673151751,
"grad_norm": 1.6836086511611938,
"learning_rate": 4.129859595937946e-06,
"loss": 0.3188,
"step": 20208
},
{
"epoch": 26.24124513618677,
"grad_norm": 1.2717008590698242,
"learning_rate": 4.063249712771922e-06,
"loss": 0.321,
"step": 20232
},
{
"epoch": 26.27237354085603,
"grad_norm": 1.989966869354248,
"learning_rate": 3.997158619163644e-06,
"loss": 0.3215,
"step": 20256
},
{
"epoch": 26.30350194552529,
"grad_norm": 1.1739614009857178,
"learning_rate": 3.931587061517011e-06,
"loss": 0.3193,
"step": 20280
},
{
"epoch": 26.33463035019455,
"grad_norm": 1.1167713403701782,
"learning_rate": 3.8665357803685025e-06,
"loss": 0.3174,
"step": 20304
},
{
"epoch": 26.365758754863812,
"grad_norm": 1.379565715789795,
"learning_rate": 3.8020055103788144e-06,
"loss": 0.3218,
"step": 20328
},
{
"epoch": 26.396887159533073,
"grad_norm": 1.4840023517608643,
"learning_rate": 3.7379969803245763e-06,
"loss": 0.3213,
"step": 20352
},
{
"epoch": 26.428015564202333,
"grad_norm": 1.1443723440170288,
"learning_rate": 3.6745109130901288e-06,
"loss": 0.3141,
"step": 20376
},
{
"epoch": 26.459143968871594,
"grad_norm": 1.090888500213623,
"learning_rate": 3.6115480256593394e-06,
"loss": 0.3212,
"step": 20400
},
{
"epoch": 26.490272373540854,
"grad_norm": 1.472679615020752,
"learning_rate": 3.5491090291075004e-06,
"loss": 0.3151,
"step": 20424
},
{
"epoch": 26.52140077821012,
"grad_norm": 0.9774566292762756,
"learning_rate": 3.487194628593332e-06,
"loss": 0.3214,
"step": 20448
},
{
"epoch": 26.55252918287938,
"grad_norm": 2.1687231063842773,
"learning_rate": 3.4258055233509665e-06,
"loss": 0.324,
"step": 20472
},
{
"epoch": 26.58365758754864,
"grad_norm": 1.2352170944213867,
"learning_rate": 3.364942406682109e-06,
"loss": 0.3101,
"step": 20496
},
{
"epoch": 26.6147859922179,
"grad_norm": 2.996083974838257,
"learning_rate": 3.304605965948149e-06,
"loss": 0.3141,
"step": 20520
},
{
"epoch": 26.64591439688716,
"grad_norm": 1.5926743745803833,
"learning_rate": 3.244796882562462e-06,
"loss": 0.3229,
"step": 20544
},
{
"epoch": 26.67704280155642,
"grad_norm": 1.1748905181884766,
"learning_rate": 3.1855158319826774e-06,
"loss": 0.3213,
"step": 20568
},
{
"epoch": 26.708171206225682,
"grad_norm": 1.1093063354492188,
"learning_rate": 3.126763483703016e-06,
"loss": 0.3178,
"step": 20592
},
{
"epoch": 26.739299610894943,
"grad_norm": 1.1090799570083618,
"learning_rate": 3.0685405012468137e-06,
"loss": 0.3198,
"step": 20616
},
{
"epoch": 26.770428015564203,
"grad_norm": 1.0905050039291382,
"learning_rate": 3.010847542158951e-06,
"loss": 0.3192,
"step": 20640
},
{
"epoch": 26.801556420233464,
"grad_norm": 1.8493279218673706,
"learning_rate": 2.953685257998451e-06,
"loss": 0.3204,
"step": 20664
},
{
"epoch": 26.832684824902724,
"grad_norm": 1.2924058437347412,
"learning_rate": 2.8970542943311583e-06,
"loss": 0.3261,
"step": 20688
},
{
"epoch": 26.863813229571985,
"grad_norm": 0.9771651029586792,
"learning_rate": 2.8409552907223804e-06,
"loss": 0.3132,
"step": 20712
},
{
"epoch": 26.894941634241246,
"grad_norm": 1.0269138813018799,
"learning_rate": 2.785388880729739e-06,
"loss": 0.3199,
"step": 20736
},
{
"epoch": 26.926070038910506,
"grad_norm": 1.309114933013916,
"learning_rate": 2.7303556918959305e-06,
"loss": 0.3145,
"step": 20760
},
{
"epoch": 26.957198443579767,
"grad_norm": 1.0709702968597412,
"learning_rate": 2.6758563457417286e-06,
"loss": 0.3192,
"step": 20784
},
{
"epoch": 26.988326848249027,
"grad_norm": 1.4049859046936035,
"learning_rate": 2.621891457758896e-06,
"loss": 0.3206,
"step": 20808
},
{
"epoch": 27.019455252918288,
"grad_norm": 1.3224713802337646,
"learning_rate": 2.568461637403252e-06,
"loss": 0.312,
"step": 20832
},
{
"epoch": 27.05058365758755,
"grad_norm": 1.3082164525985718,
"learning_rate": 2.5155674880878334e-06,
"loss": 0.3108,
"step": 20856
},
{
"epoch": 27.08171206225681,
"grad_norm": 0.991944432258606,
"learning_rate": 2.4632096071759925e-06,
"loss": 0.3188,
"step": 20880
},
{
"epoch": 27.11284046692607,
"grad_norm": 1.2203731536865234,
"learning_rate": 2.4113885859747497e-06,
"loss": 0.3108,
"step": 20904
},
{
"epoch": 27.14396887159533,
"grad_norm": 1.203995704650879,
"learning_rate": 2.360105009728025e-06,
"loss": 0.3102,
"step": 20928
},
{
"epoch": 27.17509727626459,
"grad_norm": 1.6264797449111938,
"learning_rate": 2.3093594576101107e-06,
"loss": 0.3174,
"step": 20952
},
{
"epoch": 27.20622568093385,
"grad_norm": 1.3530755043029785,
"learning_rate": 2.2591525027190473e-06,
"loss": 0.3252,
"step": 20976
},
{
"epoch": 27.237354085603112,
"grad_norm": 2.048307418823242,
"learning_rate": 2.20948471207022e-06,
"loss": 0.3184,
"step": 21000
},
{
"epoch": 27.268482490272373,
"grad_norm": 1.320873737335205,
"learning_rate": 2.160356646589934e-06,
"loss": 0.3191,
"step": 21024
},
{
"epoch": 27.299610894941633,
"grad_norm": 1.1831213235855103,
"learning_rate": 2.111768861109048e-06,
"loss": 0.3183,
"step": 21048
},
{
"epoch": 27.330739299610894,
"grad_norm": 1.0811506509780884,
"learning_rate": 2.0637219043567636e-06,
"loss": 0.3177,
"step": 21072
},
{
"epoch": 27.361867704280154,
"grad_norm": 1.1472513675689697,
"learning_rate": 2.0162163189543838e-06,
"loss": 0.3171,
"step": 21096
},
{
"epoch": 27.392996108949415,
"grad_norm": 1.6906425952911377,
"learning_rate": 1.9692526414092084e-06,
"loss": 0.3223,
"step": 21120
},
{
"epoch": 27.424124513618676,
"grad_norm": 1.600865364074707,
"learning_rate": 1.9228314021084548e-06,
"loss": 0.3151,
"step": 21144
},
{
"epoch": 27.455252918287936,
"grad_norm": 1.7052664756774902,
"learning_rate": 1.8769531253132854e-06,
"loss": 0.3172,
"step": 21168
},
{
"epoch": 27.486381322957197,
"grad_norm": 1.2754665613174438,
"learning_rate": 1.83161832915289e-06,
"loss": 0.3181,
"step": 21192
},
{
"epoch": 27.51750972762646,
"grad_norm": 0.9670736193656921,
"learning_rate": 1.7868275256186174e-06,
"loss": 0.3209,
"step": 21216
},
{
"epoch": 27.54863813229572,
"grad_norm": 1.7570668458938599,
"learning_rate": 1.7425812205582147e-06,
"loss": 0.3151,
"step": 21240
},
{
"epoch": 27.579766536964982,
"grad_norm": 1.1468702554702759,
"learning_rate": 1.6988799136700706e-06,
"loss": 0.32,
"step": 21264
},
{
"epoch": 27.610894941634243,
"grad_norm": 1.837241768836975,
"learning_rate": 1.6557240984976408e-06,
"loss": 0.3176,
"step": 21288
},
{
"epoch": 27.642023346303503,
"grad_norm": 1.050024151802063,
"learning_rate": 1.613114262423815e-06,
"loss": 0.3169,
"step": 21312
},
{
"epoch": 27.673151750972764,
"grad_norm": 1.0731110572814941,
"learning_rate": 1.5710508866654261e-06,
"loss": 0.3204,
"step": 21336
},
{
"epoch": 27.704280155642024,
"grad_norm": 1.2539221048355103,
"learning_rate": 1.5295344462678495e-06,
"loss": 0.3168,
"step": 21360
},
{
"epoch": 27.735408560311285,
"grad_norm": 1.4090372323989868,
"learning_rate": 1.488565410099585e-06,
"loss": 0.3164,
"step": 21384
},
{
"epoch": 27.766536964980546,
"grad_norm": 1.5965330600738525,
"learning_rate": 1.4481442408470047e-06,
"loss": 0.3216,
"step": 21408
},
{
"epoch": 27.797665369649806,
"grad_norm": 1.1138761043548584,
"learning_rate": 1.4082713950091198e-06,
"loss": 0.3206,
"step": 21432
},
{
"epoch": 27.828793774319067,
"grad_norm": 1.1677641868591309,
"learning_rate": 1.3689473228923944e-06,
"loss": 0.3241,
"step": 21456
},
{
"epoch": 27.859922178988327,
"grad_norm": 2.1310067176818848,
"learning_rate": 1.3301724686056894e-06,
"loss": 0.3187,
"step": 21480
},
{
"epoch": 27.891050583657588,
"grad_norm": 1.3181018829345703,
"learning_rate": 1.2919472700552382e-06,
"loss": 0.3164,
"step": 21504
},
{
"epoch": 27.92217898832685,
"grad_norm": 1.476120114326477,
"learning_rate": 1.2542721589397234e-06,
"loss": 0.3184,
"step": 21528
},
{
"epoch": 27.95330739299611,
"grad_norm": 1.1621023416519165,
"learning_rate": 1.217147560745352e-06,
"loss": 0.319,
"step": 21552
},
{
"epoch": 27.98443579766537,
"grad_norm": 1.1426842212677002,
"learning_rate": 1.1805738947410938e-06,
"loss": 0.3155,
"step": 21576
},
{
"epoch": 28.01556420233463,
"grad_norm": 2.4093399047851562,
"learning_rate": 1.1445515739739399e-06,
"loss": 0.3135,
"step": 21600
},
{
"epoch": 28.04669260700389,
"grad_norm": 1.5340672731399536,
"learning_rate": 1.1090810052642064e-06,
"loss": 0.3181,
"step": 21624
},
{
"epoch": 28.07782101167315,
"grad_norm": 1.0847253799438477,
"learning_rate": 1.0741625892009833e-06,
"loss": 0.3165,
"step": 21648
},
{
"epoch": 28.108949416342412,
"grad_norm": 1.3261409997940063,
"learning_rate": 1.0397967201375814e-06,
"loss": 0.3204,
"step": 21672
},
{
"epoch": 28.140077821011673,
"grad_norm": 1.0757031440734863,
"learning_rate": 1.0059837861870812e-06,
"loss": 0.3187,
"step": 21696
},
{
"epoch": 28.171206225680933,
"grad_norm": 1.2534974813461304,
"learning_rate": 9.727241692179756e-07,
"loss": 0.3096,
"step": 21720
},
{
"epoch": 28.202334630350194,
"grad_norm": 1.2287142276763916,
"learning_rate": 9.400182448498163e-07,
"loss": 0.3169,
"step": 21744
},
{
"epoch": 28.233463035019454,
"grad_norm": 0.9463332891464233,
"learning_rate": 9.078663824490131e-07,
"loss": 0.3185,
"step": 21768
},
{
"epoch": 28.264591439688715,
"grad_norm": 2.7430317401885986,
"learning_rate": 8.762689451246198e-07,
"loss": 0.3178,
"step": 21792
},
{
"epoch": 28.295719844357976,
"grad_norm": 1.1905908584594727,
"learning_rate": 8.452262897242768e-07,
"loss": 0.3197,
"step": 21816
},
{
"epoch": 28.326848249027236,
"grad_norm": 0.894260823726654,
"learning_rate": 8.147387668301421e-07,
"loss": 0.3201,
"step": 21840
},
{
"epoch": 28.357976653696497,
"grad_norm": 1.122759222984314,
"learning_rate": 7.848067207549603e-07,
"loss": 0.3102,
"step": 21864
},
{
"epoch": 28.389105058365757,
"grad_norm": 1.454839825630188,
"learning_rate": 7.554304895381781e-07,
"loss": 0.3156,
"step": 21888
},
{
"epoch": 28.420233463035018,
"grad_norm": 1.348819613456726,
"learning_rate": 7.266104049420797e-07,
"loss": 0.3173,
"step": 21912
},
{
"epoch": 28.45136186770428,
"grad_norm": 1.397900104522705,
"learning_rate": 6.983467924480957e-07,
"loss": 0.3206,
"step": 21936
},
{
"epoch": 28.48249027237354,
"grad_norm": 2.4935896396636963,
"learning_rate": 6.706399712531009e-07,
"loss": 0.3227,
"step": 21960
},
{
"epoch": 28.5136186770428,
"grad_norm": 1.3364354372024536,
"learning_rate": 6.434902542658106e-07,
"loss": 0.3143,
"step": 21984
},
{
"epoch": 28.544747081712064,
"grad_norm": 1.0415703058242798,
"learning_rate": 6.168979481032455e-07,
"loss": 0.3204,
"step": 22008
},
{
"epoch": 28.575875486381324,
"grad_norm": 1.0268234014511108,
"learning_rate": 5.908633530872732e-07,
"loss": 0.3163,
"step": 22032
},
{
"epoch": 28.607003891050585,
"grad_norm": 1.0088456869125366,
"learning_rate": 5.653867632412269e-07,
"loss": 0.3118,
"step": 22056
},
{
"epoch": 28.638132295719846,
"grad_norm": 1.52815842628479,
"learning_rate": 5.404684662865589e-07,
"loss": 0.3166,
"step": 22080
},
{
"epoch": 28.669260700389106,
"grad_norm": 1.0740587711334229,
"learning_rate": 5.161087436396095e-07,
"loss": 0.3157,
"step": 22104
},
{
"epoch": 28.700389105058367,
"grad_norm": 1.263934850692749,
"learning_rate": 4.923078704084372e-07,
"loss": 0.3169,
"step": 22128
},
{
"epoch": 28.731517509727627,
"grad_norm": 1.1837375164031982,
"learning_rate": 4.690661153896825e-07,
"loss": 0.3177,
"step": 22152
},
{
"epoch": 28.762645914396888,
"grad_norm": 1.1407973766326904,
"learning_rate": 4.463837410655536e-07,
"loss": 0.3161,
"step": 22176
},
{
"epoch": 28.79377431906615,
"grad_norm": 1.019492268562317,
"learning_rate": 4.242610036008676e-07,
"loss": 0.3135,
"step": 22200
},
{
"epoch": 28.82490272373541,
"grad_norm": 1.7875498533248901,
"learning_rate": 4.026981528401419e-07,
"loss": 0.3213,
"step": 22224
},
{
"epoch": 28.85603112840467,
"grad_norm": 0.9684593677520752,
"learning_rate": 3.8169543230477387e-07,
"loss": 0.3151,
"step": 22248
},
{
"epoch": 28.88715953307393,
"grad_norm": 1.086421012878418,
"learning_rate": 3.612530791903046e-07,
"loss": 0.3172,
"step": 22272
},
{
"epoch": 28.91828793774319,
"grad_norm": 1.9420697689056396,
"learning_rate": 3.4137132436372064e-07,
"loss": 0.3181,
"step": 22296
},
{
"epoch": 28.94941634241245,
"grad_norm": 1.217786192893982,
"learning_rate": 3.2205039236086197e-07,
"loss": 0.3151,
"step": 22320
},
{
"epoch": 28.980544747081712,
"grad_norm": 1.1275442838668823,
"learning_rate": 3.0329050138388494e-07,
"loss": 0.3193,
"step": 22344
},
{
"epoch": 29.011673151750973,
"grad_norm": 0.9701781272888184,
"learning_rate": 2.850918632987809e-07,
"loss": 0.316,
"step": 22368
},
{
"epoch": 29.042801556420233,
"grad_norm": 1.0859931707382202,
"learning_rate": 2.674546836330172e-07,
"loss": 0.3169,
"step": 22392
},
{
"epoch": 29.073929961089494,
"grad_norm": 0.9976264834403992,
"learning_rate": 2.503791615731721e-07,
"loss": 0.3172,
"step": 22416
},
{
"epoch": 29.105058365758754,
"grad_norm": 2.1112818717956543,
"learning_rate": 2.3386548996272572e-07,
"loss": 0.3202,
"step": 22440
},
{
"epoch": 29.136186770428015,
"grad_norm": 1.3070718050003052,
"learning_rate": 2.1791385529986163e-07,
"loss": 0.3163,
"step": 22464
},
{
"epoch": 29.167315175097276,
"grad_norm": 1.5637389421463013,
"learning_rate": 2.02524437735363e-07,
"loss": 0.3183,
"step": 22488
},
{
"epoch": 29.198443579766536,
"grad_norm": 1.19569730758667,
"learning_rate": 1.876974110705698e-07,
"loss": 0.3176,
"step": 22512
},
{
"epoch": 29.229571984435797,
"grad_norm": 2.7948904037475586,
"learning_rate": 1.7343294275543599e-07,
"loss": 0.3181,
"step": 22536
},
{
"epoch": 29.260700389105057,
"grad_norm": 2.1853528022766113,
"learning_rate": 1.597311938866308e-07,
"loss": 0.3144,
"step": 22560
},
{
"epoch": 29.291828793774318,
"grad_norm": 1.4694305658340454,
"learning_rate": 1.4659231920571282e-07,
"loss": 0.318,
"step": 22584
},
{
"epoch": 29.32295719844358,
"grad_norm": 1.037607192993164,
"learning_rate": 1.3401646709736983e-07,
"loss": 0.3142,
"step": 22608
},
{
"epoch": 29.35408560311284,
"grad_norm": 0.9353266358375549,
"learning_rate": 1.2200377958778708e-07,
"loss": 0.3133,
"step": 22632
},
{
"epoch": 29.3852140077821,
"grad_norm": 1.4458966255187988,
"learning_rate": 1.1055439234299858e-07,
"loss": 0.3164,
"step": 22656
},
{
"epoch": 29.41634241245136,
"grad_norm": 0.9110085368156433,
"learning_rate": 9.966843466736597e-08,
"loss": 0.3157,
"step": 22680
},
{
"epoch": 29.44747081712062,
"grad_norm": 1.0257847309112549,
"learning_rate": 8.934602950213533e-08,
"loss": 0.319,
"step": 22704
},
{
"epoch": 29.47859922178988,
"grad_norm": 1.2331140041351318,
"learning_rate": 7.958729342403826e-08,
"loss": 0.3177,
"step": 22728
},
{
"epoch": 29.509727626459146,
"grad_norm": 2.199601650238037,
"learning_rate": 7.039233664396516e-08,
"loss": 0.3164,
"step": 22752
},
{
"epoch": 29.540856031128406,
"grad_norm": 1.1412527561187744,
"learning_rate": 6.176126300573848e-08,
"loss": 0.3127,
"step": 22776
},
{
"epoch": 29.571984435797667,
"grad_norm": 1.556688904762268,
"learning_rate": 5.369416998492471e-08,
"loss": 0.3181,
"step": 22800
},
{
"epoch": 29.603112840466927,
"grad_norm": 1.2471084594726562,
"learning_rate": 4.619114868774643e-08,
"loss": 0.3152,
"step": 22824
},
{
"epoch": 29.634241245136188,
"grad_norm": 1.3103766441345215,
"learning_rate": 3.92522838500331e-08,
"loss": 0.3171,
"step": 22848
},
{
"epoch": 29.66536964980545,
"grad_norm": 1.0881154537200928,
"learning_rate": 3.2877653836299594e-08,
"loss": 0.3162,
"step": 22872
},
{
"epoch": 29.69649805447471,
"grad_norm": 0.981332004070282,
"learning_rate": 2.7067330638824718e-08,
"loss": 0.3152,
"step": 22896
},
{
"epoch": 29.72762645914397,
"grad_norm": 2.1748950481414795,
"learning_rate": 2.1821379876851845e-08,
"loss": 0.3138,
"step": 22920
},
{
"epoch": 29.75875486381323,
"grad_norm": 1.0983901023864746,
"learning_rate": 1.7139860795861717e-08,
"loss": 0.3194,
"step": 22944
},
{
"epoch": 29.78988326848249,
"grad_norm": 0.9180955290794373,
"learning_rate": 1.3022826266873012e-08,
"loss": 0.3155,
"step": 22968
},
{
"epoch": 29.82101167315175,
"grad_norm": 4.426241397857666,
"learning_rate": 9.470322785881668e-09,
"loss": 0.3176,
"step": 22992
},
{
"epoch": 29.852140077821012,
"grad_norm": 1.521730661392212,
"learning_rate": 6.482390473294686e-09,
"loss": 0.3179,
"step": 23016
},
{
"epoch": 29.883268482490273,
"grad_norm": 1.1130119562149048,
"learning_rate": 4.059063073524882e-09,
"loss": 0.3199,
"step": 23040
},
{
"epoch": 29.914396887159533,
"grad_norm": 1.0622695684432983,
"learning_rate": 2.2003679545690158e-09,
"loss": 0.3167,
"step": 23064
},
{
"epoch": 29.945525291828794,
"grad_norm": 1.495850920677185,
"learning_rate": 9.063261077080221e-10,
"loss": 0.3201,
"step": 23088
},
{
"epoch": 29.976653696498055,
"grad_norm": 1.2298061847686768,
"learning_rate": 1.7695214729607224e-10,
"loss": 0.3134,
"step": 23112
},
{
"epoch": 30.0,
"step": 23130,
"total_flos": 9.11148472281858e+17,
"train_loss": 0.3991138265909079,
"train_runtime": 54856.7027,
"train_samples_per_second": 107.912,
"train_steps_per_second": 0.422
}
],
"logging_steps": 24,
"max_steps": 23130,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 1157,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.11148472281858e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}