{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.993997599039616, "eval_steps": 500, "global_step": 1248, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024009603841536616, "grad_norm": 20.687761793962967, "learning_rate": 8.000000000000001e-07, "loss": 3.041, "step": 10 }, { "epoch": 0.04801920768307323, "grad_norm": 9.794234746287575, "learning_rate": 1.6000000000000001e-06, "loss": 2.6099, "step": 20 }, { "epoch": 0.07202881152460984, "grad_norm": 6.5777988620330365, "learning_rate": 2.4000000000000003e-06, "loss": 2.3365, "step": 30 }, { "epoch": 0.09603841536614646, "grad_norm": 6.29761081804441, "learning_rate": 3.2000000000000003e-06, "loss": 2.2801, "step": 40 }, { "epoch": 0.12004801920768307, "grad_norm": 5.520168810289388, "learning_rate": 4.000000000000001e-06, "loss": 2.1745, "step": 50 }, { "epoch": 0.14405762304921968, "grad_norm": 5.123098743767706, "learning_rate": 4.800000000000001e-06, "loss": 2.1269, "step": 60 }, { "epoch": 0.16806722689075632, "grad_norm": 5.422361829855033, "learning_rate": 5.600000000000001e-06, "loss": 2.2153, "step": 70 }, { "epoch": 0.19207683073229292, "grad_norm": 5.3572698260571885, "learning_rate": 6.4000000000000006e-06, "loss": 2.1211, "step": 80 }, { "epoch": 0.21608643457382953, "grad_norm": 5.047733772943945, "learning_rate": 7.2000000000000005e-06, "loss": 2.1216, "step": 90 }, { "epoch": 0.24009603841536614, "grad_norm": 4.718768637017437, "learning_rate": 8.000000000000001e-06, "loss": 2.1262, "step": 100 }, { "epoch": 0.26410564225690275, "grad_norm": 4.919713507188908, "learning_rate": 8.8e-06, "loss": 2.2468, "step": 110 }, { "epoch": 0.28811524609843936, "grad_norm": 4.677146498377886, "learning_rate": 9.600000000000001e-06, "loss": 2.1325, "step": 120 }, { "epoch": 0.31212484993997597, "grad_norm": 5.466016358856625, "learning_rate": 9.999510882536288e-06, "loss": 2.1459, "step": 130 }, { "epoch": 0.33613445378151263, "grad_norm": 4.440556392680108, "learning_rate": 9.995598516974005e-06, "loss": 2.2798, "step": 140 }, { "epoch": 0.36014405762304924, "grad_norm": 4.004727778029605, "learning_rate": 9.987776847469797e-06, "loss": 2.1991, "step": 150 }, { "epoch": 0.38415366146458585, "grad_norm": 4.543742816180648, "learning_rate": 9.976051994868506e-06, "loss": 2.337, "step": 160 }, { "epoch": 0.40816326530612246, "grad_norm": 4.205969855939202, "learning_rate": 9.960433134449601e-06, "loss": 2.1878, "step": 170 }, { "epoch": 0.43217286914765907, "grad_norm": 4.05967628095397, "learning_rate": 9.940932488747054e-06, "loss": 2.1711, "step": 180 }, { "epoch": 0.4561824729891957, "grad_norm": 3.851467643989681, "learning_rate": 9.917565317984614e-06, "loss": 2.2059, "step": 190 }, { "epoch": 0.4801920768307323, "grad_norm": 3.6455587183088203, "learning_rate": 9.890349908133914e-06, "loss": 2.262, "step": 200 }, { "epoch": 0.5042016806722689, "grad_norm": 3.8411593313415393, "learning_rate": 9.859307556604794e-06, "loss": 2.1945, "step": 210 }, { "epoch": 0.5282112845138055, "grad_norm": 4.236728754680901, "learning_rate": 9.824462555579019e-06, "loss": 2.2873, "step": 220 }, { "epoch": 0.5522208883553421, "grad_norm": 4.002728659906048, "learning_rate": 9.785842173000439e-06, "loss": 2.1592, "step": 230 }, { "epoch": 0.5762304921968787, "grad_norm": 3.9915206002259698, "learning_rate": 9.743476631236473e-06, "loss": 2.1277, "step": 240 }, { "epoch": 0.6002400960384153, "grad_norm": 3.7416035754534174, "learning_rate": 9.697399083427602e-06, "loss": 2.089, "step": 250 }, { "epoch": 0.6242496998799519, "grad_norm": 3.7596118293233873, "learning_rate": 9.647645587543391e-06, "loss": 2.0995, "step": 260 }, { "epoch": 0.6482593037214885, "grad_norm": 4.099393566721857, "learning_rate": 9.594255078165338e-06, "loss": 2.2086, "step": 270 }, { "epoch": 0.6722689075630253, "grad_norm": 3.75884213911114, "learning_rate": 9.537269336018627e-06, "loss": 2.0663, "step": 280 }, { "epoch": 0.6962785114045619, "grad_norm": 3.9076832551024006, "learning_rate": 9.476732955276637e-06, "loss": 2.2313, "step": 290 }, { "epoch": 0.7202881152460985, "grad_norm": 3.999571507578816, "learning_rate": 9.412693308663793e-06, "loss": 2.1956, "step": 300 }, { "epoch": 0.7442977190876351, "grad_norm": 3.673914733393223, "learning_rate": 9.345200510384044e-06, "loss": 2.1372, "step": 310 }, { "epoch": 0.7683073229291717, "grad_norm": 4.065856280538105, "learning_rate": 9.274307376904023e-06, "loss": 2.1247, "step": 320 }, { "epoch": 0.7923169267707083, "grad_norm": 3.437535576857318, "learning_rate": 9.200069385621528e-06, "loss": 2.1668, "step": 330 }, { "epoch": 0.8163265306122449, "grad_norm": 3.8590752772047447, "learning_rate": 9.122544631451703e-06, "loss": 2.2321, "step": 340 }, { "epoch": 0.8403361344537815, "grad_norm": 3.5809836445656886, "learning_rate": 9.041793781364898e-06, "loss": 2.0716, "step": 350 }, { "epoch": 0.8643457382953181, "grad_norm": 3.641632940529625, "learning_rate": 8.957880026911727e-06, "loss": 2.2342, "step": 360 }, { "epoch": 0.8883553421368547, "grad_norm": 3.688771072520596, "learning_rate": 8.870869034772563e-06, "loss": 2.208, "step": 370 }, { "epoch": 0.9123649459783914, "grad_norm": 4.146934930912289, "learning_rate": 8.78082889537008e-06, "loss": 2.1074, "step": 380 }, { "epoch": 0.936374549819928, "grad_norm": 3.334175185522678, "learning_rate": 8.687830069585138e-06, "loss": 2.0091, "step": 390 }, { "epoch": 0.9603841536614646, "grad_norm": 3.830656059837796, "learning_rate": 8.591945333617622e-06, "loss": 2.248, "step": 400 }, { "epoch": 0.9843937575030012, "grad_norm": 3.9829216438871216, "learning_rate": 8.493249722035464e-06, "loss": 2.2053, "step": 410 }, { "epoch": 1.007202881152461, "grad_norm": 3.9394668655626304, "learning_rate": 8.391820469056371e-06, "loss": 1.9717, "step": 420 }, { "epoch": 1.0312124849939976, "grad_norm": 4.550842560189379, "learning_rate": 8.287736948108197e-06, "loss": 1.413, "step": 430 }, { "epoch": 1.0552220888355341, "grad_norm": 4.356821890648689, "learning_rate": 8.181080609715309e-06, "loss": 1.3939, "step": 440 }, { "epoch": 1.0792316926770709, "grad_norm": 4.518076349604797, "learning_rate": 8.071934917759502e-06, "loss": 1.39, "step": 450 }, { "epoch": 1.1032412965186074, "grad_norm": 3.8536102257477878, "learning_rate": 7.960385284165364e-06, "loss": 1.4119, "step": 460 }, { "epoch": 1.127250900360144, "grad_norm": 4.701240892617297, "learning_rate": 7.846519002061208e-06, "loss": 1.341, "step": 470 }, { "epoch": 1.1512605042016806, "grad_norm": 3.896277166698315, "learning_rate": 7.730425177467854e-06, "loss": 1.4495, "step": 480 }, { "epoch": 1.1752701080432173, "grad_norm": 5.214676571280148, "learning_rate": 7.612194659568755e-06, "loss": 1.3667, "step": 490 }, { "epoch": 1.199279711884754, "grad_norm": 4.289971399955271, "learning_rate": 7.491919969615993e-06, "loss": 1.3402, "step": 500 }, { "epoch": 1.2232893157262905, "grad_norm": 4.317059354915328, "learning_rate": 7.369695228527796e-06, "loss": 1.3434, "step": 510 }, { "epoch": 1.247298919567827, "grad_norm": 3.939366941478397, "learning_rate": 7.245616083234266e-06, "loss": 1.392, "step": 520 }, { "epoch": 1.2713085234093637, "grad_norm": 3.785565139610707, "learning_rate": 7.119779631828882e-06, "loss": 1.3443, "step": 530 }, { "epoch": 1.2953181272509005, "grad_norm": 3.5905115574582815, "learning_rate": 6.992284347584438e-06, "loss": 1.3996, "step": 540 }, { "epoch": 1.319327731092437, "grad_norm": 4.219125670543618, "learning_rate": 6.8632300018928046e-06, "loss": 1.3255, "step": 550 }, { "epoch": 1.3433373349339737, "grad_norm": 4.213563188046468, "learning_rate": 6.732717586188866e-06, "loss": 1.3917, "step": 560 }, { "epoch": 1.3673469387755102, "grad_norm": 4.010244839439909, "learning_rate": 6.600849232919707e-06, "loss": 1.4168, "step": 570 }, { "epoch": 1.3913565426170469, "grad_norm": 3.929564306152493, "learning_rate": 6.467728135620892e-06, "loss": 1.2492, "step": 580 }, { "epoch": 1.4153661464585834, "grad_norm": 3.9942636003807133, "learning_rate": 6.333458468162415e-06, "loss": 1.3267, "step": 590 }, { "epoch": 1.43937575030012, "grad_norm": 4.943277008699724, "learning_rate": 6.198145303227456e-06, "loss": 1.3832, "step": 600 }, { "epoch": 1.4633853541416566, "grad_norm": 4.255100090454613, "learning_rate": 6.0618945300877964e-06, "loss": 1.4676, "step": 610 }, { "epoch": 1.4873949579831933, "grad_norm": 4.176877482554283, "learning_rate": 5.924812771740201e-06, "loss": 1.3791, "step": 620 }, { "epoch": 1.51140456182473, "grad_norm": 4.58206991985806, "learning_rate": 5.787007301468637e-06, "loss": 1.3183, "step": 630 }, { "epoch": 1.5354141656662665, "grad_norm": 4.095301189501832, "learning_rate": 5.648585958897585e-06, "loss": 1.3407, "step": 640 }, { "epoch": 1.559423769507803, "grad_norm": 3.850464869839509, "learning_rate": 5.509657065602197e-06, "loss": 1.3666, "step": 650 }, { "epoch": 1.5834333733493398, "grad_norm": 4.680569296847418, "learning_rate": 5.370329340341261e-06, "loss": 1.3968, "step": 660 }, { "epoch": 1.6074429771908765, "grad_norm": 4.426880734817505, "learning_rate": 5.2307118139794015e-06, "loss": 1.3658, "step": 670 }, { "epoch": 1.631452581032413, "grad_norm": 4.008404443022427, "learning_rate": 5.090913744164987e-06, "loss": 1.2817, "step": 680 }, { "epoch": 1.6554621848739495, "grad_norm": 4.452596800536699, "learning_rate": 4.951044529830603e-06, "loss": 1.3149, "step": 690 }, { "epoch": 1.6794717887154862, "grad_norm": 3.9870495810854423, "learning_rate": 4.811213625582961e-06, "loss": 1.3856, "step": 700 }, { "epoch": 1.703481392557023, "grad_norm": 4.6711092636358424, "learning_rate": 4.671530456049225e-06, "loss": 1.3813, "step": 710 }, { "epoch": 1.7274909963985594, "grad_norm": 4.184946891066517, "learning_rate": 4.532104330246807e-06, "loss": 1.3402, "step": 720 }, { "epoch": 1.751500600240096, "grad_norm": 4.423869307360585, "learning_rate": 4.3930443560436346e-06, "loss": 1.3824, "step": 730 }, { "epoch": 1.7755102040816326, "grad_norm": 3.9130866621429825, "learning_rate": 4.2544593547758214e-06, "loss": 1.391, "step": 740 }, { "epoch": 1.7995198079231693, "grad_norm": 3.585626688576833, "learning_rate": 4.116457776089576e-06, "loss": 1.3607, "step": 750 }, { "epoch": 1.8235294117647058, "grad_norm": 4.595114918315794, "learning_rate": 3.979147613073956e-06, "loss": 1.4228, "step": 760 }, { "epoch": 1.8475390156062423, "grad_norm": 4.59294338671302, "learning_rate": 3.842636317750918e-06, "loss": 1.3064, "step": 770 }, { "epoch": 1.871548619447779, "grad_norm": 4.315956805402431, "learning_rate": 3.707030716988783e-06, "loss": 1.3054, "step": 780 }, { "epoch": 1.8955582232893158, "grad_norm": 4.494329679068256, "learning_rate": 3.5724369289048845e-06, "loss": 1.4475, "step": 790 }, { "epoch": 1.9195678271308525, "grad_norm": 3.83422319487575, "learning_rate": 3.4389602798228942e-06, "loss": 1.2875, "step": 800 }, { "epoch": 1.943577430972389, "grad_norm": 3.9598203386027264, "learning_rate": 3.3067052218497263e-06, "loss": 1.2589, "step": 810 }, { "epoch": 1.9675870348139255, "grad_norm": 4.369625879638144, "learning_rate": 3.1757752511365903e-06, "loss": 1.2926, "step": 820 }, { "epoch": 1.9915966386554622, "grad_norm": 4.297274070272408, "learning_rate": 3.046272826888097e-06, "loss": 1.3962, "step": 830 }, { "epoch": 2.014405762304922, "grad_norm": 3.467032136187331, "learning_rate": 2.9182992911828585e-06, "loss": 0.8827, "step": 840 }, { "epoch": 2.0384153661464586, "grad_norm": 5.834392828540893, "learning_rate": 2.791954789668264e-06, "loss": 0.518, "step": 850 }, { "epoch": 2.0624249699879953, "grad_norm": 4.35808297800326, "learning_rate": 2.6673381931915466e-06, "loss": 0.5302, "step": 860 }, { "epoch": 2.086434573829532, "grad_norm": 4.295950284412872, "learning_rate": 2.5445470204284384e-06, "loss": 0.6175, "step": 870 }, { "epoch": 2.1104441776710683, "grad_norm": 4.498379665112538, "learning_rate": 2.4236773615699466e-06, "loss": 0.5003, "step": 880 }, { "epoch": 2.134453781512605, "grad_norm": 4.087409019142491, "learning_rate": 2.304823803127023e-06, "loss": 0.5097, "step": 890 }, { "epoch": 2.1584633853541417, "grad_norm": 4.141485559719574, "learning_rate": 2.1880793539119168e-06, "loss": 0.5089, "step": 900 }, { "epoch": 2.1824729891956784, "grad_norm": 4.167475392774359, "learning_rate": 2.073535372254158e-06, "loss": 0.5004, "step": 910 }, { "epoch": 2.2064825930372147, "grad_norm": 4.0504831735580655, "learning_rate": 1.961281494508129e-06, "loss": 0.4905, "step": 920 }, { "epoch": 2.2304921968787514, "grad_norm": 4.582931550708554, "learning_rate": 1.8514055649081646e-06, "loss": 0.5111, "step": 930 }, { "epoch": 2.254501800720288, "grad_norm": 5.055063219094166, "learning_rate": 1.743993566826077e-06, "loss": 0.4623, "step": 940 }, { "epoch": 2.278511404561825, "grad_norm": 4.528043836852871, "learning_rate": 1.6391295554848957e-06, "loss": 0.4996, "step": 950 }, { "epoch": 2.302521008403361, "grad_norm": 4.790281842406982, "learning_rate": 1.5368955921814844e-06, "loss": 0.5649, "step": 960 }, { "epoch": 2.326530612244898, "grad_norm": 4.487294982617259, "learning_rate": 1.437371680069491e-06, "loss": 0.5021, "step": 970 }, { "epoch": 2.3505402160864346, "grad_norm": 4.79811730303713, "learning_rate": 1.3406357015529236e-06, "loss": 0.4707, "step": 980 }, { "epoch": 2.3745498199279713, "grad_norm": 4.331408379478664, "learning_rate": 1.2467633573392829e-06, "loss": 0.4973, "step": 990 }, { "epoch": 2.398559423769508, "grad_norm": 4.81821270116248, "learning_rate": 1.15582810720001e-06, "loss": 0.4531, "step": 1000 }, { "epoch": 2.4225690276110443, "grad_norm": 3.840737088024985, "learning_rate": 1.0679011124845702e-06, "loss": 0.5042, "step": 1010 }, { "epoch": 2.446578631452581, "grad_norm": 5.275863790410332, "learning_rate": 9.830511804331467e-07, "loss": 0.4401, "step": 1020 }, { "epoch": 2.4705882352941178, "grad_norm": 4.562178525657066, "learning_rate": 9.013447103315758e-07, "loss": 0.4665, "step": 1030 }, { "epoch": 2.494597839135654, "grad_norm": 3.757433482713813, "learning_rate": 8.22845641550598e-07, "loss": 0.4568, "step": 1040 }, { "epoch": 2.5186074429771907, "grad_norm": 4.151109799062541, "learning_rate": 7.476154035101279e-07, "loss": 0.4835, "step": 1050 }, { "epoch": 2.5426170468187275, "grad_norm": 3.6159229877862367, "learning_rate": 6.757128676076813e-07, "loss": 0.434, "step": 1060 }, { "epoch": 2.566626650660264, "grad_norm": 4.49946909123513, "learning_rate": 6.071943011485837e-07, "loss": 0.4279, "step": 1070 }, { "epoch": 2.590636254501801, "grad_norm": 4.935063508764955, "learning_rate": 5.421133233140096e-07, "loss": 0.463, "step": 1080 }, { "epoch": 2.614645858343337, "grad_norm": 4.07260484094093, "learning_rate": 4.80520863201308e-07, "loss": 0.4221, "step": 1090 }, { "epoch": 2.638655462184874, "grad_norm": 5.214934170547326, "learning_rate": 4.2246511996945904e-07, "loss": 0.6136, "step": 1100 }, { "epoch": 2.6626650660264106, "grad_norm": 4.308544688046405, "learning_rate": 3.679915251208305e-07, "loss": 0.4775, "step": 1110 }, { "epoch": 2.6866746698679473, "grad_norm": 3.7974095950716293, "learning_rate": 3.17142706948782e-07, "loss": 0.428, "step": 1120 }, { "epoch": 2.710684273709484, "grad_norm": 4.256161845989194, "learning_rate": 2.6995845717889715e-07, "loss": 0.4731, "step": 1130 }, { "epoch": 2.7346938775510203, "grad_norm": 4.158934712493225, "learning_rate": 2.2647569982998942e-07, "loss": 0.4358, "step": 1140 }, { "epoch": 2.758703481392557, "grad_norm": 4.066226801856005, "learning_rate": 1.8672846231922005e-07, "loss": 0.4977, "step": 1150 }, { "epoch": 2.7827130852340938, "grad_norm": 4.944564503261002, "learning_rate": 1.5074784883395587e-07, "loss": 0.4495, "step": 1160 }, { "epoch": 2.80672268907563, "grad_norm": 4.116559563991335, "learning_rate": 1.1856201599119876e-07, "loss": 0.4488, "step": 1170 }, { "epoch": 2.8307322929171668, "grad_norm": 4.300196152639256, "learning_rate": 9.019615080363087e-08, "loss": 0.4994, "step": 1180 }, { "epoch": 2.8547418967587035, "grad_norm": 4.367567850384355, "learning_rate": 6.56724509695289e-08, "loss": 0.474, "step": 1190 }, { "epoch": 2.87875150060024, "grad_norm": 4.2390360863938215, "learning_rate": 4.501010750196322e-08, "loss": 0.4507, "step": 1200 }, { "epoch": 2.902761104441777, "grad_norm": 4.352099381645135, "learning_rate": 2.8225289710876457e-08, "loss": 0.4552, "step": 1210 }, { "epoch": 2.926770708283313, "grad_norm": 4.25941958105542, "learning_rate": 1.5331132549794014e-08, "loss": 0.4984, "step": 1220 }, { "epoch": 2.95078031212485, "grad_norm": 3.9123796575299723, "learning_rate": 6.3377263370728585e-09, "loss": 0.43, "step": 1230 }, { "epoch": 2.9747899159663866, "grad_norm": 4.3724081003161945, "learning_rate": 1.2521088597239328e-09, "loss": 0.4747, "step": 1240 }, { "epoch": 2.993997599039616, "step": 1248, "total_flos": 15664931143680.0, "train_loss": 1.354868695904047, "train_runtime": 2143.3151, "train_samples_per_second": 9.322, "train_steps_per_second": 0.582 } ], "logging_steps": 10, "max_steps": 1248, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 15664931143680.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }