{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9979633401221996, "eval_steps": 200, "global_step": 245, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004073319755600814, "grad_norm": 49.20765561486543, "learning_rate": 3.846153846153847e-07, "loss": 4.7582, "step": 1 }, { "epoch": 0.020366598778004074, "grad_norm": 49.16830472275932, "learning_rate": 1.9230769230769234e-06, "loss": 4.7542, "step": 5 }, { "epoch": 0.04073319755600815, "grad_norm": 47.93735350681692, "learning_rate": 3.846153846153847e-06, "loss": 4.675, "step": 10 }, { "epoch": 0.06109979633401222, "grad_norm": 34.85227976138109, "learning_rate": 4.999083215558211e-06, "loss": 4.1995, "step": 15 }, { "epoch": 0.0814663951120163, "grad_norm": 28.623178088461376, "learning_rate": 4.988777111066646e-06, "loss": 3.3879, "step": 20 }, { "epoch": 0.10183299389002037, "grad_norm": 14.704442306463271, "learning_rate": 4.967066306353816e-06, "loss": 2.5321, "step": 25 }, { "epoch": 0.12219959266802444, "grad_norm": 12.620738205233762, "learning_rate": 4.934050290127733e-06, "loss": 2.403, "step": 30 }, { "epoch": 0.1425661914460285, "grad_norm": 7.513981453630628, "learning_rate": 4.88988035667903e-06, "loss": 2.0794, "step": 35 }, { "epoch": 0.1629327902240326, "grad_norm": 5.5173642810506704, "learning_rate": 4.834758912582217e-06, "loss": 1.8869, "step": 40 }, { "epoch": 0.18329938900203666, "grad_norm": 4.641628513012735, "learning_rate": 4.7689385491773934e-06, "loss": 1.7749, "step": 45 }, { "epoch": 0.20366598778004075, "grad_norm": 4.064482253605761, "learning_rate": 4.692720885082693e-06, "loss": 1.7059, "step": 50 }, { "epoch": 0.2240325865580448, "grad_norm": 3.658952575458924, "learning_rate": 4.606455184041623e-06, "loss": 1.6527, "step": 55 }, { "epoch": 0.24439918533604887, "grad_norm": 3.3953108181872107, "learning_rate": 4.510536754438923e-06, "loss": 1.6071, "step": 60 }, { "epoch": 0.26476578411405294, "grad_norm": 3.1218567201858343, "learning_rate": 4.4054051378190915e-06, "loss": 1.5756, "step": 65 }, { "epoch": 0.285132382892057, "grad_norm": 3.018107484342879, "learning_rate": 4.2915420947086124e-06, "loss": 1.5449, "step": 70 }, { "epoch": 0.3054989816700611, "grad_norm": 2.7503856611169613, "learning_rate": 4.169469396971739e-06, "loss": 1.5139, "step": 75 }, { "epoch": 0.3258655804480652, "grad_norm": 2.614277144865713, "learning_rate": 4.039746436816277e-06, "loss": 1.4955, "step": 80 }, { "epoch": 0.34623217922606925, "grad_norm": 2.6742965366944467, "learning_rate": 3.9029676634059565e-06, "loss": 1.4554, "step": 85 }, { "epoch": 0.3665987780040733, "grad_norm": 2.542019317330229, "learning_rate": 3.7597598588260196e-06, "loss": 1.4344, "step": 90 }, { "epoch": 0.3869653767820774, "grad_norm": 2.5392741563301526, "learning_rate": 3.6107792658847597e-06, "loss": 1.428, "step": 95 }, { "epoch": 0.4073319755600815, "grad_norm": 2.352402399841174, "learning_rate": 3.4567085809127247e-06, "loss": 1.393, "step": 100 }, { "epoch": 0.42769857433808556, "grad_norm": 2.3051986813347836, "learning_rate": 3.29825382533995e-06, "loss": 1.3776, "step": 105 }, { "epoch": 0.4480651731160896, "grad_norm": 2.1913216414316015, "learning_rate": 3.1361411103870455e-06, "loss": 1.3703, "step": 110 }, { "epoch": 0.4684317718940937, "grad_norm": 2.0799262464815906, "learning_rate": 2.971113309695796e-06, "loss": 1.3551, "step": 115 }, { "epoch": 0.48879837067209775, "grad_norm": 2.1304438840612856, "learning_rate": 
2.8039266551467876e-06, "loss": 1.3534, "step": 120 }, { "epoch": 0.5091649694501018, "grad_norm": 2.02651856945392, "learning_rate": 2.6353472714635443e-06, "loss": 1.3451, "step": 125 }, { "epoch": 0.5295315682281059, "grad_norm": 2.0213666994205073, "learning_rate": 2.466147665483196e-06, "loss": 1.3331, "step": 130 }, { "epoch": 0.5498981670061099, "grad_norm": 1.9063782318669384, "learning_rate": 2.2971031861814225e-06, "loss": 1.3302, "step": 135 }, { "epoch": 0.570264765784114, "grad_norm": 1.8510885600426448, "learning_rate": 2.128988471673435e-06, "loss": 1.3234, "step": 140 }, { "epoch": 0.5906313645621182, "grad_norm": 1.896713719309073, "learning_rate": 1.96257389947244e-06, "loss": 1.3309, "step": 145 }, { "epoch": 0.6109979633401222, "grad_norm": 1.7860315803999223, "learning_rate": 1.798622056272104e-06, "loss": 1.3166, "step": 150 }, { "epoch": 0.6313645621181263, "grad_norm": 1.7091260049598365, "learning_rate": 1.6378842434300746e-06, "loss": 1.3097, "step": 155 }, { "epoch": 0.6517311608961304, "grad_norm": 1.7668975196315897, "learning_rate": 1.481097034165998e-06, "loss": 1.3138, "step": 160 }, { "epoch": 0.6720977596741344, "grad_norm": 1.6213484424303142, "learning_rate": 1.328978898250525e-06, "loss": 1.2958, "step": 165 }, { "epoch": 0.6924643584521385, "grad_norm": 1.7070168750433874, "learning_rate": 1.1822269096524813e-06, "loss": 1.3088, "step": 170 }, { "epoch": 0.7128309572301426, "grad_norm": 1.6819401904260896, "learning_rate": 1.041513552231265e-06, "loss": 1.2866, "step": 175 }, { "epoch": 0.7331975560081466, "grad_norm": 1.6322096567129587, "learning_rate": 9.074836381122313e-07, "loss": 1.2968, "step": 180 }, { "epoch": 0.7535641547861507, "grad_norm": 1.7724019182054933, "learning_rate": 7.807513528664415e-07, "loss": 1.2945, "step": 185 }, { "epoch": 0.7739307535641547, "grad_norm": 1.6740383527430804, "learning_rate": 6.618974410351248e-07, "loss": 1.2947, "step": 190 }, { "epoch": 0.7942973523421588, "grad_norm": 1.5843813663357698, "learning_rate": 5.51466544896021e-07, "loss": 1.296, "step": 195 }, { "epoch": 0.814663951120163, "grad_norm": 1.604642517306322, "learning_rate": 4.499647086666029e-07, "loss": 1.3086, "step": 200 }, { "epoch": 0.814663951120163, "eval_loss": 1.2994157075881958, "eval_runtime": 57.8527, "eval_samples_per_second": 120.202, "eval_steps_per_second": 1.884, "step": 200 }, { "epoch": 0.835030549898167, "grad_norm": 1.5963773663289895, "learning_rate": 3.578570595810274e-07, "loss": 1.2945, "step": 205 }, { "epoch": 0.8553971486761711, "grad_norm": 1.6106215658321952, "learning_rate": 2.7556567646717907e-07, "loss": 1.2817, "step": 210 }, { "epoch": 0.8757637474541752, "grad_norm": 1.5304515921043562, "learning_rate": 2.0346765559094566e-07, "loss": 1.2951, "step": 215 }, { "epoch": 0.8961303462321792, "grad_norm": 1.595543511408829, "learning_rate": 1.4189338263089242e-07, "loss": 1.2854, "step": 220 }, { "epoch": 0.9164969450101833, "grad_norm": 1.6367393626714526, "learning_rate": 9.112501870194273e-08, "loss": 1.29, "step": 225 }, { "epoch": 0.9368635437881874, "grad_norm": 1.525279082901764, "learning_rate": 5.1395207365770586e-08, "loss": 1.283, "step": 230 }, { "epoch": 0.9572301425661914, "grad_norm": 1.6110003829594048, "learning_rate": 2.2886008552983064e-08, "loss": 1.2891, "step": 235 }, { "epoch": 0.9775967413441955, "grad_norm": 1.5366047109816883, "learning_rate": 5.7280642823301366e-09, "loss": 1.2847, "step": 240 }, { "epoch": 0.9979633401221996, "grad_norm": 1.4989602574206755, "learning_rate": 0.0, "loss": 
1.287, "step": 245 }, { "epoch": 0.9979633401221996, "step": 245, "total_flos": 154525407117312.0, "train_loss": 1.684320724253752, "train_runtime": 2040.7023, "train_samples_per_second": 30.788, "train_steps_per_second": 0.12 } ], "logging_steps": 5, "max_steps": 245, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 154525407117312.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }