{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.662514156285391,
  "eval_steps": 500,
  "global_step": 5000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 1.2195617377758026,
      "epoch": 0.011325028312570781,
      "grad_norm": 41.854122161865234,
      "learning_rate": 4.5e-06,
      "loss": 1.4649,
      "mean_token_accuracy": 0.7436085730791092,
      "num_tokens": 33232.0,
      "step": 10
    },
    {
      "entropy": 0.8825877517461777,
      "epoch": 0.022650056625141562,
      "grad_norm": 10.884156227111816,
      "learning_rate": 9.5e-06,
      "loss": 0.8606,
      "mean_token_accuracy": 0.8266362160444259,
      "num_tokens": 62241.0,
      "step": 20
    },
    {
      "entropy": 0.7232772573828697,
      "epoch": 0.03397508493771234,
      "grad_norm": 9.259407997131348,
      "learning_rate": 1.45e-05,
      "loss": 0.6503,
      "mean_token_accuracy": 0.8444810211658478,
      "num_tokens": 92125.0,
      "step": 30
    },
    {
      "entropy": 0.6392747581005096,
      "epoch": 0.045300113250283124,
      "grad_norm": 7.45441198348999,
      "learning_rate": 1.9500000000000003e-05,
      "loss": 0.5675,
      "mean_token_accuracy": 0.8603695809841156,
      "num_tokens": 125345.0,
      "step": 40
    },
    {
      "entropy": 0.7545703947544098,
      "epoch": 0.056625141562853906,
      "grad_norm": 7.38155460357666,
      "learning_rate": 2.45e-05,
      "loss": 0.6309,
      "mean_token_accuracy": 0.8437724113464355,
      "num_tokens": 155506.0,
      "step": 50
    },
    {
      "entropy": 0.7073849737644196,
      "epoch": 0.06795016987542468,
      "grad_norm": 8.455394744873047,
      "learning_rate": 2.95e-05,
      "loss": 0.5849,
      "mean_token_accuracy": 0.8386865645647049,
      "num_tokens": 186173.0,
      "step": 60
    },
    {
      "entropy": 0.6777370646595955,
      "epoch": 0.07927519818799547,
      "grad_norm": 10.703876495361328,
      "learning_rate": 3.45e-05,
      "loss": 0.6124,
      "mean_token_accuracy": 0.823593533039093,
      "num_tokens": 218694.0,
      "step": 70
    },
    {
      "entropy": 0.6481794014573097,
      "epoch": 0.09060022650056625,
      "grad_norm": 7.329402923583984,
      "learning_rate": 3.9500000000000005e-05,
      "loss": 0.6196,
      "mean_token_accuracy": 0.8281102895736694,
      "num_tokens": 251166.0,
      "step": 80
    },
    {
      "entropy": 0.6877432465553284,
      "epoch": 0.10192525481313704,
      "grad_norm": 7.263290882110596,
      "learning_rate": 4.4500000000000004e-05,
      "loss": 0.6643,
      "mean_token_accuracy": 0.81614009141922,
      "num_tokens": 282032.0,
      "step": 90
    },
    {
      "entropy": 0.7204893633723259,
      "epoch": 0.11325028312570781,
      "grad_norm": 7.8037590980529785,
      "learning_rate": 4.9500000000000004e-05,
      "loss": 0.6983,
      "mean_token_accuracy": 0.8097896903753281,
      "num_tokens": 313148.0,
      "step": 100
    },
    {
      "entropy": 0.760354095697403,
      "epoch": 0.1245753114382786,
      "grad_norm": 5.422664642333984,
      "learning_rate": 4.999958380063603e-05,
      "loss": 0.7468,
      "mean_token_accuracy": 0.799429202079773,
      "num_tokens": 343247.0,
      "step": 110
    },
    {
      "entropy": 0.7298643738031387,
      "epoch": 0.13590033975084936,
      "grad_norm": 6.69228458404541,
      "learning_rate": 4.999814510457652e-05,
      "loss": 0.7326,
      "mean_token_accuracy": 0.8030451983213425,
      "num_tokens": 375581.0,
      "step": 120
    },
    {
      "entropy": 0.768428310751915,
      "epoch": 0.14722536806342015,
      "grad_norm": 5.626662254333496,
      "learning_rate": 4.99956788326828e-05,
      "loss": 0.7484,
      "mean_token_accuracy": 0.7996043682098388,
      "num_tokens": 408107.0,
      "step": 130
    },
    {
      "entropy": 0.7457070380449295,
      "epoch": 0.15855039637599094,
      "grad_norm": 6.562707901000977,
      "learning_rate": 4.9992185086333655e-05,
      "loss": 0.7464,
      "mean_token_accuracy": 0.8001674145460129,
      "num_tokens": 438643.0,
      "step": 140
    },
    {
      "entropy": 0.8232061445713044,
      "epoch": 0.16987542468856173,
      "grad_norm": 5.214873790740967,
      "learning_rate": 4.998766400914329e-05,
      "loss": 0.795,
      "mean_token_accuracy": 0.7896791487932205,
      "num_tokens": 469183.0,
      "step": 150
    },
    {
      "entropy": 0.7592632710933686,
      "epoch": 0.1812004530011325,
      "grad_norm": 4.8459625244140625,
      "learning_rate": 4.99821157869555e-05,
      "loss": 0.7495,
      "mean_token_accuracy": 0.8009798556566239,
      "num_tokens": 499213.0,
      "step": 160
    },
    {
      "entropy": 0.7950803458690643,
      "epoch": 0.19252548131370328,
      "grad_norm": 4.592363357543945,
      "learning_rate": 4.9975540647835964e-05,
      "loss": 0.7862,
      "mean_token_accuracy": 0.7938760071992874,
      "num_tokens": 529845.0,
      "step": 170
    },
    {
      "entropy": 0.7862876445055008,
      "epoch": 0.20385050962627407,
      "grad_norm": 4.092002868652344,
      "learning_rate": 4.99679388620629e-05,
      "loss": 0.7826,
      "mean_token_accuracy": 0.7916485607624054,
      "num_tokens": 564300.0,
      "step": 180
    },
    {
      "entropy": 0.7688385576009751,
      "epoch": 0.21517553793884484,
      "grad_norm": 4.930974006652832,
      "learning_rate": 4.995931074211594e-05,
      "loss": 0.7503,
      "mean_token_accuracy": 0.8021332859992981,
      "num_tokens": 598731.0,
      "step": 190
    },
    {
      "entropy": 0.7965670645236969,
      "epoch": 0.22650056625141562,
      "grad_norm": 4.959285736083984,
      "learning_rate": 4.994965664266331e-05,
      "loss": 0.7702,
      "mean_token_accuracy": 0.7963653445243836,
      "num_tokens": 629368.0,
      "step": 200
    },
    {
      "entropy": 0.7948065817356109,
      "epoch": 0.23782559456398641,
      "grad_norm": 4.833596229553223,
      "learning_rate": 4.993897696054722e-05,
      "loss": 0.7944,
      "mean_token_accuracy": 0.789614400267601,
      "num_tokens": 660377.0,
      "step": 210
    },
    {
      "entropy": 0.7758382678031921,
      "epoch": 0.2491506228765572,
      "grad_norm": 4.224271774291992,
      "learning_rate": 4.9927272134767566e-05,
      "loss": 0.7741,
      "mean_token_accuracy": 0.795179745554924,
      "num_tokens": 690586.0,
      "step": 220
    },
    {
      "entropy": 0.7900628983974457,
      "epoch": 0.26047565118912797,
      "grad_norm": 3.947178602218628,
      "learning_rate": 4.991454264646391e-05,
      "loss": 0.766,
      "mean_token_accuracy": 0.794954827427864,
      "num_tokens": 725572.0,
      "step": 230
    },
    {
      "entropy": 0.8080787748098374,
      "epoch": 0.2718006795016987,
      "grad_norm": 3.5941126346588135,
      "learning_rate": 4.9900789018895636e-05,
      "loss": 0.7964,
      "mean_token_accuracy": 0.7924022555351258,
      "num_tokens": 756763.0,
      "step": 240
    },
    {
      "entropy": 0.8200718402862549,
      "epoch": 0.28312570781426954,
      "grad_norm": 3.9727747440338135,
      "learning_rate": 4.9886011817420526e-05,
      "loss": 0.7799,
      "mean_token_accuracy": 0.7895867377519608,
      "num_tokens": 787596.0,
      "step": 250
    },
    {
      "entropy": 0.8339032739400863,
      "epoch": 0.2944507361268403,
      "grad_norm": 4.92995548248291,
      "learning_rate": 4.9870211649471436e-05,
      "loss": 0.8152,
      "mean_token_accuracy": 0.7852619111537933,
      "num_tokens": 817469.0,
      "step": 260
    },
    {
      "entropy": 1.0795388698577881,
      "epoch": 0.3057757644394111,
      "grad_norm": 9.842768669128418,
      "learning_rate": 4.9853389164531396e-05,
      "loss": 1.0836,
      "mean_token_accuracy": 0.744838410615921,
      "num_tokens": 850001.0,
      "step": 270
    },
    {
      "entropy": 1.0132719367742538,
      "epoch": 0.3171007927519819,
      "grad_norm": 5.305619239807129,
      "learning_rate": 4.983554505410687e-05,
      "loss": 1.01,
      "mean_token_accuracy": 0.7551202327013016,
      "num_tokens": 880532.0,
      "step": 280
    },
    {
      "entropy": 0.8525327175855637,
      "epoch": 0.32842582106455265,
      "grad_norm": 3.3658530712127686,
      "learning_rate": 4.981668005169934e-05,
      "loss": 0.8566,
      "mean_token_accuracy": 0.7789583891630173,
      "num_tokens": 915441.0,
      "step": 290
    },
    {
      "entropy": 0.841290682554245,
      "epoch": 0.33975084937712347,
      "grad_norm": 4.01895809173584,
      "learning_rate": 4.979679493277518e-05,
      "loss": 0.795,
      "mean_token_accuracy": 0.7931433916091919,
      "num_tokens": 949771.0,
      "step": 300
    },
    {
      "entropy": 0.7752810955047608,
      "epoch": 0.3510758776896942,
      "grad_norm": 3.522529125213623,
      "learning_rate": 4.977589051473373e-05,
      "loss": 0.7492,
      "mean_token_accuracy": 0.8046811252832413,
      "num_tokens": 982523.0,
      "step": 310
    },
    {
      "entropy": 0.8144816100597382,
      "epoch": 0.362400906002265,
      "grad_norm": 3.3812482357025146,
      "learning_rate": 4.975396765687375e-05,
      "loss": 0.7877,
      "mean_token_accuracy": 0.7939431577920913,
      "num_tokens": 1016280.0,
      "step": 320
    },
    {
      "entropy": 0.8689843416213989,
      "epoch": 0.3737259343148358,
      "grad_norm": 4.172185897827148,
      "learning_rate": 4.9731027260358056e-05,
      "loss": 0.8494,
      "mean_token_accuracy": 0.7813519924879074,
      "num_tokens": 1048322.0,
      "step": 330
    },
    {
      "entropy": 0.7746345907449722,
      "epoch": 0.38505096262740657,
      "grad_norm": 3.5320262908935547,
      "learning_rate": 4.9707070268176494e-05,
      "loss": 0.7661,
      "mean_token_accuracy": 0.7966699630022049,
      "num_tokens": 1082053.0,
      "step": 340
    },
    {
      "entropy": 0.8384666532278061,
      "epoch": 0.39637599093997733,
      "grad_norm": 3.9420437812805176,
      "learning_rate": 4.9682097665107185e-05,
      "loss": 0.8145,
      "mean_token_accuracy": 0.7866115182638168,
      "num_tokens": 1113077.0,
      "step": 350
    },
    {
      "entropy": 0.8227119207382202,
      "epoch": 0.40770101925254815,
      "grad_norm": 3.9070684909820557,
      "learning_rate": 4.965611047767602e-05,
      "loss": 0.8011,
      "mean_token_accuracy": 0.7920049339532852,
      "num_tokens": 1141483.0,
      "step": 360
    },
    {
      "entropy": 0.8064505487680436,
      "epoch": 0.4190260475651189,
      "grad_norm": 3.753575563430786,
      "learning_rate": 4.962910977411451e-05,
      "loss": 0.798,
      "mean_token_accuracy": 0.7958554178476334,
      "num_tokens": 1173773.0,
      "step": 370
    },
    {
      "entropy": 0.80653215944767,
      "epoch": 0.43035107587768967,
      "grad_norm": 4.219732284545898,
      "learning_rate": 4.96010966643158e-05,
      "loss": 0.7906,
      "mean_token_accuracy": 0.7969932436943055,
      "num_tokens": 1204159.0,
      "step": 380
    },
    {
      "entropy": 0.8062887847423553,
      "epoch": 0.4416761041902605,
      "grad_norm": 3.3941314220428467,
      "learning_rate": 4.957207229978913e-05,
      "loss": 0.7938,
      "mean_token_accuracy": 0.7924901843070984,
      "num_tokens": 1236691.0,
      "step": 390
    },
    {
      "entropy": 0.7844055652618408,
      "epoch": 0.45300113250283125,
      "grad_norm": 3.865077018737793,
      "learning_rate": 4.9542037873612444e-05,
      "loss": 0.7839,
      "mean_token_accuracy": 0.7974178075790406,
      "num_tokens": 1272447.0,
      "step": 400
    },
    {
      "entropy": 0.8854492843151093,
      "epoch": 0.464326160815402,
      "grad_norm": 3.856250047683716,
      "learning_rate": 4.9510994620383354e-05,
      "loss": 0.8433,
      "mean_token_accuracy": 0.7830950766801834,
      "num_tokens": 1303921.0,
      "step": 410
    },
    {
      "entropy": 0.805621936917305,
      "epoch": 0.47565118912797283,
      "grad_norm": 3.9170212745666504,
      "learning_rate": 4.9478943816168424e-05,
      "loss": 0.8045,
      "mean_token_accuracy": 0.7906429260969162,
      "num_tokens": 1335991.0,
      "step": 420
    },
    {
      "entropy": 0.7835706263780594,
      "epoch": 0.4869762174405436,
      "grad_norm": 4.020387172698975,
      "learning_rate": 4.944588677845068e-05,
      "loss": 0.7576,
      "mean_token_accuracy": 0.8036289840936661,
      "num_tokens": 1369249.0,
      "step": 430
    },
    {
      "entropy": 0.7757609993219375,
      "epoch": 0.4983012457531144,
      "grad_norm": 3.977161407470703,
      "learning_rate": 4.941182486607546e-05,
      "loss": 0.7724,
      "mean_token_accuracy": 0.801279491186142,
      "num_tokens": 1399744.0,
      "step": 440
    },
    {
      "entropy": 0.8368120342493057,
      "epoch": 0.5096262740656852,
      "grad_norm": 3.2676987648010254,
      "learning_rate": 4.937675947919457e-05,
      "loss": 0.8203,
      "mean_token_accuracy": 0.7870727032423019,
      "num_tokens": 1431385.0,
      "step": 450
    },
    {
      "entropy": 0.8545293360948563,
      "epoch": 0.5209513023782559,
      "grad_norm": 4.302636623382568,
      "learning_rate": 4.9340692059208726e-05,
      "loss": 0.8189,
      "mean_token_accuracy": 0.7853404134511948,
      "num_tokens": 1460138.0,
      "step": 460
    },
    {
      "entropy": 0.8571904718875885,
      "epoch": 0.5322763306908267,
      "grad_norm": 3.3016743659973145,
      "learning_rate": 4.93036240887083e-05,
      "loss": 0.8428,
      "mean_token_accuracy": 0.7849946826696396,
      "num_tokens": 1492180.0,
      "step": 470
    },
    {
      "entropy": 0.7961454778909683,
      "epoch": 0.5436013590033975,
      "grad_norm": 3.5592470169067383,
      "learning_rate": 4.926555709141238e-05,
      "loss": 0.7875,
      "mean_token_accuracy": 0.7980852603912354,
      "num_tokens": 1524445.0,
      "step": 480
    },
    {
      "entropy": 0.8381227642297745,
      "epoch": 0.5549263873159683,
      "grad_norm": 3.524243116378784,
      "learning_rate": 4.9226492632106115e-05,
      "loss": 0.8016,
      "mean_token_accuracy": 0.789078775048256,
      "num_tokens": 1556464.0,
      "step": 490
    },
    {
      "entropy": 0.8132199555635452,
      "epoch": 0.5662514156285391,
      "grad_norm": 3.4961235523223877,
      "learning_rate": 4.918643231657642e-05,
      "loss": 0.8021,
      "mean_token_accuracy": 0.7943854779005051,
      "num_tokens": 1586084.0,
      "step": 500
    },
    {
      "epoch": 0.5662514156285391,
      "eval_entropy": 0.7580640996570018,
      "eval_loss": 0.7161204218864441,
      "eval_mean_token_accuracy": 0.8122646586218877,
      "eval_num_tokens": 1586084.0,
      "eval_runtime": 23.0676,
      "eval_samples_per_second": 46.342,
      "eval_steps_per_second": 5.809,
      "step": 500
    },
    {
      "entropy": 0.7857061624526978,
      "epoch": 0.5775764439411099,
      "grad_norm": 3.8294427394866943,
      "learning_rate": 4.914537779154596e-05,
      "loss": 0.7603,
      "mean_token_accuracy": 0.79789317548275,
      "num_tokens": 1619063.0,
      "step": 510
    },
    {
      "entropy": 0.7763534516096116,
      "epoch": 0.5889014722536806,
      "grad_norm": 3.318204402923584,
      "learning_rate": 4.910333074460548e-05,
      "loss": 0.7836,
      "mean_token_accuracy": 0.8022702991962433,
      "num_tokens": 1649064.0,
      "step": 520
    },
    {
      "entropy": 0.7938951075077056,
      "epoch": 0.6002265005662514,
      "grad_norm": 3.7849605083465576,
      "learning_rate": 4.906029290414438e-05,
      "loss": 0.7402,
      "mean_token_accuracy": 0.803567036986351,
      "num_tokens": 1679545.0,
      "step": 530
    },
    {
      "entropy": 0.9141301155090332,
      "epoch": 0.6115515288788222,
      "grad_norm": 4.45604944229126,
      "learning_rate": 4.9016266039279703e-05,
      "loss": 0.8793,
      "mean_token_accuracy": 0.7735334932804108,
      "num_tokens": 1708450.0,
      "step": 540
    },
    {
      "entropy": 0.8632586270570755,
      "epoch": 0.622876557191393,
      "grad_norm": 3.9162728786468506,
      "learning_rate": 4.897125195978344e-05,
      "loss": 0.8436,
      "mean_token_accuracy": 0.784154525399208,
      "num_tokens": 1739828.0,
      "step": 550
    },
    {
      "entropy": 0.7791282266378403,
      "epoch": 0.6342015855039638,
      "grad_norm": 3.5997514724731445,
      "learning_rate": 4.8925252516008066e-05,
      "loss": 0.7647,
      "mean_token_accuracy": 0.7994277358055115,
      "num_tokens": 1771291.0,
      "step": 560
    },
    {
      "entropy": 0.80836041867733,
      "epoch": 0.6455266138165345,
      "grad_norm": 16.00943946838379,
      "learning_rate": 4.887826959881058e-05,
      "loss": 0.7846,
      "mean_token_accuracy": 0.7990806043148041,
      "num_tokens": 1803223.0,
      "step": 570
    },
    {
      "entropy": 0.7901756346225739,
      "epoch": 0.6568516421291053,
      "grad_norm": 3.633241653442383,
      "learning_rate": 4.883030513947466e-05,
      "loss": 0.7681,
      "mean_token_accuracy": 0.8016709625720978,
      "num_tokens": 1835151.0,
      "step": 580
    },
    {
      "entropy": 0.8144267559051513,
      "epoch": 0.6681766704416761,
      "grad_norm": 3.5593814849853516,
      "learning_rate": 4.878136110963139e-05,
      "loss": 0.7784,
      "mean_token_accuracy": 0.7987326920032501,
      "num_tokens": 1869253.0,
      "step": 590
    },
    {
      "entropy": 0.7699689626693725,
      "epoch": 0.6795016987542469,
      "grad_norm": 3.547506093978882,
      "learning_rate": 4.873143952117812e-05,
      "loss": 0.7678,
      "mean_token_accuracy": 0.7996489554643631,
      "num_tokens": 1901972.0,
      "step": 600
    },
    {
      "entropy": 0.8226570546627044,
      "epoch": 0.6908267270668177,
      "grad_norm": 3.7377145290374756,
      "learning_rate": 4.868054242619582e-05,
      "loss": 0.7953,
      "mean_token_accuracy": 0.7935166180133819,
      "num_tokens": 1934251.0,
      "step": 610
    },
    {
      "entropy": 0.8604654908180237,
      "epoch": 0.7021517553793885,
      "grad_norm": 4.307958126068115,
      "learning_rate": 4.862867191686473e-05,
      "loss": 0.83,
      "mean_token_accuracy": 0.7884718626737595,
      "num_tokens": 1963277.0,
      "step": 620
    },
    {
      "entropy": 0.8304067760705948,
      "epoch": 0.7134767836919592,
      "grad_norm": 3.559305429458618,
      "learning_rate": 4.857583012537831e-05,
      "loss": 0.8245,
      "mean_token_accuracy": 0.7876487731933594,
      "num_tokens": 1994596.0,
      "step": 630
    },
    {
      "entropy": 0.7983928799629212,
      "epoch": 0.72480181200453,
      "grad_norm": 3.0747625827789307,
      "learning_rate": 4.852201922385564e-05,
      "loss": 0.7524,
      "mean_token_accuracy": 0.8012862592935562,
      "num_tokens": 2024407.0,
      "step": 640
    },
    {
      "entropy": 0.891300618648529,
      "epoch": 0.7361268403171007,
      "grad_norm": 4.086672306060791,
      "learning_rate": 4.846724142425213e-05,
      "loss": 0.8883,
      "mean_token_accuracy": 0.7746320635080337,
      "num_tokens": 2052217.0,
      "step": 650
    },
    {
      "entropy": 0.8403537899255753,
      "epoch": 0.7474518686296716,
      "grad_norm": 3.159459352493286,
      "learning_rate": 4.841149897826856e-05,
      "loss": 0.8149,
      "mean_token_accuracy": 0.7901445269584656,
      "num_tokens": 2083702.0,
      "step": 660
    },
    {
      "entropy": 0.761424508690834,
      "epoch": 0.7587768969422424,
      "grad_norm": 3.523196220397949,
      "learning_rate": 4.8354794177258575e-05,
      "loss": 0.7393,
      "mean_token_accuracy": 0.8060346513986587,
      "num_tokens": 2117175.0,
      "step": 670
    },
    {
      "entropy": 0.864872795343399,
      "epoch": 0.7701019252548131,
      "grad_norm": 3.8052961826324463,
      "learning_rate": 4.829712935213443e-05,
      "loss": 0.8493,
      "mean_token_accuracy": 0.7827917844057083,
      "num_tokens": 2145885.0,
      "step": 680
    },
    {
      "entropy": 0.8192390084266663,
      "epoch": 0.7814269535673839,
      "grad_norm": 3.475280284881592,
      "learning_rate": 4.823850687327124e-05,
      "loss": 0.8014,
      "mean_token_accuracy": 0.793234434723854,
      "num_tokens": 2178940.0,
      "step": 690
    },
    {
      "entropy": 0.8399118930101395,
      "epoch": 0.7927519818799547,
      "grad_norm": 4.192668914794922,
      "learning_rate": 4.817892915040948e-05,
      "loss": 0.8282,
      "mean_token_accuracy": 0.7871042519807816,
      "num_tokens": 2208885.0,
      "step": 700
    },
    {
      "entropy": 0.7586694777011871,
      "epoch": 0.8040770101925255,
      "grad_norm": 3.1814303398132324,
      "learning_rate": 4.811839863255602e-05,
      "loss": 0.73,
      "mean_token_accuracy": 0.8088628321886062,
      "num_tokens": 2240222.0,
      "step": 710
    },
    {
      "entropy": 0.8313590168952942,
      "epoch": 0.8154020385050963,
      "grad_norm": 3.2918899059295654,
      "learning_rate": 4.805691780788334e-05,
      "loss": 0.7996,
      "mean_token_accuracy": 0.795934408903122,
      "num_tokens": 2270756.0,
      "step": 720
    },
    {
      "entropy": 0.8529336482286454,
      "epoch": 0.8267270668176671,
      "grad_norm": 3.0882325172424316,
      "learning_rate": 4.799448920362734e-05,
      "loss": 0.8301,
      "mean_token_accuracy": 0.7841264367103576,
      "num_tokens": 2300237.0,
      "step": 730
    },
    {
      "entropy": 0.8395129233598709,
      "epoch": 0.8380520951302378,
      "grad_norm": 2.9842989444732666,
      "learning_rate": 4.793111538598345e-05,
      "loss": 0.7959,
      "mean_token_accuracy": 0.7929731100797653,
      "num_tokens": 2330775.0,
      "step": 740
    },
    {
      "entropy": 0.8623131781816482,
      "epoch": 0.8493771234428086,
      "grad_norm": 4.138679504394531,
      "learning_rate": 4.786679896000107e-05,
      "loss": 0.8596,
      "mean_token_accuracy": 0.7795057654380798,
      "num_tokens": 2360008.0,
      "step": 750
    },
    {
      "entropy": 0.8062419712543487,
      "epoch": 0.8607021517553793,
      "grad_norm": 3.422121286392212,
      "learning_rate": 4.7801542569476576e-05,
      "loss": 0.7658,
      "mean_token_accuracy": 0.8022167205810546,
      "num_tokens": 2390046.0,
      "step": 760
    },
    {
      "entropy": 0.8502861768007278,
      "epoch": 0.8720271800679502,
      "grad_norm": 3.099006175994873,
      "learning_rate": 4.773534889684458e-05,
      "loss": 0.8283,
      "mean_token_accuracy": 0.7852615088224411,
      "num_tokens": 2421387.0,
      "step": 770
    },
    {
      "entropy": 0.782235860824585,
      "epoch": 0.883352208380521,
      "grad_norm": 3.4044156074523926,
      "learning_rate": 4.7668220663067705e-05,
      "loss": 0.7658,
      "mean_token_accuracy": 0.8062999725341797,
      "num_tokens": 2450799.0,
      "step": 780
    },
    {
      "entropy": 0.8354818016290665,
      "epoch": 0.8946772366930917,
      "grad_norm": 4.402913570404053,
      "learning_rate": 4.7600160627524714e-05,
      "loss": 0.7934,
      "mean_token_accuracy": 0.796307036280632,
      "num_tokens": 2483454.0,
      "step": 790
    },
    {
      "entropy": 0.8112906068563461,
      "epoch": 0.9060022650056625,
      "grad_norm": 4.405174255371094,
      "learning_rate": 4.753117158789711e-05,
      "loss": 0.7988,
      "mean_token_accuracy": 0.7941813617944717,
      "num_tokens": 2514692.0,
      "step": 800
    },
    {
      "entropy": 0.792055967450142,
      "epoch": 0.9173272933182333,
      "grad_norm": 3.911775827407837,
      "learning_rate": 4.746125638005409e-05,
      "loss": 0.7967,
      "mean_token_accuracy": 0.7978548645973206,
      "num_tokens": 2546588.0,
      "step": 810
    },
    {
      "entropy": 0.8198580473661423,
      "epoch": 0.928652321630804,
      "grad_norm": 2.8875856399536133,
      "learning_rate": 4.7390417877936e-05,
      "loss": 0.8022,
      "mean_token_accuracy": 0.7922891557216645,
      "num_tokens": 2579566.0,
      "step": 820
    },
    {
      "entropy": 0.8724555522203445,
      "epoch": 0.9399773499433749,
      "grad_norm": 4.037559986114502,
      "learning_rate": 4.7318658993436226e-05,
      "loss": 0.8366,
      "mean_token_accuracy": 0.7826345145702363,
      "num_tokens": 2610767.0,
      "step": 830
    },
    {
      "entropy": 0.7823762893676758,
      "epoch": 0.9513023782559457,
      "grad_norm": 3.327577590942383,
      "learning_rate": 4.724598267628143e-05,
      "loss": 0.7769,
      "mean_token_accuracy": 0.7997746199369431,
      "num_tokens": 2642792.0,
      "step": 840
    },
    {
      "entropy": 0.7697542518377304,
      "epoch": 0.9626274065685164,
      "grad_norm": 3.0890369415283203,
      "learning_rate": 4.717239191391038e-05,
      "loss": 0.741,
      "mean_token_accuracy": 0.8050399273633957,
      "num_tokens": 2676964.0,
      "step": 850
    },
    {
      "entropy": 0.763307249546051,
      "epoch": 0.9739524348810872,
      "grad_norm": 4.2226643562316895,
      "learning_rate": 4.709788973135106e-05,
      "loss": 0.7547,
      "mean_token_accuracy": 0.8071206361055374,
      "num_tokens": 2708988.0,
      "step": 860
    },
    {
      "entropy": 0.7996066987514496,
      "epoch": 0.9852774631936579,
      "grad_norm": 3.804412603378296,
      "learning_rate": 4.70224791910964e-05,
      "loss": 0.7901,
      "mean_token_accuracy": 0.7994960457086563,
      "num_tokens": 2738235.0,
      "step": 870
    },
    {
      "entropy": 0.815873098373413,
      "epoch": 0.9966024915062288,
      "grad_norm": 3.5629003047943115,
      "learning_rate": 4.694616339297835e-05,
      "loss": 0.785,
      "mean_token_accuracy": 0.7961396902799607,
      "num_tokens": 2769182.0,
      "step": 880
    },
    {
      "entropy": 0.7122308641672135,
      "epoch": 1.0079275198187996,
      "grad_norm": 3.0285568237304688,
      "learning_rate": 4.686894547404045e-05,
      "loss": 0.6199,
      "mean_token_accuracy": 0.8262323826551438,
      "num_tokens": 2799895.0,
      "step": 890
    },
    {
      "entropy": 0.5145712837576866,
      "epoch": 1.0192525481313703,
      "grad_norm": 3.19512939453125,
      "learning_rate": 4.679082860840891e-05,
      "loss": 0.5098,
      "mean_token_accuracy": 0.851035150885582,
      "num_tokens": 2830787.0,
      "step": 900
    },
    {
      "entropy": 0.5525638118386269,
      "epoch": 1.030577576443941,
      "grad_norm": 2.943143367767334,
      "learning_rate": 4.6711816007162124e-05,
      "loss": 0.53,
      "mean_token_accuracy": 0.851621863245964,
      "num_tokens": 2861630.0,
      "step": 910
    },
    {
      "entropy": 0.5268796980381012,
      "epoch": 1.0419026047565119,
      "grad_norm": 3.5483663082122803,
      "learning_rate": 4.6631910918198654e-05,
      "loss": 0.4995,
      "mean_token_accuracy": 0.8567488580942154,
      "num_tokens": 2895795.0,
      "step": 920
    },
    {
      "entropy": 0.5167795568704605,
      "epoch": 1.0532276330690826,
      "grad_norm": 3.683345317840576,
      "learning_rate": 4.655111662610373e-05,
      "loss": 0.4901,
      "mean_token_accuracy": 0.8578720778226853,
      "num_tokens": 2928474.0,
      "step": 930
    },
    {
      "entropy": 0.5409791812300682,
      "epoch": 1.0645526613816534,
      "grad_norm": 2.885199546813965,
      "learning_rate": 4.646943645201426e-05,
      "loss": 0.5262,
      "mean_token_accuracy": 0.8517296850681305,
      "num_tokens": 2959541.0,
      "step": 940
    },
    {
      "entropy": 0.57862998098135,
      "epoch": 1.0758776896942241,
      "grad_norm": 4.052005767822266,
      "learning_rate": 4.638687375348227e-05,
      "loss": 0.5375,
      "mean_token_accuracy": 0.8474970668554306,
      "num_tokens": 2989964.0,
      "step": 950
    },
    {
      "entropy": 0.5536176413297653,
      "epoch": 1.087202718006795,
      "grad_norm": 3.0179316997528076,
      "learning_rate": 4.6303431924336934e-05,
      "loss": 0.5525,
      "mean_token_accuracy": 0.8437703341245651,
      "num_tokens": 3021104.0,
      "step": 960
    },
    {
      "entropy": 0.5437994614243508,
      "epoch": 1.098527746319366,
      "grad_norm": 3.4563395977020264,
      "learning_rate": 4.621911439454504e-05,
      "loss": 0.5062,
      "mean_token_accuracy": 0.8552743196487427,
      "num_tokens": 3054000.0,
      "step": 970
    },
    {
      "entropy": 0.5517205700278283,
      "epoch": 1.1098527746319367,
      "grad_norm": 3.2511813640594482,
      "learning_rate": 4.613392463006995e-05,
      "loss": 0.5222,
      "mean_token_accuracy": 0.8511393010616303,
      "num_tokens": 3082595.0,
      "step": 980
    },
    {
      "entropy": 0.537626887857914,
      "epoch": 1.1211778029445074,
      "grad_norm": 2.921431303024292,
      "learning_rate": 4.6047866132729253e-05,
      "loss": 0.5062,
      "mean_token_accuracy": 0.852451679110527,
      "num_tokens": 3113095.0,
      "step": 990
    },
    {
      "entropy": 0.5493237912654877,
      "epoch": 1.1325028312570782,
      "grad_norm": 3.2696175575256348,
      "learning_rate": 4.596094244005069e-05,
      "loss": 0.5319,
      "mean_token_accuracy": 0.852485916018486,
      "num_tokens": 3143581.0,
      "step": 1000
    },
    {
      "epoch": 1.1325028312570782,
      "eval_entropy": 0.575189925174215,
      "eval_loss": 0.6139051914215088,
      "eval_mean_token_accuracy": 0.8391392471185372,
      "eval_num_tokens": 3143581.0,
      "eval_runtime": 22.7589,
      "eval_samples_per_second": 46.971,
      "eval_steps_per_second": 5.888,
      "step": 1000
    },
    {
      "entropy": 0.5281848132610321,
      "epoch": 1.143827859569649,
      "grad_norm": 3.3432345390319824,
      "learning_rate": 4.5873157125126806e-05,
      "loss": 0.5143,
      "mean_token_accuracy": 0.8539972066879272,
      "num_tokens": 3176671.0,
      "step": 1010
    },
    {
      "entropy": 0.5357129707932472,
      "epoch": 1.1551528878822197,
      "grad_norm": 2.681213617324829,
      "learning_rate": 4.578451379646808e-05,
      "loss": 0.5176,
      "mean_token_accuracy": 0.8533397912979126,
      "num_tokens": 3207113.0,
      "step": 1020
    },
    {
      "entropy": 0.5573878586292267,
      "epoch": 1.1664779161947905,
      "grad_norm": 2.8207201957702637,
      "learning_rate": 4.569501609785455e-05,
      "loss": 0.5267,
      "mean_token_accuracy": 0.8482785791158676,
      "num_tokens": 3242168.0,
      "step": 1030
    },
    {
      "entropy": 0.5452752098441124,
      "epoch": 1.1778029445073612,
      "grad_norm": 3.3376107215881348,
      "learning_rate": 4.560466770818608e-05,
      "loss": 0.5172,
      "mean_token_accuracy": 0.8519764631986618,
      "num_tokens": 3273260.0,
      "step": 1040
    },
    {
      "entropy": 0.5476395100355148,
      "epoch": 1.189127972819932,
      "grad_norm": 3.6320974826812744,
      "learning_rate": 4.5513472341331076e-05,
      "loss": 0.5439,
      "mean_token_accuracy": 0.8460662096738816,
      "num_tokens": 3302219.0,
      "step": 1050
    },
    {
      "entropy": 0.5614733591675758,
      "epoch": 1.2004530011325028,
      "grad_norm": 2.790870428085327,
      "learning_rate": 4.542143374597391e-05,
      "loss": 0.5218,
      "mean_token_accuracy": 0.8489633470773696,
      "num_tokens": 3336388.0,
      "step": 1060
    },
    {
      "entropy": 0.5463294044137001,
      "epoch": 1.2117780294450737,
      "grad_norm": 3.163102388381958,
      "learning_rate": 4.5328555705460716e-05,
      "loss": 0.5324,
      "mean_token_accuracy": 0.847563111782074,
      "num_tokens": 3368580.0,
      "step": 1070
    },
    {
      "entropy": 0.5477871373295784,
      "epoch": 1.2231030577576445,
      "grad_norm": 2.990779161453247,
      "learning_rate": 4.5234842037643985e-05,
      "loss": 0.5314,
      "mean_token_accuracy": 0.8470851451158523,
      "num_tokens": 3399370.0,
      "step": 1080
    },
    {
      "entropy": 0.6027134999632835,
      "epoch": 1.2344280860702153,
      "grad_norm": 3.5813815593719482,
      "learning_rate": 4.5140296594725544e-05,
      "loss": 0.5768,
      "mean_token_accuracy": 0.8372454822063446,
      "num_tokens": 3433838.0,
      "step": 1090
    },
    {
      "entropy": 0.5187774360179901,
      "epoch": 1.245753114382786,
      "grad_norm": 3.008913993835449,
      "learning_rate": 4.504492326309824e-05,
      "loss": 0.5077,
      "mean_token_accuracy": 0.8522711008787155,
      "num_tokens": 3465362.0,
      "step": 1100
    },
    {
      "entropy": 0.5649975180625916,
      "epoch": 1.2570781426953568,
      "grad_norm": 3.3389503955841064,
      "learning_rate": 4.494872596318618e-05,
      "loss": 0.5394,
      "mean_token_accuracy": 0.8493940323591233,
      "num_tokens": 3497415.0,
      "step": 1110
    },
    {
      "entropy": 0.6041379794478416,
      "epoch": 1.2684031710079275,
      "grad_norm": 4.550158977508545,
      "learning_rate": 4.48517086492836e-05,
      "loss": 0.5791,
      "mean_token_accuracy": 0.8398642212152481,
      "num_tokens": 3528071.0,
      "step": 1120
    },
    {
      "entropy": 0.5308591544628143,
      "epoch": 1.2797281993204983,
      "grad_norm": 3.043844223022461,
      "learning_rate": 4.4753875309392266e-05,
      "loss": 0.5014,
      "mean_token_accuracy": 0.8557395815849305,
      "num_tokens": 3562511.0,
      "step": 1130
    },
    {
      "entropy": 0.5395269095897675,
      "epoch": 1.291053227633069,
      "grad_norm": 3.781155586242676,
      "learning_rate": 4.46552299650576e-05,
      "loss": 0.5364,
      "mean_token_accuracy": 0.8473162204027176,
      "num_tokens": 3592113.0,
      "step": 1140
    },
    {
      "entropy": 0.5485832586884498,
      "epoch": 1.3023782559456398,
      "grad_norm": 3.388087511062622,
      "learning_rate": 4.455577667120333e-05,
      "loss": 0.5232,
      "mean_token_accuracy": 0.8495394617319107,
      "num_tokens": 3623419.0,
      "step": 1150
    },
    {
      "entropy": 0.5457410931587219,
      "epoch": 1.3137032842582106,
      "grad_norm": 2.7439608573913574,
      "learning_rate": 4.445551951596486e-05,
      "loss": 0.5239,
      "mean_token_accuracy": 0.8516563266515732,
      "num_tokens": 3655712.0,
      "step": 1160
    },
    {
      "entropy": 0.5817072510719299,
      "epoch": 1.3250283125707814,
      "grad_norm": 3.6495678424835205,
      "learning_rate": 4.4354462620521144e-05,
      "loss": 0.552,
      "mean_token_accuracy": 0.8431410670280457,
      "num_tokens": 3687624.0,
      "step": 1170
    },
    {
      "entropy": 0.5500650435686112,
      "epoch": 1.3363533408833521,
      "grad_norm": 2.702406406402588,
      "learning_rate": 4.425261013892534e-05,
      "loss": 0.5376,
      "mean_token_accuracy": 0.8446941554546357,
      "num_tokens": 3718974.0,
      "step": 1180
    },
    {
      "entropy": 0.5515587076544761,
      "epoch": 1.3476783691959229,
      "grad_norm": 3.3534793853759766,
      "learning_rate": 4.414996625793404e-05,
      "loss": 0.5358,
      "mean_token_accuracy": 0.8478793174028396,
      "num_tokens": 3749961.0,
      "step": 1190
    },
    {
      "entropy": 0.6033241957426071,
      "epoch": 1.3590033975084936,
      "grad_norm": 4.399415016174316,
      "learning_rate": 4.404653519683517e-05,
      "loss": 0.568,
      "mean_token_accuracy": 0.838269567489624,
      "num_tokens": 3779674.0,
      "step": 1200
    },
    {
      "entropy": 0.5632871612906456,
      "epoch": 1.3703284258210646,
      "grad_norm": 3.128950834274292,
      "learning_rate": 4.394232120727453e-05,
      "loss": 0.5478,
      "mean_token_accuracy": 0.8447149246931076,
      "num_tokens": 3811144.0,
      "step": 1210
    },
    {
      "entropy": 0.5675986737012864,
      "epoch": 1.3816534541336354,
      "grad_norm": 3.912665843963623,
      "learning_rate": 4.3837328573081057e-05,
      "loss": 0.551,
      "mean_token_accuracy": 0.8420376777648926,
      "num_tokens": 3841697.0,
      "step": 1220
    },
    {
      "entropy": 0.5417166292667389,
      "epoch": 1.3929784824462061,
      "grad_norm": 3.0909736156463623,
      "learning_rate": 4.373156161009071e-05,
      "loss": 0.5136,
      "mean_token_accuracy": 0.8509470343589782,
      "num_tokens": 3875026.0,
      "step": 1230
    },
    {
      "entropy": 0.6006052121520042,
      "epoch": 1.404303510758777,
      "grad_norm": 3.4114279747009277,
      "learning_rate": 4.3625024665969086e-05,
      "loss": 0.5766,
      "mean_token_accuracy": 0.8366666853427887,
      "num_tokens": 3905166.0,
      "step": 1240
    },
    {
      "entropy": 0.5631272241473197,
      "epoch": 1.4156285390713477,
      "grad_norm": 3.365401029586792,
      "learning_rate": 4.351772212003268e-05,
      "loss": 0.5295,
      "mean_token_accuracy": 0.8479926735162735,
      "num_tokens": 3936113.0,
      "step": 1250
    },
    {
      "entropy": 0.560055273771286,
      "epoch": 1.4269535673839184,
      "grad_norm": 3.087405204772949,
      "learning_rate": 4.3409658383068865e-05,
      "loss": 0.5347,
      "mean_token_accuracy": 0.8461541920900345,
      "num_tokens": 3966883.0,
      "step": 1260
    },
    {
      "entropy": 0.5575806692242622,
      "epoch": 1.4382785956964892,
      "grad_norm": 3.507297992706299,
      "learning_rate": 4.3300837897154636e-05,
      "loss": 0.5417,
      "mean_token_accuracy": 0.8473523437976838,
      "num_tokens": 3997355.0,
      "step": 1270
    },
    {
      "entropy": 0.6279172241687775,
      "epoch": 1.44960362400906,
      "grad_norm": 4.649575710296631,
      "learning_rate": 4.3191265135473926e-05,
      "loss": 0.5937,
      "mean_token_accuracy": 0.8347241580486298,
      "num_tokens": 4026260.0,
      "step": 1280
    },
    {
      "entropy": 0.5787097245454789,
      "epoch": 1.460928652321631,
      "grad_norm": 3.5137453079223633,
      "learning_rate": 4.3080944602133804e-05,
      "loss": 0.5253,
      "mean_token_accuracy": 0.8458025574684143,
      "num_tokens": 4056398.0,
      "step": 1290
    },
    {
      "entropy": 0.5460534125566483,
      "epoch": 1.4722536806342017,
      "grad_norm": 3.8779032230377197,
      "learning_rate": 4.296988083197932e-05,
      "loss": 0.553,
      "mean_token_accuracy": 0.8447301775217056,
      "num_tokens": 4087702.0,
      "step": 1300
    },
    {
      "entropy": 0.6065227225422859,
      "epoch": 1.4835787089467725,
      "grad_norm": 3.0295608043670654,
      "learning_rate": 4.285807839040702e-05,
      "loss": 0.5682,
      "mean_token_accuracy": 0.8381103157997132,
      "num_tokens": 4119831.0,
      "step": 1310
    },
    {
      "entropy": 0.5128992781043052,
      "epoch": 1.4949037372593432,
      "grad_norm": 2.5592479705810547,
      "learning_rate": 4.274554187317742e-05,
      "loss": 0.4974,
      "mean_token_accuracy": 0.8573563575744629,
      "num_tokens": 4153546.0,
      "step": 1320
    },
    {
      "entropy": 0.5610502302646637,
      "epoch": 1.506228765571914,
      "grad_norm": 2.9809346199035645,
      "learning_rate": 4.263227590622594e-05,
      "loss": 0.5337,
      "mean_token_accuracy": 0.8465662121772766,
      "num_tokens": 4184159.0,
      "step": 1330
    },
    {
      "entropy": 0.5405657783150672,
      "epoch": 1.5175537938844847,
      "grad_norm": 3.1333119869232178,
      "learning_rate": 4.251828514547285e-05,
      "loss": 0.5222,
      "mean_token_accuracy": 0.851176279783249,
      "num_tokens": 4216675.0,
      "step": 1340
    },
    {
      "entropy": 0.566189019382,
      "epoch": 1.5288788221970555,
      "grad_norm": 3.00138783454895,
      "learning_rate": 4.2403574276631864e-05,
      "loss": 0.5444,
      "mean_token_accuracy": 0.8445883154869079,
      "num_tokens": 4248386.0,
      "step": 1350
    },
    {
      "entropy": 0.527710796892643,
      "epoch": 1.5402038505096263,
      "grad_norm": 3.269314765930176,
      "learning_rate": 4.2288148015017505e-05,
      "loss": 0.5053,
      "mean_token_accuracy": 0.8536709517240524,
      "num_tokens": 4282402.0,
      "step": 1360
    },
    {
      "entropy": 0.5625803858041764,
      "epoch": 1.551528878822197,
      "grad_norm": 3.079254388809204,
      "learning_rate": 4.2172011105351294e-05,
      "loss": 0.5384,
      "mean_token_accuracy": 0.8477725654840469,
      "num_tokens": 4313169.0,
      "step": 1370
    },
    {
      "entropy": 0.5694461062550544,
      "epoch": 1.5628539071347678,
      "grad_norm": 3.1012649536132812,
      "learning_rate": 4.205516832156671e-05,
      "loss": 0.5568,
      "mean_token_accuracy": 0.8427411139011383,
      "num_tokens": 4344614.0,
      "step": 1380
    },
    {
      "entropy": 0.5670416623353958,
      "epoch": 1.5741789354473386,
      "grad_norm": 3.6018662452697754,
      "learning_rate": 4.193762446661294e-05,
      "loss": 0.5314,
      "mean_token_accuracy": 0.8488269418478012,
      "num_tokens": 4375036.0,
      "step": 1390
    },
    {
      "entropy": 0.5953057199716568,
      "epoch": 1.5855039637599093,
      "grad_norm": 3.029942274093628,
      "learning_rate": 4.18193843722575e-05,
      "loss": 0.5799,
      "mean_token_accuracy": 0.837462404370308,
      "num_tokens": 4404701.0,
      "step": 1400
    },
    {
      "entropy": 0.5928519412875175,
      "epoch": 1.59682899207248,
      "grad_norm": 3.283315896987915,
      "learning_rate": 4.170045289888753e-05,
      "loss": 0.5624,
      "mean_token_accuracy": 0.8420351535081864,
      "num_tokens": 4435592.0,
      "step": 1410
    },
    {
      "entropy": 0.5582237809896469,
      "epoch": 1.6081540203850508,
      "grad_norm": 3.4651055335998535,
      "learning_rate": 4.15808349353101e-05,
      "loss": 0.5491,
      "mean_token_accuracy": 0.8469580680131912,
      "num_tokens": 4466796.0,
      "step": 1420
    },
    {
      "entropy": 0.5469602465629577,
      "epoch": 1.6194790486976216,
      "grad_norm": 3.3069710731506348,
      "learning_rate": 4.1460535398551156e-05,
      "loss": 0.5196,
      "mean_token_accuracy": 0.8506799221038819,
      "num_tokens": 4500213.0,
      "step": 1430
    },
    {
      "entropy": 0.5610076829791069,
      "epoch": 1.6308040770101924,
      "grad_norm": 2.9331612586975098,
      "learning_rate": 4.1339559233653484e-05,
      "loss": 0.5464,
      "mean_token_accuracy": 0.8456663846969604,
      "num_tokens": 4533383.0,
      "step": 1440
    },
    {
      "entropy": 0.5389099940657616,
      "epoch": 1.6421291053227633,
      "grad_norm": 3.7289347648620605,
      "learning_rate": 4.12179114134734e-05,
      "loss": 0.5173,
      "mean_token_accuracy": 0.855185616016388,
      "num_tokens": 4566354.0,
      "step": 1450
    },
    {
      "entropy": 0.5230394601821899,
      "epoch": 1.6534541336353341,
      "grad_norm": 3.2918195724487305,
      "learning_rate": 4.1095596938476316e-05,
      "loss": 0.5053,
      "mean_token_accuracy": 0.8556439846754074,
      "num_tokens": 4598001.0,
      "step": 1460
    },
    {
      "entropy": 0.5553015500307084,
      "epoch": 1.6647791619479049,
      "grad_norm": 3.6326143741607666,
      "learning_rate": 4.097262083653123e-05,
      "loss": 0.5369,
      "mean_token_accuracy": 0.8448810696601867,
      "num_tokens": 4630452.0,
      "step": 1470
    },
    {
      "entropy": 0.6212641701102257,
      "epoch": 1.6761041902604756,
      "grad_norm": 3.491703748703003,
      "learning_rate": 4.084898816270405e-05,
      "loss": 0.5945,
      "mean_token_accuracy": 0.8339376747608185,
      "num_tokens": 4660517.0,
      "step": 1480
    },
    {
      "entropy": 0.5631924226880074,
      "epoch": 1.6874292185730464,
      "grad_norm": 2.8656742572784424,
      "learning_rate": 4.072470399904972e-05,
      "loss": 0.5362,
      "mean_token_accuracy": 0.8484214007854461,
      "num_tokens": 4693572.0,
      "step": 1490
    },
    {
      "entropy": 0.5927946478128433,
      "epoch": 1.6987542468856174,
      "grad_norm": 3.262650489807129,
      "learning_rate": 4.059977345440345e-05,
      "loss": 0.5631,
      "mean_token_accuracy": 0.8415592044591904,
      "num_tokens": 4725726.0,
      "step": 1500
    },
    {
      "epoch": 1.6987542468856174,
      "eval_entropy": 0.5341141203890986,
      "eval_loss": 0.5169265270233154,
      "eval_mean_token_accuracy": 0.8661515663808851,
      "eval_num_tokens": 4725726.0,
      "eval_runtime": 22.9682,
      "eval_samples_per_second": 46.543,
      "eval_steps_per_second": 5.834,
      "step": 1500
    },
    {
      "entropy": 0.5506062388420105,
      "epoch": 1.7100792751981881,
      "grad_norm": 3.5445897579193115,
      "learning_rate": 4.0474201664170605e-05,
      "loss": 0.5315,
      "mean_token_accuracy": 0.8496404081583023,
      "num_tokens": 4755710.0,
      "step": 1510
    },
    {
      "entropy": 0.5794235303997993,
      "epoch": 1.721404303510759,
      "grad_norm": 3.481981039047241,
      "learning_rate": 4.034799379011565e-05,
      "loss": 0.5681,
      "mean_token_accuracy": 0.8396411448717117,
      "num_tokens": 4788007.0,
      "step": 1520
    },
    {
      "entropy": 0.5977949738502503,
      "epoch": 1.7327293318233297,
      "grad_norm": 3.0650320053100586,
      "learning_rate": 4.0221155020149956e-05,
      "loss": 0.5602,
      "mean_token_accuracy": 0.838371816277504,
      "num_tokens": 4823110.0,
      "step": 1530
    },
    {
      "entropy": 0.5333872765302659,
      "epoch": 1.7440543601359004,
      "grad_norm": 3.4268224239349365,
      "learning_rate": 4.009369056811857e-05,
      "loss": 0.5284,
      "mean_token_accuracy": 0.8493491917848587,
      "num_tokens": 4854122.0,
      "step": 1540
    },
    {
      "entropy": 0.5541011542081833,
      "epoch": 1.7553793884484712,
      "grad_norm": 2.7358953952789307,
      "learning_rate": 3.9965605673585885e-05,
      "loss": 0.5322,
      "mean_token_accuracy": 0.8492937862873078,
      "num_tokens": 4885040.0,
      "step": 1550
    },
    {
      "entropy": 0.5873281508684158,
      "epoch": 1.766704416761042,
      "grad_norm": 3.24727463722229,
      "learning_rate": 3.983690560162021e-05,
      "loss": 0.5406,
      "mean_token_accuracy": 0.8487812489271164,
      "num_tokens": 4913565.0,
      "step": 1560
    },
    {
      "entropy": 0.5366435587406159,
      "epoch": 1.7780294450736127,
      "grad_norm": 2.3538382053375244,
      "learning_rate": 3.970759564257744e-05,
      "loss": 0.5122,
      "mean_token_accuracy": 0.852169793844223,
      "num_tokens": 4945547.0,
      "step": 1570
    },
    {
      "entropy": 0.5801732629537583,
      "epoch": 1.7893544733861835,
      "grad_norm": 3.008432388305664,
      "learning_rate": 3.9577681111883525e-05,
      "loss": 0.5498,
      "mean_token_accuracy": 0.8420356780290603,
      "num_tokens": 4975227.0,
      "step": 1580
    },
    {
      "entropy": 0.5545943319797516,
      "epoch": 1.8006795016987542,
      "grad_norm": 3.050703525543213,
      "learning_rate": 3.9447167349815975e-05,
      "loss": 0.5202,
      "mean_token_accuracy": 0.8499834507703781,
      "num_tokens": 5006740.0,
      "step": 1590
    },
    {
      "entropy": 0.614229929447174,
      "epoch": 1.812004530011325,
      "grad_norm": 3.1485025882720947,
      "learning_rate": 3.931605972128434e-05,
      "loss": 0.5971,
      "mean_token_accuracy": 0.8335285931825638,
      "num_tokens": 5038134.0,
      "step": 1600
    },
    {
      "entropy": 0.5501451089978218,
      "epoch": 1.8233295583238958,
      "grad_norm": 3.1998586654663086,
      "learning_rate": 3.918436361560975e-05,
      "loss": 0.5239,
      "mean_token_accuracy": 0.8538520008325576,
      "num_tokens": 5069253.0,
      "step": 1610
    },
    {
      "entropy": 0.5649801835417747,
      "epoch": 1.8346545866364665,
      "grad_norm": 3.609856128692627,
      "learning_rate": 3.905208444630327e-05,
      "loss": 0.544,
      "mean_token_accuracy": 0.8445982128381729,
      "num_tokens": 5099796.0,
      "step": 1620
    },
    {
      "entropy": 0.6116504520177841,
      "epoch": 1.8459796149490373,
      "grad_norm": 2.742194175720215,
      "learning_rate": 3.8919227650843446e-05,
      "loss": 0.5889,
      "mean_token_accuracy": 0.8342977851629257,
      "num_tokens": 5132503.0,
      "step": 1630
    },
    {
      "entropy": 0.5255660384893417,
      "epoch": 1.857304643261608,
      "grad_norm": 2.977027654647827,
      "learning_rate": 3.878579869045279e-05,
      "loss": 0.4961,
      "mean_token_accuracy": 0.8565999537706375,
      "num_tokens": 5162678.0,
      "step": 1640
    },
    {
      "entropy": 0.5009402737021447,
      "epoch": 1.8686296715741788,
      "grad_norm": 2.497317314147949,
      "learning_rate": 3.865180304987325e-05,
      "loss": 0.4848,
      "mean_token_accuracy": 0.8591432988643646,
      "num_tokens": 5196737.0,
      "step": 1650
    },
    {
      "entropy": 0.5639937236905098,
      "epoch": 1.8799546998867496,
      "grad_norm": 3.1093459129333496,
      "learning_rate": 3.851724623714078e-05,
      "loss": 0.5568,
      "mean_token_accuracy": 0.845026895403862,
      "num_tokens": 5227400.0,
      "step": 1660
    },
    {
      "entropy": 0.5589576527476311,
      "epoch": 1.8912797281993206,
      "grad_norm": 3.0768072605133057,
      "learning_rate": 3.838213378335893e-05,
      "loss": 0.5291,
      "mean_token_accuracy": 0.8501736044883728,
      "num_tokens": 5259004.0,
      "step": 1670
    },
    {
      "entropy": 0.5750891670584679,
      "epoch": 1.9026047565118913,
      "grad_norm": 2.6023709774017334,
      "learning_rate": 3.824647124247149e-05,
      "loss": 0.5517,
      "mean_token_accuracy": 0.8456086486577987,
      "num_tokens": 5290222.0,
      "step": 1680
    },
    {
      "entropy": 0.5511097982525826,
      "epoch": 1.913929784824462,
      "grad_norm": 2.9224021434783936,
      "learning_rate": 3.811026419103415e-05,
      "loss": 0.5325,
      "mean_token_accuracy": 0.8483414232730866,
      "num_tokens": 5322612.0,
      "step": 1690
    },
    {
      "entropy": 0.5870863348245621,
      "epoch": 1.9252548131370328,
      "grad_norm": 3.0950560569763184,
      "learning_rate": 3.79735182279853e-05,
      "loss": 0.5522,
      "mean_token_accuracy": 0.8423091471195221,
      "num_tokens": 5352812.0,
      "step": 1700
    },
    {
      "entropy": 0.5567023307085037,
      "epoch": 1.9365798414496036,
      "grad_norm": 2.833249092102051,
      "learning_rate": 3.78362389744159e-05,
      "loss": 0.5457,
      "mean_token_accuracy": 0.8451368689537049,
      "num_tokens": 5383796.0,
      "step": 1710
    },
    {
      "entropy": 0.55447788387537,
      "epoch": 1.9479048697621744,
      "grad_norm": 3.5825366973876953,
      "learning_rate": 3.769843207333837e-05,
      "loss": 0.5214,
      "mean_token_accuracy": 0.8480161488056183,
      "num_tokens": 5416796.0,
      "step": 1720
    },
    {
      "entropy": 0.5370402306318283,
      "epoch": 1.9592298980747453,
      "grad_norm": 2.9841582775115967,
      "learning_rate": 3.7560103189454654e-05,
      "loss": 0.5232,
      "mean_token_accuracy": 0.8511360079050064,
      "num_tokens": 5447812.0,
      "step": 1730
    },
    {
      "entropy": 0.5704680263996125,
      "epoch": 1.970554926387316,
      "grad_norm": 3.3857421875,
      "learning_rate": 3.742125800892339e-05,
      "loss": 0.5528,
      "mean_token_accuracy": 0.8471181511878967,
      "num_tokens": 5477860.0,
      "step": 1740
    },
    {
      "entropy": 0.5869046375155449,
      "epoch": 1.9818799546998869,
      "grad_norm": 3.149844169616699,
      "learning_rate": 3.7281902239126114e-05,
      "loss": 0.5758,
      "mean_token_accuracy": 0.8392861634492874,
      "num_tokens": 5508399.0,
      "step": 1750
    },
    {
      "entropy": 0.5820975095033646,
      "epoch": 1.9932049830124576,
      "grad_norm": 3.422454833984375,
      "learning_rate": 3.714204160843271e-05,
      "loss": 0.5391,
      "mean_token_accuracy": 0.8477762788534164,
      "num_tokens": 5538408.0,
      "step": 1760
    },
    {
      "entropy": 0.5504972025752067,
      "epoch": 2.0045300113250284,
      "grad_norm": 2.6110424995422363,
      "learning_rate": 3.700168186596591e-05,
      "loss": 0.4606,
      "mean_token_accuracy": 0.864372169971466,
      "num_tokens": 5568168.0,
      "step": 1770
    },
    {
      "entropy": 0.33086139261722564,
      "epoch": 2.015855039637599,
      "grad_norm": 3.0384321212768555,
      "learning_rate": 3.686082878136497e-05,
      "loss": 0.2968,
      "mean_token_accuracy": 0.9110626548528671,
      "num_tokens": 5600301.0,
      "step": 1780
    },
    {
      "entropy": 0.3262224726378918,
      "epoch": 2.02718006795017,
      "grad_norm": 3.177510976791382,
      "learning_rate": 3.671948814454852e-05,
      "loss": 0.3034,
      "mean_token_accuracy": 0.9086451709270478,
      "num_tokens": 5631080.0,
      "step": 1790
    },
    {
      "entropy": 0.33181734979152677,
      "epoch": 2.0385050962627407,
      "grad_norm": 2.8883979320526123,
      "learning_rate": 3.6577665765476534e-05,
      "loss": 0.2977,
      "mean_token_accuracy": 0.9095875382423401,
      "num_tokens": 5663677.0,
      "step": 1800
    },
    {
      "entropy": 0.34994775205850603,
      "epoch": 2.0498301245753114,
      "grad_norm": 2.915252208709717,
      "learning_rate": 3.643536747391155e-05,
      "loss": 0.3191,
      "mean_token_accuracy": 0.9045411765575408,
      "num_tokens": 5692514.0,
      "step": 1810
    },
    {
      "entropy": 0.33512948006391524,
      "epoch": 2.061155152887882,
      "grad_norm": 3.1287646293640137,
      "learning_rate": 3.629259911917898e-05,
      "loss": 0.3079,
      "mean_token_accuracy": 0.9076024353504181,
      "num_tokens": 5723520.0,
      "step": 1820
    },
    {
      "entropy": 0.3023697704076767,
      "epoch": 2.072480181200453,
      "grad_norm": 2.952409029006958,
      "learning_rate": 3.614936656992671e-05,
      "loss": 0.2774,
      "mean_token_accuracy": 0.9159202218055725,
      "num_tokens": 5756911.0,
      "step": 1830
    },
    {
      "entropy": 0.3192523382604122,
      "epoch": 2.0838052095130237,
      "grad_norm": 2.7479248046875,
      "learning_rate": 3.600567571388384e-05,
      "loss": 0.2952,
      "mean_token_accuracy": 0.9110053300857544,
      "num_tokens": 5789223.0,
      "step": 1840
    },
    {
      "entropy": 0.3362482935190201,
      "epoch": 2.0951302378255945,
      "grad_norm": 2.9288883209228516,
      "learning_rate": 3.5861532457618647e-05,
      "loss": 0.3001,
      "mean_token_accuracy": 0.910579189658165,
      "num_tokens": 5820267.0,
      "step": 1850
    },
    {
      "entropy": 0.3218920275568962,
      "epoch": 2.1064552661381652,
      "grad_norm": 2.9838762283325195,
      "learning_rate": 3.571694272629583e-05,
      "loss": 0.3028,
      "mean_token_accuracy": 0.9099289119243622,
      "num_tokens": 5851637.0,
      "step": 1860
    },
    {
      "entropy": 0.33703851103782656,
      "epoch": 2.117780294450736,
      "grad_norm": 2.6930618286132812,
      "learning_rate": 3.557191246343294e-05,
      "loss": 0.3005,
      "mean_token_accuracy": 0.9076809674501419,
      "num_tokens": 5884035.0,
      "step": 1870
    },
    {
      "entropy": 0.33338933885097505,
      "epoch": 2.1291053227633068,
      "grad_norm": 2.9287595748901367,
      "learning_rate": 3.5426447630656015e-05,
      "loss": 0.2996,
      "mean_token_accuracy": 0.9076695948839187,
      "num_tokens": 5915289.0,
      "step": 1880
    },
    {
      "entropy": 0.3408766373991966,
      "epoch": 2.1404303510758775,
      "grad_norm": 3.0822606086730957,
      "learning_rate": 3.528055420745461e-05,
      "loss": 0.302,
      "mean_token_accuracy": 0.9080609738826751,
      "num_tokens": 5945986.0,
      "step": 1890
    },
    {
      "entropy": 0.33647180199623106,
      "epoch": 2.1517553793884483,
      "grad_norm": 2.4581234455108643,
      "learning_rate": 3.51342381909359e-05,
      "loss": 0.3117,
      "mean_token_accuracy": 0.9036801576614379,
      "num_tokens": 5979688.0,
      "step": 1900
    },
    {
      "entropy": 0.3434984043240547,
      "epoch": 2.163080407701019,
      "grad_norm": 3.238070487976074,
      "learning_rate": 3.498750559557827e-05,
      "loss": 0.3049,
      "mean_token_accuracy": 0.908153846859932,
      "num_tokens": 6010581.0,
      "step": 1910
    },
    {
      "entropy": 0.3294851824641228,
      "epoch": 2.17440543601359,
      "grad_norm": 3.0652968883514404,
      "learning_rate": 3.484036245298398e-05,
      "loss": 0.3024,
      "mean_token_accuracy": 0.9057962894439697,
      "num_tokens": 6042553.0,
      "step": 1920
    },
    {
      "entropy": 0.3511190369725227,
      "epoch": 2.185730464326161,
      "grad_norm": 2.9219870567321777,
      "learning_rate": 3.469281481163131e-05,
      "loss": 0.3258,
      "mean_token_accuracy": 0.9027796924114228,
      "num_tokens": 6073935.0,
      "step": 1930
    },
    {
      "entropy": 0.32454402297735213,
      "epoch": 2.197055492638732,
      "grad_norm": 2.8523194789886475,
      "learning_rate": 3.45448687366259e-05,
      "loss": 0.2839,
      "mean_token_accuracy": 0.9147208243608475,
      "num_tokens": 6106596.0,
      "step": 1940
    },
    {
      "entropy": 0.3268588811159134,
      "epoch": 2.2083805209513026,
      "grad_norm": 3.46616530418396,
      "learning_rate": 3.4396530309451416e-05,
      "loss": 0.3023,
      "mean_token_accuracy": 0.9075274288654327,
      "num_tokens": 6137408.0,
      "step": 1950
    },
    {
      "entropy": 0.34170345067977903,
      "epoch": 2.2197055492638733,
      "grad_norm": 2.899691104888916,
      "learning_rate": 3.4247805627719605e-05,
      "loss": 0.3087,
      "mean_token_accuracy": 0.9050837367773056,
      "num_tokens": 6169270.0,
      "step": 1960
    },
    {
      "entropy": 0.33029789179563523,
      "epoch": 2.231030577576444,
      "grad_norm": 3.0167901515960693,
      "learning_rate": 3.4098700804919614e-05,
      "loss": 0.2938,
      "mean_token_accuracy": 0.9097273588180542,
      "num_tokens": 6200234.0,
      "step": 1970
    },
    {
      "entropy": 0.3392042815685272,
      "epoch": 2.242355605889015,
      "grad_norm": 3.2007789611816406,
      "learning_rate": 3.394922197016671e-05,
      "loss": 0.308,
      "mean_token_accuracy": 0.9080634534358978,
      "num_tokens": 6229177.0,
      "step": 1980
    },
    {
      "entropy": 0.3530780151486397,
      "epoch": 2.2536806342015856,
      "grad_norm": 2.546180009841919,
      "learning_rate": 3.379937526795031e-05,
      "loss": 0.3062,
      "mean_token_accuracy": 0.9073823064565658,
      "num_tokens": 6257509.0,
      "step": 1990
    },
    {
      "entropy": 0.31698922738432883,
      "epoch": 2.2650056625141564,
      "grad_norm": 2.573667049407959,
      "learning_rate": 3.3649166857881444e-05,
      "loss": 0.2812,
      "mean_token_accuracy": 0.9117646932601928,
      "num_tokens": 6292493.0,
      "step": 2000
    },
    {
      "epoch": 2.2650056625141564,
      "eval_entropy": 0.3538621710752373,
      "eval_loss": 0.461500346660614,
      "eval_mean_token_accuracy": 0.8896430148117578,
      "eval_num_tokens": 6292493.0,
      "eval_runtime": 22.9271,
      "eval_samples_per_second": 46.626,
      "eval_steps_per_second": 5.845,
      "step": 2000
    },
    {
      "entropy": 0.3301647327840328,
      "epoch": 2.276330690826727,
      "grad_norm": 3.6626837253570557,
| "learning_rate": 3.349860291443952e-05, | |
| "loss": 0.3122, | |
| "mean_token_accuracy": 0.9060505300760269, | |
| "num_tokens": 6326995.0, | |
| "step": 2010 | |
| }, | |
| { | |
| "entropy": 0.3390480265021324, | |
| "epoch": 2.287655719139298, | |
| "grad_norm": 3.048062801361084, | |
| "learning_rate": 3.3347689626718535e-05, | |
| "loss": 0.301, | |
| "mean_token_accuracy": 0.9094172894954682, | |
| "num_tokens": 6358142.0, | |
| "step": 2020 | |
| }, | |
| { | |
| "entropy": 0.33652088344097136, | |
| "epoch": 2.2989807474518686, | |
| "grad_norm": 2.5861592292785645, | |
| "learning_rate": 3.319643319817266e-05, | |
| "loss": 0.3007, | |
| "mean_token_accuracy": 0.9065529972314834, | |
| "num_tokens": 6391665.0, | |
| "step": 2030 | |
| }, | |
| { | |
| "entropy": 0.34212576597929, | |
| "epoch": 2.3103057757644394, | |
| "grad_norm": 3.1473755836486816, | |
| "learning_rate": 3.304483984636124e-05, | |
| "loss": 0.3088, | |
| "mean_token_accuracy": 0.9068757712841033, | |
| "num_tokens": 6420978.0, | |
| "step": 2040 | |
| }, | |
| { | |
| "entropy": 0.3290658682584763, | |
| "epoch": 2.32163080407701, | |
| "grad_norm": 2.8407304286956787, | |
| "learning_rate": 3.289291580269322e-05, | |
| "loss": 0.2956, | |
| "mean_token_accuracy": 0.9091124355792999, | |
| "num_tokens": 6452000.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "entropy": 0.32331828474998475, | |
| "epoch": 2.332955832389581, | |
| "grad_norm": 2.915709972381592, | |
| "learning_rate": 3.2740667312170984e-05, | |
| "loss": 0.3011, | |
| "mean_token_accuracy": 0.9096018075942993, | |
| "num_tokens": 6482800.0, | |
| "step": 2060 | |
| }, | |
| { | |
| "entropy": 0.35195091664791106, | |
| "epoch": 2.3442808607021517, | |
| "grad_norm": 2.9953255653381348, | |
| "learning_rate": 3.258810063313367e-05, | |
| "loss": 0.3126, | |
| "mean_token_accuracy": 0.9040073782205582, | |
| "num_tokens": 6516807.0, | |
| "step": 2070 | |
| }, | |
| { | |
| "entropy": 0.34647743552923205, | |
| "epoch": 2.3556058890147225, | |
| "grad_norm": 2.6879382133483887, | |
| "learning_rate": 3.243522203699988e-05, | |
| "loss": 0.3188, | |
| "mean_token_accuracy": 0.9036970168352128, | |
| "num_tokens": 6548271.0, | |
| "step": 2080 | |
| }, | |
| { | |
| "entropy": 0.32736130654811857, | |
| "epoch": 2.366930917327293, | |
| "grad_norm": 3.3962409496307373, | |
| "learning_rate": 3.228203780800993e-05, | |
| "loss": 0.2937, | |
| "mean_token_accuracy": 0.91224045753479, | |
| "num_tokens": 6577404.0, | |
| "step": 2090 | |
| }, | |
| { | |
| "entropy": 0.34083098769187925, | |
| "epoch": 2.378255945639864, | |
| "grad_norm": 2.9646759033203125, | |
| "learning_rate": 3.212855424296748e-05, | |
| "loss": 0.3093, | |
| "mean_token_accuracy": 0.9033702582120895, | |
| "num_tokens": 6610996.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "entropy": 0.33483326584100725, | |
| "epoch": 2.3895809739524347, | |
| "grad_norm": 3.603712558746338, | |
| "learning_rate": 3.1974777650980735e-05, | |
| "loss": 0.3052, | |
| "mean_token_accuracy": 0.9083625108003617, | |
| "num_tokens": 6641408.0, | |
| "step": 2110 | |
| }, | |
| { | |
| "entropy": 0.36678697168827057, | |
| "epoch": 2.4009060022650055, | |
| "grad_norm": 5.152080535888672, | |
| "learning_rate": 3.182071435320309e-05, | |
| "loss": 0.3386, | |
| "mean_token_accuracy": 0.8961910039186478, | |
| "num_tokens": 6671255.0, | |
| "step": 2120 | |
| }, | |
| { | |
| "entropy": 0.3381674662232399, | |
| "epoch": 2.4122310305775763, | |
| "grad_norm": 2.51043438911438, | |
| "learning_rate": 3.1666370682573305e-05, | |
| "loss": 0.3018, | |
| "mean_token_accuracy": 0.9092491447925568, | |
| "num_tokens": 6701042.0, | |
| "step": 2130 | |
| }, | |
| { | |
| "entropy": 0.340181964635849, | |
| "epoch": 2.4235560588901475, | |
| "grad_norm": 2.988914966583252, | |
| "learning_rate": 3.151175298355515e-05, | |
| "loss": 0.3186, | |
| "mean_token_accuracy": 0.9035066097974778, | |
| "num_tokens": 6731513.0, | |
| "step": 2140 | |
| }, | |
| { | |
| "entropy": 0.3381286084651947, | |
| "epoch": 2.434881087202718, | |
| "grad_norm": 3.1499576568603516, | |
| "learning_rate": 3.1356867611876626e-05, | |
| "loss": 0.3063, | |
| "mean_token_accuracy": 0.906991371512413, | |
| "num_tokens": 6763329.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "entropy": 0.34378741681575775, | |
| "epoch": 2.446206115515289, | |
| "grad_norm": 3.016859292984009, | |
| "learning_rate": 3.120172093426873e-05, | |
| "loss": 0.3184, | |
| "mean_token_accuracy": 0.9061174720525742, | |
| "num_tokens": 6794777.0, | |
| "step": 2160 | |
| }, | |
| { | |
| "entropy": 0.336453452706337, | |
| "epoch": 2.4575311438278598, | |
| "grad_norm": 3.0973525047302246, | |
| "learning_rate": 3.10463193282037e-05, | |
| "loss": 0.304, | |
| "mean_token_accuracy": 0.9055917888879776, | |
| "num_tokens": 6826289.0, | |
| "step": 2170 | |
| }, | |
| { | |
| "entropy": 0.35441608279943465, | |
| "epoch": 2.4688561721404305, | |
| "grad_norm": 3.2018167972564697, | |
| "learning_rate": 3.08906691816329e-05, | |
| "loss": 0.3213, | |
| "mean_token_accuracy": 0.9018235206604004, | |
| "num_tokens": 6856735.0, | |
| "step": 2180 | |
| }, | |
| { | |
| "entropy": 0.3426896780729294, | |
| "epoch": 2.4801812004530013, | |
| "grad_norm": 3.111391067504883, | |
| "learning_rate": 3.073477689272422e-05, | |
| "loss": 0.3085, | |
| "mean_token_accuracy": 0.9053992748260498, | |
| "num_tokens": 6887751.0, | |
| "step": 2190 | |
| }, | |
| { | |
| "entropy": 0.3537640705704689, | |
| "epoch": 2.491506228765572, | |
| "grad_norm": 3.6062848567962646, | |
| "learning_rate": 3.057864886959907e-05, | |
| "loss": 0.3161, | |
| "mean_token_accuracy": 0.9045758932828903, | |
| "num_tokens": 6916652.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "entropy": 0.3475446105003357, | |
| "epoch": 2.502831257078143, | |
| "grad_norm": 2.8011586666107178, | |
| "learning_rate": 3.0422291530068953e-05, | |
| "loss": 0.3139, | |
| "mean_token_accuracy": 0.9058610677719117, | |
| "num_tokens": 6947964.0, | |
| "step": 2210 | |
| }, | |
| { | |
| "entropy": 0.33603944927453994, | |
| "epoch": 2.5141562853907136, | |
| "grad_norm": 2.7381720542907715, | |
| "learning_rate": 3.02657113013717e-05, | |
| "loss": 0.311, | |
| "mean_token_accuracy": 0.9063718646764756, | |
| "num_tokens": 6979191.0, | |
| "step": 2220 | |
| }, | |
| { | |
| "entropy": 0.33636826276779175, | |
| "epoch": 2.5254813137032843, | |
| "grad_norm": 2.717543363571167, | |
| "learning_rate": 3.0108914619907236e-05, | |
| "loss": 0.2989, | |
| "mean_token_accuracy": 0.9079161524772644, | |
| "num_tokens": 7011313.0, | |
| "step": 2230 | |
| }, | |
| { | |
| "entropy": 0.32530431896448136, | |
| "epoch": 2.536806342015855, | |
| "grad_norm": 3.0896191596984863, | |
| "learning_rate": 2.9951907930972994e-05, | |
| "loss": 0.3001, | |
| "mean_token_accuracy": 0.9083738714456558, | |
| "num_tokens": 7042592.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "entropy": 0.3297998860478401, | |
| "epoch": 2.548131370328426, | |
| "grad_norm": 2.8318963050842285, | |
| "learning_rate": 2.9794697688499e-05, | |
| "loss": 0.2948, | |
| "mean_token_accuracy": 0.9093282163143158, | |
| "num_tokens": 7074945.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "entropy": 0.34250654131174085, | |
| "epoch": 2.5594563986409966, | |
| "grad_norm": 3.3961706161499023, | |
| "learning_rate": 2.9637290354782587e-05, | |
| "loss": 0.3157, | |
| "mean_token_accuracy": 0.904097443819046, | |
| "num_tokens": 7107633.0, | |
| "step": 2260 | |
| }, | |
| { | |
| "entropy": 0.3470340445637703, | |
| "epoch": 2.5707814269535674, | |
| "grad_norm": 3.1472079753875732, | |
| "learning_rate": 2.9479692400222715e-05, | |
| "loss": 0.3189, | |
| "mean_token_accuracy": 0.9026219546794891, | |
| "num_tokens": 7140772.0, | |
| "step": 2270 | |
| }, | |
| { | |
| "entropy": 0.33536320328712466, | |
| "epoch": 2.582106455266138, | |
| "grad_norm": 3.536257266998291, | |
| "learning_rate": 2.932191030305401e-05, | |
| "loss": 0.3074, | |
| "mean_token_accuracy": 0.9077878296375275, | |
| "num_tokens": 7174565.0, | |
| "step": 2280 | |
| }, | |
| { | |
| "entropy": 0.34011308997869494, | |
| "epoch": 2.593431483578709, | |
| "grad_norm": 2.7522521018981934, | |
| "learning_rate": 2.916395054908052e-05, | |
| "loss": 0.3063, | |
| "mean_token_accuracy": 0.9075266391038894, | |
| "num_tokens": 7205947.0, | |
| "step": 2290 | |
| }, | |
| { | |
| "entropy": 0.3416082814335823, | |
| "epoch": 2.6047565118912797, | |
| "grad_norm": 3.5005111694335938, | |
| "learning_rate": 2.9005819631409043e-05, | |
| "loss": 0.3097, | |
| "mean_token_accuracy": 0.9054687231779098, | |
| "num_tokens": 7239660.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "entropy": 0.330567429959774, | |
| "epoch": 2.6160815402038504, | |
| "grad_norm": 2.752882480621338, | |
| "learning_rate": 2.8847524050182233e-05, | |
| "loss": 0.2994, | |
| "mean_token_accuracy": 0.9095361143350601, | |
| "num_tokens": 7271351.0, | |
| "step": 2310 | |
| }, | |
| { | |
| "entropy": 0.3262020036578178, | |
| "epoch": 2.627406568516421, | |
| "grad_norm": 2.702531337738037, | |
| "learning_rate": 2.8689070312311443e-05, | |
| "loss": 0.307, | |
| "mean_token_accuracy": 0.9070776045322418, | |
| "num_tokens": 7306146.0, | |
| "step": 2320 | |
| }, | |
| { | |
| "entropy": 0.33680719435214995, | |
| "epoch": 2.638731596828992, | |
| "grad_norm": 2.79473614692688, | |
| "learning_rate": 2.853046493120921e-05, | |
| "loss": 0.3065, | |
| "mean_token_accuracy": 0.9064619988203049, | |
| "num_tokens": 7338021.0, | |
| "step": 2330 | |
| }, | |
| { | |
| "entropy": 0.3301373228430748, | |
| "epoch": 2.6500566251415627, | |
| "grad_norm": 2.9453022480010986, | |
| "learning_rate": 2.837171442652155e-05, | |
| "loss": 0.2878, | |
| "mean_token_accuracy": 0.9134915202856064, | |
| "num_tokens": 7367317.0, | |
| "step": 2340 | |
| }, | |
| { | |
| "entropy": 0.34509565234184264, | |
| "epoch": 2.661381653454134, | |
| "grad_norm": 2.921013832092285, | |
| "learning_rate": 2.821282532385991e-05, | |
| "loss": 0.3178, | |
| "mean_token_accuracy": 0.9036821633577347, | |
| "num_tokens": 7399316.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "entropy": 0.33235773593187334, | |
| "epoch": 2.6727066817667042, | |
| "grad_norm": 3.50290846824646, | |
| "learning_rate": 2.8053804154532992e-05, | |
| "loss": 0.3025, | |
| "mean_token_accuracy": 0.9082023203372955, | |
| "num_tokens": 7429163.0, | |
| "step": 2360 | |
| }, | |
| { | |
| "entropy": 0.3544440522789955, | |
| "epoch": 2.6840317100792754, | |
| "grad_norm": 3.421252489089966, | |
| "learning_rate": 2.78946574552782e-05, | |
| "loss": 0.3255, | |
| "mean_token_accuracy": 0.9013722956180572, | |
| "num_tokens": 7459357.0, | |
| "step": 2370 | |
| }, | |
| { | |
| "entropy": 0.3445164501667023, | |
| "epoch": 2.6953567383918458, | |
| "grad_norm": 2.7874948978424072, | |
| "learning_rate": 2.7735391767993024e-05, | |
| "loss": 0.3056, | |
| "mean_token_accuracy": 0.9070187360048294, | |
| "num_tokens": 7491443.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "entropy": 0.3235954880714417, | |
| "epoch": 2.706681766704417, | |
| "grad_norm": 2.596618413925171, | |
| "learning_rate": 2.7576013639466064e-05, | |
| "loss": 0.3016, | |
| "mean_token_accuracy": 0.9080851078033447, | |
| "num_tokens": 7525034.0, | |
| "step": 2390 | |
| }, | |
| { | |
| "entropy": 0.33873881548643114, | |
| "epoch": 2.7180067950169873, | |
| "grad_norm": 2.730522394180298, | |
| "learning_rate": 2.741652962110794e-05, | |
| "loss": 0.3098, | |
| "mean_token_accuracy": 0.9070809602737426, | |
| "num_tokens": 7556123.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "entropy": 0.3413001626729965, | |
| "epoch": 2.7293318233295585, | |
| "grad_norm": 2.565798759460449, | |
| "learning_rate": 2.7256946268681997e-05, | |
| "loss": 0.3029, | |
| "mean_token_accuracy": 0.9063649326562881, | |
| "num_tokens": 7587382.0, | |
| "step": 2410 | |
| }, | |
| { | |
| "entropy": 0.32414872050285337, | |
| "epoch": 2.7406568516421292, | |
| "grad_norm": 2.687298536300659, | |
| "learning_rate": 2.7097270142034798e-05, | |
| "loss": 0.3039, | |
| "mean_token_accuracy": 0.90952388048172, | |
| "num_tokens": 7620486.0, | |
| "step": 2420 | |
| }, | |
| { | |
| "entropy": 0.33378091007471083, | |
| "epoch": 2.7519818799547, | |
| "grad_norm": 2.9899933338165283, | |
| "learning_rate": 2.6937507804826505e-05, | |
| "loss": 0.3099, | |
| "mean_token_accuracy": 0.9065448194742203, | |
| "num_tokens": 7647800.0, | |
| "step": 2430 | |
| }, | |
| { | |
| "entropy": 0.32358061224222184, | |
| "epoch": 2.7633069082672708, | |
| "grad_norm": 3.958669424057007, | |
| "learning_rate": 2.6777665824261056e-05, | |
| "loss": 0.2932, | |
| "mean_token_accuracy": 0.9114756107330322, | |
| "num_tokens": 7681853.0, | |
| "step": 2440 | |
| }, | |
| { | |
| "entropy": 0.35349722802639005, | |
| "epoch": 2.7746319365798415, | |
| "grad_norm": 2.954362154006958, | |
| "learning_rate": 2.661775077081621e-05, | |
| "loss": 0.3166, | |
| "mean_token_accuracy": 0.9041797786951065, | |
| "num_tokens": 7711414.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "entropy": 0.3320110633969307, | |
| "epoch": 2.7859569648924123, | |
| "grad_norm": 3.5596938133239746, | |
| "learning_rate": 2.645776921797347e-05, | |
| "loss": 0.3035, | |
| "mean_token_accuracy": 0.9083330810070038, | |
| "num_tokens": 7745429.0, | |
| "step": 2460 | |
| }, | |
| { | |
| "entropy": 0.331499008834362, | |
| "epoch": 2.797281993204983, | |
| "grad_norm": 2.9514622688293457, | |
| "learning_rate": 2.6297727741947876e-05, | |
| "loss": 0.2946, | |
| "mean_token_accuracy": 0.9112407445907593, | |
| "num_tokens": 7776539.0, | |
| "step": 2470 | |
| }, | |
| { | |
| "entropy": 0.34212442487478256, | |
| "epoch": 2.808607021517554, | |
| "grad_norm": 4.607098579406738, | |
| "learning_rate": 2.613763292141766e-05, | |
| "loss": 0.3161, | |
| "mean_token_accuracy": 0.9037576377391815, | |
| "num_tokens": 7807498.0, | |
| "step": 2480 | |
| }, | |
| { | |
| "entropy": 0.326346854865551, | |
| "epoch": 2.8199320498301246, | |
| "grad_norm": 2.47886323928833, | |
| "learning_rate": 2.5977491337253844e-05, | |
| "loss": 0.2944, | |
| "mean_token_accuracy": 0.911178269982338, | |
| "num_tokens": 7838306.0, | |
| "step": 2490 | |
| }, | |
| { | |
| "entropy": 0.33608004450798035, | |
| "epoch": 2.8312570781426953, | |
| "grad_norm": 3.0412232875823975, | |
| "learning_rate": 2.5817309572249736e-05, | |
| "loss": 0.3037, | |
| "mean_token_accuracy": 0.9075497031211853, | |
| "num_tokens": 7872016.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.8312570781426953, | |
| "eval_entropy": 0.327717344151504, | |
| "eval_loss": 0.38485008478164673, | |
| "eval_mean_token_accuracy": 0.9145353396437061, | |
| "eval_num_tokens": 7872016.0, | |
| "eval_runtime": 22.6936, | |
| "eval_samples_per_second": 47.106, | |
| "eval_steps_per_second": 5.905, | |
| "step": 2500 | |
| }, | |
| { | |
| "entropy": 0.3390058234333992, | |
| "epoch": 2.842582106455266, | |
| "grad_norm": 2.7344987392425537, | |
| "learning_rate": 2.565709421085028e-05, | |
| "loss": 0.3032, | |
| "mean_token_accuracy": 0.9075493305921555, | |
| "num_tokens": 7904470.0, | |
| "step": 2510 | |
| }, | |
| { | |
| "entropy": 0.3146300859749317, | |
| "epoch": 2.853907134767837, | |
| "grad_norm": 3.110588788986206, | |
| "learning_rate": 2.549685183888149e-05, | |
| "loss": 0.2803, | |
| "mean_token_accuracy": 0.9147897362709045, | |
| "num_tokens": 7935280.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "entropy": 0.3277345307171345, | |
| "epoch": 2.8652321630804076, | |
| "grad_norm": 3.927215099334717, | |
| "learning_rate": 2.5336589043279634e-05, | |
| "loss": 0.315, | |
| "mean_token_accuracy": 0.9082329630851745, | |
| "num_tokens": 7968973.0, | |
| "step": 2530 | |
| }, | |
| { | |
| "entropy": 0.31817393004894257, | |
| "epoch": 2.8765571913929784, | |
| "grad_norm": 3.023855447769165, | |
| "learning_rate": 2.5176312411820545e-05, | |
| "loss": 0.2923, | |
| "mean_token_accuracy": 0.9107245028018951, | |
| "num_tokens": 8001676.0, | |
| "step": 2540 | |
| }, | |
| { | |
| "entropy": 0.33554808497428895, | |
| "epoch": 2.887882219705549, | |
| "grad_norm": 3.3250324726104736, | |
| "learning_rate": 2.5016028532848768e-05, | |
| "loss": 0.3107, | |
| "mean_token_accuracy": 0.9068995088338851, | |
| "num_tokens": 8034188.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "entropy": 0.33894127756357195, | |
| "epoch": 2.89920724801812, | |
| "grad_norm": 2.6111295223236084, | |
| "learning_rate": 2.4855743995006785e-05, | |
| "loss": 0.3057, | |
| "mean_token_accuracy": 0.9076811999082566, | |
| "num_tokens": 8064782.0, | |
| "step": 2560 | |
| }, | |
| { | |
| "entropy": 0.34306282699108126, | |
| "epoch": 2.9105322763306907, | |
| "grad_norm": 2.726879596710205, | |
| "learning_rate": 2.4695465386964154e-05, | |
| "loss": 0.3149, | |
| "mean_token_accuracy": 0.9052368462085724, | |
| "num_tokens": 8095683.0, | |
| "step": 2570 | |
| }, | |
| { | |
| "entropy": 0.32450498044490816, | |
| "epoch": 2.921857304643262, | |
| "grad_norm": 2.7551498413085938, | |
| "learning_rate": 2.4535199297146692e-05, | |
| "loss": 0.2934, | |
| "mean_token_accuracy": 0.9113465547561646, | |
| "num_tokens": 8128181.0, | |
| "step": 2580 | |
| }, | |
| { | |
| "entropy": 0.31809509843587874, | |
| "epoch": 2.933182332955832, | |
| "grad_norm": 3.248033046722412, | |
| "learning_rate": 2.4374952313465618e-05, | |
| "loss": 0.2929, | |
| "mean_token_accuracy": 0.9110888838768005, | |
| "num_tokens": 8161222.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "entropy": 0.3428458094596863, | |
| "epoch": 2.9445073612684034, | |
| "grad_norm": 3.0945913791656494, | |
| "learning_rate": 2.4214731023046793e-05, | |
| "loss": 0.3094, | |
| "mean_token_accuracy": 0.9040931612253189, | |
| "num_tokens": 8191149.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "entropy": 0.3347173988819122, | |
| "epoch": 2.9558323895809737, | |
| "grad_norm": 3.0713417530059814, | |
| "learning_rate": 2.4054542011959925e-05, | |
| "loss": 0.3085, | |
| "mean_token_accuracy": 0.9067859917879104, | |
| "num_tokens": 8223169.0, | |
| "step": 2610 | |
| }, | |
| { | |
| "entropy": 0.33130600601434707, | |
| "epoch": 2.967157417893545, | |
| "grad_norm": 2.971564531326294, | |
| "learning_rate": 2.3894391864947845e-05, | |
| "loss": 0.3003, | |
| "mean_token_accuracy": 0.9123528331518174, | |
| "num_tokens": 8251958.0, | |
| "step": 2620 | |
| }, | |
| { | |
| "entropy": 0.3486244469881058, | |
| "epoch": 2.9784824462061152, | |
| "grad_norm": 3.0552313327789307, | |
| "learning_rate": 2.3734287165155814e-05, | |
| "loss": 0.3088, | |
| "mean_token_accuracy": 0.9068525344133377, | |
| "num_tokens": 8280414.0, | |
| "step": 2630 | |
| }, | |
| { | |
| "entropy": 0.3378919124603271, | |
| "epoch": 2.9898074745186864, | |
| "grad_norm": 3.0992133617401123, | |
| "learning_rate": 2.357423449386097e-05, | |
| "loss": 0.308, | |
| "mean_token_accuracy": 0.908593600988388, | |
| "num_tokens": 8309410.0, | |
| "step": 2640 | |
| }, | |
| { | |
| "entropy": 0.32522587329149244, | |
| "epoch": 3.001132502831257, | |
| "grad_norm": 1.8476426601409912, | |
| "learning_rate": 2.341424043020174e-05, | |
| "loss": 0.2861, | |
| "mean_token_accuracy": 0.9121149241924286, | |
| "num_tokens": 8339500.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "entropy": 0.2401589497923851, | |
| "epoch": 3.012457531143828, | |
| "grad_norm": 2.254667282104492, | |
| "learning_rate": 2.3254311550907432e-05, | |
| "loss": 0.158, | |
| "mean_token_accuracy": 0.9545799106359482, | |
| "num_tokens": 8368545.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "entropy": 0.1963147647678852, | |
| "epoch": 3.0237825594563987, | |
| "grad_norm": 3.1326498985290527, | |
| "learning_rate": 2.309445443002788e-05, | |
| "loss": 0.1527, | |
| "mean_token_accuracy": 0.9554174333810806, | |
| "num_tokens": 8398109.0, | |
| "step": 2670 | |
| }, | |
| { | |
| "entropy": 0.18867359235882758, | |
| "epoch": 3.0351075877689695, | |
| "grad_norm": 2.721315860748291, | |
| "learning_rate": 2.293467563866319e-05, | |
| "loss": 0.1467, | |
| "mean_token_accuracy": 0.9568633317947388, | |
| "num_tokens": 8427150.0, | |
| "step": 2680 | |
| }, | |
| { | |
| "entropy": 0.17226183749735355, | |
| "epoch": 3.0464326160815403, | |
| "grad_norm": 2.6865651607513428, | |
| "learning_rate": 2.2774981744693667e-05, | |
| "loss": 0.1333, | |
| "mean_token_accuracy": 0.9603831499814988, | |
| "num_tokens": 8459804.0, | |
| "step": 2690 | |
| }, | |
| { | |
| "entropy": 0.17720401659607887, | |
| "epoch": 3.057757644394111, | |
| "grad_norm": 2.5850725173950195, | |
| "learning_rate": 2.2615379312509827e-05, | |
| "loss": 0.1482, | |
| "mean_token_accuracy": 0.9568986982107163, | |
| "num_tokens": 8496137.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "entropy": 0.19257365614175798, | |
| "epoch": 3.069082672706682, | |
| "grad_norm": 2.543994188308716, | |
| "learning_rate": 2.245587490274253e-05, | |
| "loss": 0.1511, | |
| "mean_token_accuracy": 0.9563712388277054, | |
| "num_tokens": 8527404.0, | |
| "step": 2710 | |
| }, | |
| { | |
| "entropy": 0.1776495523750782, | |
| "epoch": 3.0804077010192525, | |
| "grad_norm": 2.6783289909362793, | |
| "learning_rate": 2.229647507199332e-05, | |
| "loss": 0.1394, | |
| "mean_token_accuracy": 0.9598089069128036, | |
| "num_tokens": 8561000.0, | |
| "step": 2720 | |
| }, | |
| { | |
| "entropy": 0.17914639115333558, | |
| "epoch": 3.0917327293318233, | |
| "grad_norm": 2.805384635925293, | |
| "learning_rate": 2.2137186372564918e-05, | |
| "loss": 0.1462, | |
| "mean_token_accuracy": 0.956651571393013, | |
| "num_tokens": 8593390.0, | |
| "step": 2730 | |
| }, | |
| { | |
| "entropy": 0.19363198950886726, | |
| "epoch": 3.103057757644394, | |
| "grad_norm": 3.3500022888183594, | |
| "learning_rate": 2.1978015352191864e-05, | |
| "loss": 0.1506, | |
| "mean_token_accuracy": 0.9557105898857117, | |
| "num_tokens": 8622236.0, | |
| "step": 2740 | |
| }, | |
| { | |
| "entropy": 0.18671710416674614, | |
| "epoch": 3.114382785956965, | |
| "grad_norm": 2.2808239459991455, | |
| "learning_rate": 2.181896855377138e-05, | |
| "loss": 0.1435, | |
| "mean_token_accuracy": 0.9574358344078064, | |
| "num_tokens": 8654502.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "entropy": 0.1893085241317749, | |
| "epoch": 3.1257078142695356, | |
| "grad_norm": 2.6378250122070312, | |
| "learning_rate": 2.1660052515094405e-05, | |
| "loss": 0.1526, | |
| "mean_token_accuracy": 0.9564388334751129, | |
| "num_tokens": 8685392.0, | |
| "step": 2760 | |
| }, | |
| { | |
| "entropy": 0.18678954988718033, | |
| "epoch": 3.1370328425821064, | |
| "grad_norm": 2.5639476776123047, | |
| "learning_rate": 2.1501273768576852e-05, | |
| "loss": 0.1448, | |
| "mean_token_accuracy": 0.9556661039590836, | |
| "num_tokens": 8718903.0, | |
| "step": 2770 | |
| }, | |
| { | |
| "entropy": 0.1747233547270298, | |
| "epoch": 3.148357870894677, | |
| "grad_norm": 2.5726349353790283, | |
| "learning_rate": 2.1342638840991097e-05, | |
| "loss": 0.1401, | |
| "mean_token_accuracy": 0.9590637445449829, | |
| "num_tokens": 8751352.0, | |
| "step": 2780 | |
| }, | |
| { | |
| "entropy": 0.18045611083507537, | |
| "epoch": 3.159682899207248, | |
| "grad_norm": 2.1656455993652344, | |
| "learning_rate": 2.1184154253197684e-05, | |
| "loss": 0.1461, | |
| "mean_token_accuracy": 0.9568846583366394, | |
| "num_tokens": 8784384.0, | |
| "step": 2790 | |
| }, | |
| { | |
| "entropy": 0.1883378468453884, | |
| "epoch": 3.1710079275198186, | |
| "grad_norm": 2.388462781906128, | |
| "learning_rate": 2.1025826519877282e-05, | |
| "loss": 0.144, | |
| "mean_token_accuracy": 0.9586382806301117, | |
| "num_tokens": 8813243.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "entropy": 0.18703141212463378, | |
| "epoch": 3.1823329558323894, | |
| "grad_norm": 2.7605926990509033, | |
| "learning_rate": 2.086766214926288e-05, | |
| "loss": 0.1506, | |
| "mean_token_accuracy": 0.956416517496109, | |
| "num_tokens": 8842657.0, | |
| "step": 2810 | |
| }, | |
| { | |
| "entropy": 0.18549679666757585, | |
| "epoch": 3.19365798414496, | |
| "grad_norm": 2.453981399536133, | |
| "learning_rate": 2.0709667642872256e-05, | |
| "loss": 0.1456, | |
| "mean_token_accuracy": 0.9563432067632676, | |
| "num_tokens": 8874598.0, | |
| "step": 2820 | |
| }, | |
| { | |
| "entropy": 0.18915204033255578, | |
| "epoch": 3.2049830124575314, | |
| "grad_norm": 2.598982810974121, | |
| "learning_rate": 2.055184949524075e-05, | |
| "loss": 0.1502, | |
| "mean_token_accuracy": 0.9555239349603653, | |
| "num_tokens": 8905326.0, | |
| "step": 2830 | |
| }, | |
| { | |
| "entropy": 0.18085768818855286, | |
| "epoch": 3.216308040770102, | |
| "grad_norm": 2.4260501861572266, | |
| "learning_rate": 2.039421419365427e-05, | |
| "loss": 0.1458, | |
| "mean_token_accuracy": 0.9582718342542649, | |
| "num_tokens": 8938848.0, | |
| "step": 2840 | |
| }, | |
| { | |
| "entropy": 0.183878605812788, | |
| "epoch": 3.227633069082673, | |
| "grad_norm": 2.386054039001465, | |
| "learning_rate": 2.0236768217882683e-05, | |
| "loss": 0.153, | |
| "mean_token_accuracy": 0.956705066561699, | |
| "num_tokens": 8969858.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "entropy": 0.18992681950330734, | |
| "epoch": 3.2389580973952437, | |
| "grad_norm": 3.0777018070220947, | |
| "learning_rate": 2.0079518039913343e-05, | |
| "loss": 0.1498, | |
| "mean_token_accuracy": 0.9585150212049485, | |
| "num_tokens": 8999393.0, | |
| "step": 2860 | |
| }, | |
| { | |
| "entropy": 0.18626221343874932, | |
| "epoch": 3.2502831257078144, | |
| "grad_norm": 2.023559808731079, | |
| "learning_rate": 1.992247012368517e-05, | |
| "loss": 0.1492, | |
| "mean_token_accuracy": 0.9551093459129334, | |
| "num_tokens": 9031528.0, | |
| "step": 2870 | |
| }, | |
| { | |
| "entropy": 0.18020688593387604, | |
| "epoch": 3.261608154020385, | |
| "grad_norm": 2.096430540084839, | |
| "learning_rate": 1.9765630924822886e-05, | |
| "loss": 0.1421, | |
| "mean_token_accuracy": 0.9582108706235886, | |
| "num_tokens": 9063549.0, | |
| "step": 2880 | |
| }, | |
| { | |
| "entropy": 0.18335785791277887, | |
| "epoch": 3.272933182332956, | |
| "grad_norm": 2.4209272861480713, | |
| "learning_rate": 1.9609006890371667e-05, | |
| "loss": 0.1493, | |
| "mean_token_accuracy": 0.9564682513475418, | |
| "num_tokens": 9095204.0, | |
| "step": 2890 | |
| }, | |
| { | |
| "entropy": 0.1816806599497795, | |
| "epoch": 3.2842582106455267, | |
| "grad_norm": 2.4786605834960938, | |
| "learning_rate": 1.9452604458532113e-05, | |
| "loss": 0.1396, | |
| "mean_token_accuracy": 0.9589683145284653, | |
| "num_tokens": 9128871.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "entropy": 0.18295636475086213, | |
| "epoch": 3.2955832389580975, | |
| "grad_norm": 2.6071975231170654, | |
| "learning_rate": 1.92964300583956e-05, | |
| "loss": 0.1412, | |
| "mean_token_accuracy": 0.959463146328926, | |
| "num_tokens": 9159877.0, | |
| "step": 2910 | |
| }, | |
| { | |
| "entropy": 0.1954453021287918, | |
| "epoch": 3.3069082672706682, | |
| "grad_norm": 2.99585223197937, | |
| "learning_rate": 1.914049010968003e-05, | |
| "loss": 0.155, | |
| "mean_token_accuracy": 0.9559677958488464, | |
| "num_tokens": 9191639.0, | |
| "step": 2920 | |
| }, | |
| { | |
| "entropy": 0.1915322221815586, | |
| "epoch": 3.318233295583239, | |
| "grad_norm": 2.6830849647521973, | |
| "learning_rate": 1.898479102246592e-05, | |
| "loss": 0.1487, | |
| "mean_token_accuracy": 0.9566442102193833, | |
| "num_tokens": 9222953.0, | |
| "step": 2930 | |
| }, | |
| { | |
| "entropy": 0.18220141232013704, | |
| "epoch": 3.3295583238958097, | |
| "grad_norm": 2.3461086750030518, | |
| "learning_rate": 1.8829339196932927e-05, | |
| "loss": 0.1421, | |
| "mean_token_accuracy": 0.9585667610168457, | |
| "num_tokens": 9255460.0, | |
| "step": 2940 | |
| }, | |
| { | |
| "entropy": 0.1813945546746254, | |
| "epoch": 3.3408833522083805, | |
| "grad_norm": 2.318178415298462, | |
| "learning_rate": 1.867414102309671e-05, | |
| "loss": 0.148, | |
| "mean_token_accuracy": 0.9563045471906662, | |
| "num_tokens": 9287765.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "entropy": 0.18687178418040276, | |
| "epoch": 3.3522083805209513, | |
| "grad_norm": 2.7838022708892822, | |
| "learning_rate": 1.8519202880546337e-05, | |
| "loss": 0.1443, | |
| "mean_token_accuracy": 0.9579327285289765, | |
| "num_tokens": 9320472.0, | |
| "step": 2960 | |
| }, | |
| { | |
| "entropy": 0.18875156193971634, | |
| "epoch": 3.363533408833522, | |
| "grad_norm": 2.69187068939209, | |
| "learning_rate": 1.8364531138182002e-05, | |
| "loss": 0.1471, | |
| "mean_token_accuracy": 0.9574691027402877, | |
| "num_tokens": 9351036.0, | |
| "step": 2970 | |
| }, | |
| { | |
| "entropy": 0.179339037835598, | |
| "epoch": 3.374858437146093, | |
| "grad_norm": 2.6196610927581787, | |
| "learning_rate": 1.821013215395322e-05, | |
| "loss": 0.1389, | |
| "mean_token_accuracy": 0.9601082682609559, | |
| "num_tokens": 9382386.0, | |
| "step": 2980 | |
| }, | |
| { | |
| "entropy": 0.18361469805240632, | |
| "epoch": 3.3861834654586636, | |
| "grad_norm": 2.4154880046844482, | |
| "learning_rate": 1.805601227459751e-05, | |
| "loss": 0.146, | |
| "mean_token_accuracy": 0.957899197936058, | |
| "num_tokens": 9414364.0, | |
| "step": 2990 | |
| }, | |
| { | |
| "entropy": 0.18562110662460327, | |
| "epoch": 3.3975084937712343, | |
| "grad_norm": 1.9397906064987183, | |
| "learning_rate": 1.7902177835379437e-05, | |
| "loss": 0.1424, | |
| "mean_token_accuracy": 0.9570330619812012, | |
| "num_tokens": 9446533.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 3.3975084937712343, | |
| "eval_entropy": 0.2167282881799029, | |
| "eval_loss": 0.3606521189212799, | |
| "eval_mean_token_accuracy": 0.9296634993446407, | |
| "eval_num_tokens": 9446533.0, | |
| "eval_runtime": 22.8577, | |
| "eval_samples_per_second": 46.768, | |
| "eval_steps_per_second": 5.862, | |
| "step": 3000 | |
| }, | |
| { | |
| "entropy": 0.19131319522857665, | |
| "epoch": 3.408833522083805, | |
| "grad_norm": 2.902683973312378, | |
| "learning_rate": 1.7748635159830286e-05, | |
| "loss": 0.1543, | |
| "mean_token_accuracy": 0.9540064781904221, | |
| "num_tokens": 9478106.0, | |
| "step": 3010 | |
| }, | |
| { | |
| "entropy": 0.1933427281677723, | |
| "epoch": 3.420158550396376, | |
| "grad_norm": 3.3666422367095947, | |
| "learning_rate": 1.759539055948806e-05, | |
| "loss": 0.1498, | |
| "mean_token_accuracy": 0.9571747660636902, | |
| "num_tokens": 9511243.0, | |
| "step": 3020 | |
| }, | |
| { | |
| "entropy": 0.1907684937119484, | |
| "epoch": 3.4314835787089466, | |
| "grad_norm": 2.353508710861206, | |
| "learning_rate": 1.7442450333638073e-05, | |
| "loss": 0.1544, | |
| "mean_token_accuracy": 0.9548930257558823, | |
| "num_tokens": 9540974.0, | |
| "step": 3030 | |
| }, | |
| { | |
| "entropy": 0.2047928489744663, | |
| "epoch": 3.4428086070215174, | |
| "grad_norm": 2.5164899826049805, | |
| "learning_rate": 1.728982076905396e-05, | |
| "loss": 0.1651, | |
| "mean_token_accuracy": 0.9520281046628952, | |
| "num_tokens": 9571441.0, | |
| "step": 3040 | |
| }, | |
| { | |
| "entropy": 0.18354908376932144, | |
| "epoch": 3.454133635334088, | |
| "grad_norm": 2.7729098796844482, | |
| "learning_rate": 1.7137508139739327e-05, | |
| "loss": 0.1409, | |
| "mean_token_accuracy": 0.9593415051698685, | |
| "num_tokens": 9601231.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "entropy": 0.1844599574804306, | |
| "epoch": 3.4654586636466593, | |
| "grad_norm": 2.361435890197754, | |
| "learning_rate": 1.6985518706669794e-05, | |
| "loss": 0.147, | |
| "mean_token_accuracy": 0.9563383430242538, | |
| "num_tokens": 9632314.0, | |
| "step": 3060 | |
| }, | |
| { | |
| "entropy": 0.18727928251028061, | |
| "epoch": 3.4767836919592296, | |
| "grad_norm": 3.096991539001465, | |
| "learning_rate": 1.6833858717535657e-05, | |
| "loss": 0.1489, | |
| "mean_token_accuracy": 0.9559879064559936, | |
| "num_tokens": 9664358.0, | |
| "step": 3070 | |
| }, | |
| { | |
| "entropy": 0.18697959557175636, | |
| "epoch": 3.488108720271801, | |
| "grad_norm": 2.7007927894592285, | |
| "learning_rate": 1.6682534406485055e-05, | |
| "loss": 0.1435, | |
| "mean_token_accuracy": 0.9573837488889694, | |
| "num_tokens": 9694084.0, | |
| "step": 3080 | |
| }, | |
| { | |
| "entropy": 0.17963240146636963, | |
| "epoch": 3.4994337485843716, | |
| "grad_norm": 2.9009249210357666, | |
| "learning_rate": 1.6531551993867717e-05, | |
| "loss": 0.1488, | |
| "mean_token_accuracy": 0.957426118850708, | |
| "num_tokens": 9726062.0, | |
| "step": 3090 | |
| }, | |
| { | |
| "entropy": 0.18775674030184747, | |
| "epoch": 3.5107587768969424, | |
| "grad_norm": 2.6308300495147705, | |
| "learning_rate": 1.638091768597927e-05, | |
| "loss": 0.1468, | |
| "mean_token_accuracy": 0.9567855209112167, | |
| "num_tokens": 9757366.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "entropy": 0.18580029234290124, | |
| "epoch": 3.522083805209513, | |
| "grad_norm": 2.537184715270996, | |
| "learning_rate": 1.623063767480611e-05, | |
| "loss": 0.1451, | |
| "mean_token_accuracy": 0.9578706949949265, | |
| "num_tokens": 9789224.0, | |
| "step": 3110 | |
| }, | |
| { | |
| "entropy": 0.184904894977808, | |
| "epoch": 3.533408833522084, | |
| "grad_norm": 2.5414226055145264, | |
| "learning_rate": 1.6080718137770908e-05, | |
| "loss": 0.1492, | |
| "mean_token_accuracy": 0.9576172918081284, | |
| "num_tokens": 9820220.0, | |
| "step": 3120 | |
| }, | |
| { | |
| "entropy": 0.1862082712352276, | |
| "epoch": 3.5447338618346547, | |
| "grad_norm": 2.7060234546661377, | |
| "learning_rate": 1.5931165237478618e-05, | |
| "loss": 0.1465, | |
| "mean_token_accuracy": 0.9575734734535217, | |
| "num_tokens": 9852382.0, | |
| "step": 3130 | |
| }, | |
| { | |
| "entropy": 0.19045321121811867, | |
| "epoch": 3.5560588901472254, | |
| "grad_norm": 2.51277756690979, | |
| "learning_rate": 1.5781985121463215e-05, | |
| "loss": 0.1529, | |
| "mean_token_accuracy": 0.9558474659919739, | |
| "num_tokens": 9882680.0, | |
| "step": 3140 | |
| }, | |
| { | |
| "entropy": 0.1901462681591511, | |
| "epoch": 3.567383918459796, | |
| "grad_norm": 3.2841107845306396, | |
| "learning_rate": 1.5633183921934975e-05, | |
| "loss": 0.1494, | |
| "mean_token_accuracy": 0.9553068786859512, | |
| "num_tokens": 9912907.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "entropy": 0.20788209587335588, | |
| "epoch": 3.578708946772367, | |
| "grad_norm": 2.5944294929504395, | |
| "learning_rate": 1.5484767755528396e-05, | |
| "loss": 0.1695, | |
| "mean_token_accuracy": 0.9538979262113572, | |
| "num_tokens": 9943382.0, | |
| "step": 3160 | |
| }, | |
| { | |
| "entropy": 0.19017338678240775, | |
| "epoch": 3.5900339750849377, | |
| "grad_norm": 2.511800765991211, | |
| "learning_rate": 1.5336742723050773e-05, | |
| "loss": 0.1508, | |
| "mean_token_accuracy": 0.9550507336854934, | |
| "num_tokens": 9974102.0, | |
| "step": 3170 | |
| }, | |
| { | |
| "entropy": 0.18529783561825752, | |
| "epoch": 3.6013590033975085, | |
| "grad_norm": 2.57311749458313, | |
| "learning_rate": 1.5189114909231417e-05, | |
| "loss": 0.1468, | |
| "mean_token_accuracy": 0.9569410651922226, | |
| "num_tokens": 10006688.0, | |
| "step": 3180 | |
| }, | |
| { | |
| "entropy": 0.17686643823981285, | |
| "epoch": 3.6126840317100792, | |
| "grad_norm": 2.5054280757904053, | |
| "learning_rate": 1.5041890382471529e-05, | |
| "loss": 0.1447, | |
| "mean_token_accuracy": 0.9577518045902252, | |
| "num_tokens": 10041266.0, | |
| "step": 3190 | |
| }, | |
| { | |
| "entropy": 0.17823027074337006, | |
| "epoch": 3.62400906002265, | |
| "grad_norm": 2.448424816131592, | |
| "learning_rate": 1.4895075194594772e-05, | |
| "loss": 0.1432, | |
| "mean_token_accuracy": 0.9584116190671921, | |
| "num_tokens": 10073211.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "entropy": 0.18234781473875045, | |
| "epoch": 3.6353340883352208, | |
| "grad_norm": 2.9412801265716553, | |
| "learning_rate": 1.4748675380598485e-05, | |
| "loss": 0.1437, | |
| "mean_token_accuracy": 0.958428630232811, | |
| "num_tokens": 10103170.0, | |
| "step": 3210 | |
| }, | |
| { | |
| "entropy": 0.18727488964796066, | |
| "epoch": 3.6466591166477915, | |
| "grad_norm": 2.0503034591674805, | |
| "learning_rate": 1.4602696958405604e-05, | |
| "loss": 0.1461, | |
| "mean_token_accuracy": 0.9569843918085098, | |
| "num_tokens": 10133784.0, | |
| "step": 3220 | |
| }, | |
| { | |
| "entropy": 0.18094087094068528, | |
| "epoch": 3.6579841449603623, | |
| "grad_norm": 2.4768571853637695, | |
| "learning_rate": 1.4457145928617318e-05, | |
| "loss": 0.1436, | |
| "mean_token_accuracy": 0.956751012802124, | |
| "num_tokens": 10165215.0, | |
| "step": 3230 | |
| }, | |
| { | |
| "entropy": 0.1724256232380867, | |
| "epoch": 3.669309173272933, | |
| "grad_norm": 2.701741933822632, | |
| "learning_rate": 1.4312028274266368e-05, | |
| "loss": 0.1358, | |
| "mean_token_accuracy": 0.9607989430427551, | |
| "num_tokens": 10198970.0, | |
| "step": 3240 | |
| }, | |
| { | |
| "entropy": 0.1781797580420971, | |
| "epoch": 3.680634201585504, | |
| "grad_norm": 2.8996741771698, | |
| "learning_rate": 1.4167349960571139e-05, | |
| "loss": 0.1473, | |
| "mean_token_accuracy": 0.9563544005155563, | |
| "num_tokens": 10231477.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "entropy": 0.18036175966262818, | |
| "epoch": 3.6919592298980746, | |
| "grad_norm": 2.777268409729004, | |
| "learning_rate": 1.4023116934690444e-05, | |
| "loss": 0.1398, | |
| "mean_token_accuracy": 0.9592943549156189, | |
| "num_tokens": 10264052.0, | |
| "step": 3260 | |
| }, | |
| { | |
| "entropy": 0.17828406393527985, | |
| "epoch": 3.7032842582106458, | |
| "grad_norm": 2.637403964996338, | |
| "learning_rate": 1.3879335125479054e-05, | |
| "loss": 0.136, | |
| "mean_token_accuracy": 0.9595444172620773, | |
| "num_tokens": 10295601.0, | |
| "step": 3270 | |
| }, | |
| { | |
| "entropy": 0.18349251225590707, | |
| "epoch": 3.714609286523216, | |
| "grad_norm": 2.5384583473205566, | |
| "learning_rate": 1.3736010443243987e-05, | |
| "loss": 0.1452, | |
| "mean_token_accuracy": 0.9566281735897064, | |
| "num_tokens": 10328327.0, | |
| "step": 3280 | |
| }, | |
| { | |
| "entropy": 0.1727997176349163, | |
| "epoch": 3.7259343148357873, | |
| "grad_norm": 2.7597897052764893, | |
| "learning_rate": 1.3593148779501575e-05, | |
| "loss": 0.1383, | |
| "mean_token_accuracy": 0.9598427444696427, | |
| "num_tokens": 10363214.0, | |
| "step": 3290 | |
| }, | |
| { | |
| "entropy": 0.17277977019548416, | |
| "epoch": 3.7372593431483576, | |
| "grad_norm": 2.6197760105133057, | |
| "learning_rate": 1.345075600673526e-05, | |
| "loss": 0.1351, | |
| "mean_token_accuracy": 0.9603388220071792, | |
| "num_tokens": 10394380.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "entropy": 0.18125973120331765, | |
| "epoch": 3.748584371460929, | |
| "grad_norm": 2.668290853500366, | |
| "learning_rate": 1.330883797815421e-05, | |
| "loss": 0.1351, | |
| "mean_token_accuracy": 0.9600670874118805, | |
| "num_tokens": 10426589.0, | |
| "step": 3310 | |
| }, | |
| { | |
| "entropy": 0.17329150810837746, | |
| "epoch": 3.7599093997734996, | |
| "grad_norm": 2.1576309204101562, | |
| "learning_rate": 1.3167400527452722e-05, | |
| "loss": 0.1354, | |
| "mean_token_accuracy": 0.9584385722875595, | |
| "num_tokens": 10459861.0, | |
| "step": 3320 | |
| }, | |
| { | |
| "entropy": 0.18031825870275497, | |
| "epoch": 3.7712344280860703, | |
| "grad_norm": 2.3556900024414062, | |
| "learning_rate": 1.3026449468570434e-05, | |
| "loss": 0.1431, | |
| "mean_token_accuracy": 0.9587339907884598, | |
| "num_tokens": 10490225.0, | |
| "step": 3330 | |
| }, | |
| { | |
| "entropy": 0.17993075400590897, | |
| "epoch": 3.782559456398641, | |
| "grad_norm": 2.7811639308929443, | |
| "learning_rate": 1.28859905954533e-05, | |
| "loss": 0.1413, | |
| "mean_token_accuracy": 0.9577191978693008, | |
| "num_tokens": 10520675.0, | |
| "step": 3340 | |
| }, | |
| { | |
| "entropy": 0.17930143624544143, | |
| "epoch": 3.793884484711212, | |
| "grad_norm": 2.998231887817383, | |
| "learning_rate": 1.2746029681815468e-05, | |
| "loss": 0.1378, | |
| "mean_token_accuracy": 0.9599625527858734, | |
| "num_tokens": 10551792.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "entropy": 0.17830280363559722, | |
| "epoch": 3.8052095130237826, | |
| "grad_norm": 2.379882574081421, | |
| "learning_rate": 1.2606572480901887e-05, | |
| "loss": 0.1405, | |
| "mean_token_accuracy": 0.9589870750904084, | |
| "num_tokens": 10582017.0, | |
| "step": 3360 | |
| }, | |
| { | |
| "entropy": 0.18163416758179665, | |
| "epoch": 3.8165345413363534, | |
| "grad_norm": 2.8061420917510986, | |
| "learning_rate": 1.2467624725251881e-05, | |
| "loss": 0.1398, | |
| "mean_token_accuracy": 0.9601597607135772, | |
| "num_tokens": 10612802.0, | |
| "step": 3370 | |
| }, | |
| { | |
| "entropy": 0.1735350415110588, | |
| "epoch": 3.827859569648924, | |
| "grad_norm": 2.3078725337982178, | |
| "learning_rate": 1.2329192126463466e-05, | |
| "loss": 0.1426, | |
| "mean_token_accuracy": 0.95964674949646, | |
| "num_tokens": 10644168.0, | |
| "step": 3380 | |
| }, | |
| { | |
| "entropy": 0.18496472463011743, | |
| "epoch": 3.839184597961495, | |
| "grad_norm": 2.1106555461883545, | |
| "learning_rate": 1.2191280374958558e-05, | |
| "loss": 0.1397, | |
| "mean_token_accuracy": 0.9582046836614608, | |
| "num_tokens": 10675513.0, | |
| "step": 3390 | |
| }, | |
| { | |
| "entropy": 0.18191742673516273, | |
| "epoch": 3.8505096262740657, | |
| "grad_norm": 2.747044324874878, | |
| "learning_rate": 1.2053895139749136e-05, | |
| "loss": 0.1507, | |
| "mean_token_accuracy": 0.9582120895385742, | |
| "num_tokens": 10705985.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "entropy": 0.17193833813071252, | |
| "epoch": 3.8618346545866364, | |
| "grad_norm": 2.384255886077881, | |
| "learning_rate": 1.1917042068204082e-05, | |
| "loss": 0.1298, | |
| "mean_token_accuracy": 0.9630542248487473, | |
| "num_tokens": 10737566.0, | |
| "step": 3410 | |
| }, | |
| { | |
| "entropy": 0.17727019563317298, | |
| "epoch": 3.873159682899207, | |
| "grad_norm": 2.8619980812072754, | |
| "learning_rate": 1.1780726785817161e-05, | |
| "loss": 0.1448, | |
| "mean_token_accuracy": 0.9592967689037323, | |
| "num_tokens": 10767754.0, | |
| "step": 3420 | |
| }, | |
| { | |
| "entropy": 0.1765161231160164, | |
| "epoch": 3.884484711211778, | |
| "grad_norm": 1.9828704595565796, | |
| "learning_rate": 1.1644954895975726e-05, | |
| "loss": 0.1391, | |
| "mean_token_accuracy": 0.9608483374118805, | |
| "num_tokens": 10799308.0, | |
| "step": 3430 | |
| }, | |
| { | |
| "entropy": 0.17770369723439217, | |
| "epoch": 3.8958097395243487, | |
| "grad_norm": 2.346670150756836, | |
| "learning_rate": 1.1509731979730393e-05, | |
| "loss": 0.1373, | |
| "mean_token_accuracy": 0.9591909050941467, | |
| "num_tokens": 10831057.0, | |
| "step": 3440 | |
| }, | |
| { | |
| "entropy": 0.18057307675480844, | |
| "epoch": 3.9071347678369195, | |
| "grad_norm": 2.6184489727020264, | |
| "learning_rate": 1.1375063595565597e-05, | |
| "loss": 0.1389, | |
| "mean_token_accuracy": 0.9603639721870423, | |
| "num_tokens": 10862132.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "entropy": 0.17516846358776092, | |
| "epoch": 3.9184597961494902, | |
| "grad_norm": 2.5211005210876465, | |
| "learning_rate": 1.1240955279171161e-05, | |
| "loss": 0.1355, | |
| "mean_token_accuracy": 0.9610599786043167, | |
| "num_tokens": 10892085.0, | |
| "step": 3460 | |
| }, | |
| { | |
| "entropy": 0.17097302675247192, | |
| "epoch": 3.929784824462061, | |
| "grad_norm": 2.9019627571105957, | |
| "learning_rate": 1.1107412543214707e-05, | |
| "loss": 0.1368, | |
| "mean_token_accuracy": 0.9616718500852585, | |
| "num_tokens": 10924497.0, | |
| "step": 3470 | |
| }, | |
| { | |
| "entropy": 0.17731274515390397, | |
| "epoch": 3.9411098527746318, | |
| "grad_norm": 2.090160369873047, | |
| "learning_rate": 1.0974440877115059e-05, | |
| "loss": 0.1429, | |
| "mean_token_accuracy": 0.9593401104211807, | |
| "num_tokens": 10954535.0, | |
| "step": 3480 | |
| }, | |
| { | |
| "entropy": 0.174043457955122, | |
| "epoch": 3.9524348810872025, | |
| "grad_norm": 2.19345760345459, | |
| "learning_rate": 1.0842045746816604e-05, | |
| "loss": 0.1334, | |
| "mean_token_accuracy": 0.9610940903425217, | |
| "num_tokens": 10984773.0, | |
| "step": 3490 | |
| }, | |
| { | |
| "entropy": 0.1764768786728382, | |
| "epoch": 3.9637599093997737, | |
| "grad_norm": 2.4305057525634766, | |
| "learning_rate": 1.0710232594564573e-05, | |
| "loss": 0.1395, | |
| "mean_token_accuracy": 0.9584390819072723, | |
| "num_tokens": 11016942.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 3.9637599093997737, | |
| "eval_entropy": 0.19162503318555318, | |
| "eval_loss": 0.3246709704399109, | |
| "eval_mean_token_accuracy": 0.9435568692079231, | |
| "eval_num_tokens": 11016942.0, | |
| "eval_runtime": 22.7597, | |
| "eval_samples_per_second": 46.969, | |
| "eval_steps_per_second": 5.888, | |
| "step": 3500 | |
| }, | |
| { | |
| "entropy": 0.1761431761085987, | |
| "epoch": 3.975084937712344, | |
| "grad_norm": 2.183934450149536, | |
| "learning_rate": 1.057900683868138e-05, | |
| "loss": 0.1371, | |
| "mean_token_accuracy": 0.9605765461921691, | |
| "num_tokens": 11047956.0, | |
| "step": 3510 | |
| }, | |
| { | |
| "entropy": 0.18299319073557854, | |
| "epoch": 3.9864099660249153, | |
| "grad_norm": 2.257868766784668, | |
| "learning_rate": 1.0448373873343892e-05, | |
| "loss": 0.1442, | |
| "mean_token_accuracy": 0.9576489955186844, | |
| "num_tokens": 11077078.0, | |
| "step": 3520 | |
| }, | |
| { | |
| "entropy": 0.1684267781674862, | |
| "epoch": 3.9977349943374856, | |
| "grad_norm": 2.2900640964508057, | |
| "learning_rate": 1.0318339068361657e-05, | |
| "loss": 0.1332, | |
| "mean_token_accuracy": 0.961523225903511, | |
| "num_tokens": 11110785.0, | |
| "step": 3530 | |
| }, | |
| { | |
| "entropy": 0.13921927139163018, | |
| "epoch": 4.009060022650057, | |
| "grad_norm": 1.109343409538269, | |
| "learning_rate": 1.018890776895618e-05, | |
| "loss": 0.0855, | |
| "mean_token_accuracy": 0.9792260736227035, | |
| "num_tokens": 11140853.0, | |
| "step": 3540 | |
| }, | |
| { | |
| "entropy": 0.12029264867305756, | |
| "epoch": 4.020385050962627, | |
| "grad_norm": 2.0965583324432373, | |
| "learning_rate": 1.0060085295541242e-05, | |
| "loss": 0.0692, | |
| "mean_token_accuracy": 0.982733103632927, | |
| "num_tokens": 11171432.0, | |
| "step": 3550 | |
| }, | |
| { | |
| "entropy": 0.1098148312419653, | |
| "epoch": 4.031710079275198, | |
| "grad_norm": 1.9380742311477661, | |
| "learning_rate": 9.931876943504159e-06, | |
| "loss": 0.0706, | |
| "mean_token_accuracy": 0.9828455358743667, | |
| "num_tokens": 11202059.0, | |
| "step": 3560 | |
| }, | |
| { | |
| "entropy": 0.10720174796879292, | |
| "epoch": 4.043035107587769, | |
| "grad_norm": 1.4595756530761719, | |
| "learning_rate": 9.804287982988128e-06, | |
| "loss": 0.0699, | |
| "mean_token_accuracy": 0.9834076315164566, | |
| "num_tokens": 11233844.0, | |
| "step": 3570 | |
| }, | |
| { | |
| "entropy": 0.0977464810013771, | |
| "epoch": 4.05436013590034, | |
| "grad_norm": 1.4825645685195923, | |
| "learning_rate": 9.677323658675594e-06, | |
| "loss": 0.0644, | |
| "mean_token_accuracy": 0.9842707842588425, | |
| "num_tokens": 11268276.0, | |
| "step": 3580 | |
| }, | |
| { | |
| "entropy": 0.10380081199109555, | |
| "epoch": 4.06568516421291, | |
| "grad_norm": 1.125681757926941, | |
| "learning_rate": 9.550989189572623e-06, | |
| "loss": 0.0664, | |
| "mean_token_accuracy": 0.983523941040039, | |
| "num_tokens": 11300315.0, | |
| "step": 3590 | |
| }, | |
| { | |
| "entropy": 0.11478501930832863, | |
| "epoch": 4.077010192525481, | |
| "grad_norm": 2.136237144470215, | |
| "learning_rate": 9.425289768794433e-06, | |
| "loss": 0.0741, | |
| "mean_token_accuracy": 0.9814123988151551, | |
| "num_tokens": 11328482.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "entropy": 0.10670767351984978, | |
| "epoch": 4.088335220838052, | |
| "grad_norm": 1.1652253866195679, | |
| "learning_rate": 9.300230563351877e-06, | |
| "loss": 0.0666, | |
| "mean_token_accuracy": 0.9832827031612397, | |
| "num_tokens": 11359860.0, | |
| "step": 3610 | |
| }, | |
| { | |
| "entropy": 0.10554791763424873, | |
| "epoch": 4.099660249150623, | |
| "grad_norm": 1.7422910928726196, | |
| "learning_rate": 9.17581671393908e-06, | |
| "loss": 0.0663, | |
| "mean_token_accuracy": 0.9834140241146088, | |
| "num_tokens": 11391326.0, | |
| "step": 3620 | |
| }, | |
| { | |
| "entropy": 0.09950958117842675, | |
| "epoch": 4.110985277463194, | |
| "grad_norm": 1.6092437505722046, | |
| "learning_rate": 9.052053334722075e-06, | |
| "loss": 0.0665, | |
| "mean_token_accuracy": 0.983272585272789, | |
| "num_tokens": 11424746.0, | |
| "step": 3630 | |
| }, | |
| { | |
| "entropy": 0.10113044157624244, | |
| "epoch": 4.122310305775764, | |
| "grad_norm": 1.7780877351760864, | |
| "learning_rate": 8.928945513128648e-06, | |
| "loss": 0.0679, | |
| "mean_token_accuracy": 0.9828346908092499, | |
| "num_tokens": 11456151.0, | |
| "step": 3640 | |
| }, | |
| { | |
| "entropy": 0.10017617046833038, | |
| "epoch": 4.133635334088336, | |
| "grad_norm": 1.337113857269287, | |
| "learning_rate": 8.806498309639161e-06, | |
| "loss": 0.0653, | |
| "mean_token_accuracy": 0.9838932573795318, | |
| "num_tokens": 11489218.0, | |
| "step": 3650 | |
| }, | |
| { | |
| "entropy": 0.09929984286427498, | |
| "epoch": 4.144960362400906, | |
| "grad_norm": 1.4660295248031616, | |
| "learning_rate": 8.684716757578559e-06, | |
| "loss": 0.0625, | |
| "mean_token_accuracy": 0.9850300490856171, | |
| "num_tokens": 11522506.0, | |
| "step": 3660 | |
| }, | |
| { | |
| "entropy": 0.10483775176107883, | |
| "epoch": 4.156285390713477, | |
| "grad_norm": 1.3711936473846436, | |
| "learning_rate": 8.563605862909466e-06, | |
| "loss": 0.0674, | |
| "mean_token_accuracy": 0.983280673623085, | |
| "num_tokens": 11554172.0, | |
| "step": 3670 | |
| }, | |
| { | |
| "entropy": 0.1068756926804781, | |
| "epoch": 4.1676104190260475, | |
| "grad_norm": 1.3237987756729126, | |
| "learning_rate": 8.443170604026388e-06, | |
| "loss": 0.069, | |
| "mean_token_accuracy": 0.9830775171518326, | |
| "num_tokens": 11586956.0, | |
| "step": 3680 | |
| }, | |
| { | |
| "entropy": 0.10014188215136528, | |
| "epoch": 4.178935447338619, | |
| "grad_norm": 1.6192259788513184, | |
| "learning_rate": 8.323415931551114e-06, | |
| "loss": 0.0634, | |
| "mean_token_accuracy": 0.9846415996551514, | |
| "num_tokens": 11619759.0, | |
| "step": 3690 | |
| }, | |
| { | |
| "entropy": 0.10651820972561836, | |
| "epoch": 4.190260475651189, | |
| "grad_norm": 1.3025074005126953, | |
| "learning_rate": 8.204346768129182e-06, | |
| "loss": 0.0694, | |
| "mean_token_accuracy": 0.9827026188373565, | |
| "num_tokens": 11652472.0, | |
| "step": 3700 | |
| }, | |
| { | |
| "entropy": 0.10732679478824139, | |
| "epoch": 4.20158550396376, | |
| "grad_norm": 2.4277567863464355, | |
| "learning_rate": 8.085968008227545e-06, | |
| "loss": 0.0701, | |
| "mean_token_accuracy": 0.9824820995330811, | |
| "num_tokens": 11685051.0, | |
| "step": 3710 | |
| }, | |
| { | |
| "entropy": 0.10409895218908786, | |
| "epoch": 4.2129105322763305, | |
| "grad_norm": 1.18539559841156, | |
| "learning_rate": 7.96828451793335e-06, | |
| "loss": 0.0642, | |
| "mean_token_accuracy": 0.9833295792341232, | |
| "num_tokens": 11717668.0, | |
| "step": 3720 | |
| }, | |
| { | |
| "entropy": 0.1043463621288538, | |
| "epoch": 4.224235560588902, | |
| "grad_norm": 1.7125917673110962, | |
| "learning_rate": 7.851301134753958e-06, | |
| "loss": 0.0693, | |
| "mean_token_accuracy": 0.9828597515821457, | |
| "num_tokens": 11748692.0, | |
| "step": 3730 | |
| }, | |
| { | |
| "entropy": 0.10716413073241711, | |
| "epoch": 4.235560588901472, | |
| "grad_norm": 1.7997198104858398, | |
| "learning_rate": 7.735022667418054e-06, | |
| "loss": 0.0666, | |
| "mean_token_accuracy": 0.9837399870157242, | |
| "num_tokens": 11779854.0, | |
| "step": 3740 | |
| }, | |
| { | |
| "entropy": 0.10388663597404957, | |
| "epoch": 4.246885617214043, | |
| "grad_norm": 1.4692825078964233, | |
| "learning_rate": 7.619453895678002e-06, | |
| "loss": 0.0704, | |
| "mean_token_accuracy": 0.982433345913887, | |
| "num_tokens": 11811159.0, | |
| "step": 3750 | |
| }, | |
| { | |
| "entropy": 0.09520137794315815, | |
| "epoch": 4.2582106455266135, | |
| "grad_norm": 1.4710146188735962, | |
| "learning_rate": 7.504599570113355e-06, | |
| "loss": 0.0639, | |
| "mean_token_accuracy": 0.9837541997432708, | |
| "num_tokens": 11845982.0, | |
| "step": 3760 | |
| }, | |
| { | |
| "entropy": 0.1049416147172451, | |
| "epoch": 4.269535673839185, | |
| "grad_norm": 1.5879381895065308, | |
| "learning_rate": 7.3904644119355856e-06, | |
| "loss": 0.0672, | |
| "mean_token_accuracy": 0.9831987112760544, | |
| "num_tokens": 11877055.0, | |
| "step": 3770 | |
| }, | |
| { | |
| "entropy": 0.11051239259541035, | |
| "epoch": 4.280860702151755, | |
| "grad_norm": 1.4687776565551758, | |
| "learning_rate": 7.277053112794008e-06, | |
| "loss": 0.0698, | |
| "mean_token_accuracy": 0.9820594847202301, | |
| "num_tokens": 11908759.0, | |
| "step": 3780 | |
| }, | |
| { | |
| "entropy": 0.09960917495191098, | |
| "epoch": 4.292185730464326, | |
| "grad_norm": 1.4004406929016113, | |
| "learning_rate": 7.164370334582929e-06, | |
| "loss": 0.0638, | |
| "mean_token_accuracy": 0.984798014163971, | |
| "num_tokens": 11941140.0, | |
| "step": 3790 | |
| }, | |
| { | |
| "entropy": 0.11478472761809826, | |
| "epoch": 4.303510758776897, | |
| "grad_norm": 1.6815623044967651, | |
| "learning_rate": 7.052420709250018e-06, | |
| "loss": 0.0728, | |
| "mean_token_accuracy": 0.9819641202688217, | |
| "num_tokens": 11969922.0, | |
| "step": 3800 | |
| }, | |
| { | |
| "entropy": 0.1214354656636715, | |
| "epoch": 4.314835787089468, | |
| "grad_norm": 1.6286342144012451, | |
| "learning_rate": 6.941208838605884e-06, | |
| "loss": 0.0783, | |
| "mean_token_accuracy": 0.9797982454299927, | |
| "num_tokens": 11997173.0, | |
| "step": 3810 | |
| }, | |
| { | |
| "entropy": 0.09785076789557934, | |
| "epoch": 4.326160815402038, | |
| "grad_norm": 1.5718897581100464, | |
| "learning_rate": 6.830739294134944e-06, | |
| "loss": 0.0642, | |
| "mean_token_accuracy": 0.9843629568815231, | |
| "num_tokens": 12030969.0, | |
| "step": 3820 | |
| }, | |
| { | |
| "entropy": 0.09950311519205571, | |
| "epoch": 4.337485843714609, | |
| "grad_norm": 1.7603079080581665, | |
| "learning_rate": 6.721016616807499e-06, | |
| "loss": 0.0655, | |
| "mean_token_accuracy": 0.9839122086763382, | |
| "num_tokens": 12062700.0, | |
| "step": 3830 | |
| }, | |
| { | |
| "entropy": 0.10633566826581956, | |
| "epoch": 4.34881087202718, | |
| "grad_norm": 1.8313571214675903, | |
| "learning_rate": 6.6120453168930565e-06, | |
| "loss": 0.0667, | |
| "mean_token_accuracy": 0.9825680136680603, | |
| "num_tokens": 12094124.0, | |
| "step": 3840 | |
| }, | |
| { | |
| "entropy": 0.10049457848072052, | |
| "epoch": 4.360135900339751, | |
| "grad_norm": 1.9124850034713745, | |
| "learning_rate": 6.50382987377495e-06, | |
| "loss": 0.064, | |
| "mean_token_accuracy": 0.9843475848436356, | |
| "num_tokens": 12126923.0, | |
| "step": 3850 | |
| }, | |
| { | |
| "entropy": 0.09917602241039276, | |
| "epoch": 4.371460928652322, | |
| "grad_norm": 2.690685510635376, | |
| "learning_rate": 6.396374735766181e-06, | |
| "loss": 0.0654, | |
| "mean_token_accuracy": 0.984014829993248, | |
| "num_tokens": 12162914.0, | |
| "step": 3860 | |
| }, | |
| { | |
| "entropy": 0.10630933344364166, | |
| "epoch": 4.382785956964892, | |
| "grad_norm": 1.5558531284332275, | |
| "learning_rate": 6.289684319926609e-06, | |
| "loss": 0.0688, | |
| "mean_token_accuracy": 0.9832538574934006, | |
| "num_tokens": 12193386.0, | |
| "step": 3870 | |
| }, | |
| { | |
| "entropy": 0.10831660889089108, | |
| "epoch": 4.394110985277464, | |
| "grad_norm": 1.1746289730072021, | |
| "learning_rate": 6.183763011881349e-06, | |
| "loss": 0.0678, | |
| "mean_token_accuracy": 0.9832908064126968, | |
| "num_tokens": 12224466.0, | |
| "step": 3880 | |
| }, | |
| { | |
| "entropy": 0.09632682017982006, | |
| "epoch": 4.405436013590034, | |
| "grad_norm": 1.4813034534454346, | |
| "learning_rate": 6.078615165640514e-06, | |
| "loss": 0.0625, | |
| "mean_token_accuracy": 0.9853335320949554, | |
| "num_tokens": 12258416.0, | |
| "step": 3890 | |
| }, | |
| { | |
| "entropy": 0.10236595608294011, | |
| "epoch": 4.416761041902605, | |
| "grad_norm": 1.622856616973877, | |
| "learning_rate": 5.9742451034202225e-06, | |
| "loss": 0.0669, | |
| "mean_token_accuracy": 0.9836599975824356, | |
| "num_tokens": 12291050.0, | |
| "step": 3900 | |
| }, | |
| { | |
| "entropy": 0.09877747371792793, | |
| "epoch": 4.428086070215175, | |
| "grad_norm": 1.5723583698272705, | |
| "learning_rate": 5.870657115464945e-06, | |
| "loss": 0.0649, | |
| "mean_token_accuracy": 0.9841223329305648, | |
| "num_tokens": 12323596.0, | |
| "step": 3910 | |
| }, | |
| { | |
| "entropy": 0.1077471561729908, | |
| "epoch": 4.439411098527747, | |
| "grad_norm": 1.604130744934082, | |
| "learning_rate": 5.767855459871138e-06, | |
| "loss": 0.0692, | |
| "mean_token_accuracy": 0.9839894771575928, | |
| "num_tokens": 12353006.0, | |
| "step": 3920 | |
| }, | |
| { | |
| "entropy": 0.11380970776081085, | |
| "epoch": 4.450736126840317, | |
| "grad_norm": 1.3762873411178589, | |
| "learning_rate": 5.66584436241222e-06, | |
| "loss": 0.0722, | |
| "mean_token_accuracy": 0.9826073527336121, | |
| "num_tokens": 12381072.0, | |
| "step": 3930 | |
| }, | |
| { | |
| "entropy": 0.1162498328834772, | |
| "epoch": 4.462061155152888, | |
| "grad_norm": 1.0174576044082642, | |
| "learning_rate": 5.564628016364862e-06, | |
| "loss": 0.0812, | |
| "mean_token_accuracy": 0.9817124873399734, | |
| "num_tokens": 12411917.0, | |
| "step": 3940 | |
| }, | |
| { | |
| "entropy": 0.11888325586915016, | |
| "epoch": 4.4733861834654585, | |
| "grad_norm": 1.0936172008514404, | |
| "learning_rate": 5.464210582336595e-06, | |
| "loss": 0.0778, | |
| "mean_token_accuracy": 0.9785851925611496, | |
| "num_tokens": 12444726.0, | |
| "step": 3950 | |
| }, | |
| { | |
| "entropy": 0.1004635065793991, | |
| "epoch": 4.48471121177803, | |
| "grad_norm": 1.5305367708206177, | |
| "learning_rate": 5.364596188094839e-06, | |
| "loss": 0.0646, | |
| "mean_token_accuracy": 0.9839898198843002, | |
| "num_tokens": 12478621.0, | |
| "step": 3960 | |
| }, | |
| { | |
| "entropy": 0.11077578589320183, | |
| "epoch": 4.4960362400906, | |
| "grad_norm": 1.8867307901382446, | |
| "learning_rate": 5.265788928397172e-06, | |
| "loss": 0.0737, | |
| "mean_token_accuracy": 0.981790030002594, | |
| "num_tokens": 12506545.0, | |
| "step": 3970 | |
| }, | |
| { | |
| "entropy": 0.10919649675488471, | |
| "epoch": 4.507361268403171, | |
| "grad_norm": 1.829324722290039, | |
| "learning_rate": 5.16779286482304e-06, | |
| "loss": 0.0693, | |
| "mean_token_accuracy": 0.9827600866556168, | |
| "num_tokens": 12536367.0, | |
| "step": 3980 | |
| }, | |
| { | |
| "entropy": 0.10490477122366429, | |
| "epoch": 4.5186862967157415, | |
| "grad_norm": 1.3586397171020508, | |
| "learning_rate": 5.070612025606799e-06, | |
| "loss": 0.0689, | |
| "mean_token_accuracy": 0.9822495877742767, | |
| "num_tokens": 12567694.0, | |
| "step": 3990 | |
| }, | |
| { | |
| "entropy": 0.10469603314995765, | |
| "epoch": 4.530011325028313, | |
| "grad_norm": 2.1284432411193848, | |
| "learning_rate": 4.974250405472109e-06, | |
| "loss": 0.0655, | |
| "mean_token_accuracy": 0.9836343795061111, | |
| "num_tokens": 12599121.0, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 4.530011325028313, | |
| "eval_entropy": 0.14545185185635268, | |
| "eval_loss": 0.33652952313423157, | |
| "eval_mean_token_accuracy": 0.947298993815237, | |
| "eval_num_tokens": 12599121.0, | |
| "eval_runtime": 22.7646, | |
| "eval_samples_per_second": 46.959, | |
| "eval_steps_per_second": 5.886, | |
| "step": 4000 | |
| }, | |
| { | |
| "entropy": 0.09632386714220047, | |
| "epoch": 4.541336353340883, | |
| "grad_norm": 1.4618823528289795, | |
| "learning_rate": 4.87871196546775e-06, | |
| "loss": 0.0634, | |
| "mean_token_accuracy": 0.9848664104938507, | |
| "num_tokens": 12633049.0, | |
| "step": 4010 | |
| }, | |
| { | |
| "entropy": 0.09856323637068272, | |
| "epoch": 4.552661381653454, | |
| "grad_norm": 1.6553484201431274, | |
| "learning_rate": 4.784000632804797e-06, | |
| "loss": 0.0653, | |
| "mean_token_accuracy": 0.9834588289260864, | |
| "num_tokens": 12667038.0, | |
| "step": 4020 | |
| }, | |
| { | |
| "entropy": 0.1045151598751545, | |
| "epoch": 4.563986409966025, | |
| "grad_norm": 1.4048621654510498, | |
| "learning_rate": 4.690120300695175e-06, | |
| "loss": 0.0669, | |
| "mean_token_accuracy": 0.9840848416090011, | |
| "num_tokens": 12698493.0, | |
| "step": 4030 | |
| }, | |
| { | |
| "entropy": 0.09922835528850556, | |
| "epoch": 4.575311438278596, | |
| "grad_norm": 1.3776944875717163, | |
| "learning_rate": 4.597074828191633e-06, | |
| "loss": 0.0649, | |
| "mean_token_accuracy": 0.9843086212873459, | |
| "num_tokens": 12731001.0, | |
| "step": 4040 | |
| }, | |
| { | |
| "entropy": 0.10786725878715515, | |
| "epoch": 4.586636466591166, | |
| "grad_norm": 1.3582957983016968, | |
| "learning_rate": 4.504868040029112e-06, | |
| "loss": 0.0704, | |
| "mean_token_accuracy": 0.9824901282787323, | |
| "num_tokens": 12761747.0, | |
| "step": 4050 | |
| }, | |
| { | |
| "entropy": 0.10388052202761174, | |
| "epoch": 4.597961494903737, | |
| "grad_norm": 1.312872052192688, | |
| "learning_rate": 4.4135037264675244e-06, | |
| "loss": 0.0665, | |
| "mean_token_accuracy": 0.9838407725095749, | |
| "num_tokens": 12791779.0, | |
| "step": 4060 | |
| }, | |
| { | |
| "entropy": 0.0987605456262827, | |
| "epoch": 4.6092865232163085, | |
| "grad_norm": 1.818518042564392, | |
| "learning_rate": 4.322985643135952e-06, | |
| "loss": 0.0655, | |
| "mean_token_accuracy": 0.9834914088249207, | |
| "num_tokens": 12823785.0, | |
| "step": 4070 | |
| }, | |
| { | |
| "entropy": 0.09871327206492424, | |
| "epoch": 4.620611551528879, | |
| "grad_norm": 1.7639421224594116, | |
| "learning_rate": 4.233317510878271e-06, | |
| "loss": 0.0631, | |
| "mean_token_accuracy": 0.9850008130073548, | |
| "num_tokens": 12856087.0, | |
| "step": 4080 | |
| }, | |
| { | |
| "entropy": 0.11288162134587765, | |
| "epoch": 4.631936579841449, | |
| "grad_norm": 1.7146990299224854, | |
| "learning_rate": 4.1445030156001765e-06, | |
| "loss": 0.0714, | |
| "mean_token_accuracy": 0.9824142724275589, | |
| "num_tokens": 12885222.0, | |
| "step": 4090 | |
| }, | |
| { | |
| "entropy": 0.11302966885268688, | |
| "epoch": 4.64326160815402, | |
| "grad_norm": 1.8406291007995605, | |
| "learning_rate": 4.0565458081177174e-06, | |
| "loss": 0.0713, | |
| "mean_token_accuracy": 0.9816466063261032, | |
| "num_tokens": 12915084.0, | |
| "step": 4100 | |
| }, | |
| { | |
| "entropy": 0.10371216647326946, | |
| "epoch": 4.6545866364665915, | |
| "grad_norm": 1.4874554872512817, | |
| "learning_rate": 3.969449504007189e-06, | |
| "loss": 0.0655, | |
| "mean_token_accuracy": 0.9841427594423294, | |
| "num_tokens": 12946310.0, | |
| "step": 4110 | |
| }, | |
| { | |
| "entropy": 0.09823175929486752, | |
| "epoch": 4.665911664779162, | |
| "grad_norm": 1.4839348793029785, | |
| "learning_rate": 3.883217683456522e-06, | |
| "loss": 0.0624, | |
| "mean_token_accuracy": 0.984762093424797, | |
| "num_tokens": 12979829.0, | |
| "step": 4120 | |
| }, | |
| { | |
| "entropy": 0.09992509298026561, | |
| "epoch": 4.677236693091733, | |
| "grad_norm": 1.2307822704315186, | |
| "learning_rate": 3.7978538911181054e-06, | |
| "loss": 0.065, | |
| "mean_token_accuracy": 0.9842061281204224, | |
| "num_tokens": 13011881.0, | |
| "step": 4130 | |
| }, | |
| { | |
| "entropy": 0.10551130101084709, | |
| "epoch": 4.688561721404303, | |
| "grad_norm": 1.4577821493148804, | |
| "learning_rate": 3.7133616359630974e-06, | |
| "loss": 0.0658, | |
| "mean_token_accuracy": 0.9835035592317581, | |
| "num_tokens": 13042506.0, | |
| "step": 4140 | |
| }, | |
| { | |
| "entropy": 0.10092180483043194, | |
| "epoch": 4.699886749716875, | |
| "grad_norm": 1.5127148628234863, | |
| "learning_rate": 3.6297443911371744e-06, | |
| "loss": 0.0687, | |
| "mean_token_accuracy": 0.9826838493347168, | |
| "num_tokens": 13074275.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "entropy": 0.1087297696620226, | |
| "epoch": 4.711211778029445, | |
| "grad_norm": 1.520949125289917, | |
| "learning_rate": 3.547005593817765e-06, | |
| "loss": 0.0691, | |
| "mean_token_accuracy": 0.9830662041902543, | |
| "num_tokens": 13104642.0, | |
| "step": 4160 | |
| }, | |
| { | |
| "entropy": 0.10933543741703033, | |
| "epoch": 4.722536806342016, | |
| "grad_norm": 1.3643742799758911, | |
| "learning_rate": 3.4651486450727633e-06, | |
| "loss": 0.0676, | |
| "mean_token_accuracy": 0.9833125978708267, | |
| "num_tokens": 13134253.0, | |
| "step": 4170 | |
| }, | |
| { | |
| "entropy": 0.11058039665222168, | |
| "epoch": 4.733861834654586, | |
| "grad_norm": 1.106510043144226, | |
| "learning_rate": 3.384176909720718e-06, | |
| "loss": 0.0668, | |
| "mean_token_accuracy": 0.9826488733291626, | |
| "num_tokens": 13165807.0, | |
| "step": 4180 | |
| }, | |
| { | |
| "entropy": 0.10953018330037594, | |
| "epoch": 4.745186862967158, | |
| "grad_norm": 1.3522114753723145, | |
| "learning_rate": 3.304093716192527e-06, | |
| "loss": 0.0675, | |
| "mean_token_accuracy": 0.9823530614376068, | |
| "num_tokens": 13194564.0, | |
| "step": 4190 | |
| }, | |
| { | |
| "entropy": 0.1017619101330638, | |
| "epoch": 4.756511891279728, | |
| "grad_norm": 1.3031588792800903, | |
| "learning_rate": 3.22490235639461e-06, | |
| "loss": 0.0655, | |
| "mean_token_accuracy": 0.9847893178462982, | |
| "num_tokens": 13227659.0, | |
| "step": 4200 | |
| }, | |
| { | |
| "entropy": 0.11424098871648311, | |
| "epoch": 4.767836919592299, | |
| "grad_norm": 1.2343190908432007, | |
| "learning_rate": 3.146606085573603e-06, | |
| "loss": 0.0712, | |
| "mean_token_accuracy": 0.9818738162517547, | |
| "num_tokens": 13255783.0, | |
| "step": 4210 | |
| }, | |
| { | |
| "entropy": 0.1053063541650772, | |
| "epoch": 4.7791619479048695, | |
| "grad_norm": 1.8224282264709473, | |
| "learning_rate": 3.0692081221825235e-06, | |
| "loss": 0.0655, | |
| "mean_token_accuracy": 0.9835486084222793, | |
| "num_tokens": 13286564.0, | |
| "step": 4220 | |
| }, | |
| { | |
| "entropy": 0.10083051659166813, | |
| "epoch": 4.790486976217441, | |
| "grad_norm": 1.631349802017212, | |
| "learning_rate": 2.992711647748503e-06, | |
| "loss": 0.0626, | |
| "mean_token_accuracy": 0.9853236883878708, | |
| "num_tokens": 13319319.0, | |
| "step": 4230 | |
| }, | |
| { | |
| "entropy": 0.10127135142683982, | |
| "epoch": 4.801812004530011, | |
| "grad_norm": 1.8996741771697998, | |
| "learning_rate": 2.917119806741991e-06, | |
| "loss": 0.0642, | |
| "mean_token_accuracy": 0.9843530178070068, | |
| "num_tokens": 13351309.0, | |
| "step": 4240 | |
| }, | |
| { | |
| "entropy": 0.10405859649181366, | |
| "epoch": 4.813137032842582, | |
| "grad_norm": 1.651363730430603, | |
| "learning_rate": 2.842435706447499e-06, | |
| "loss": 0.0676, | |
| "mean_token_accuracy": 0.9830576866865158, | |
| "num_tokens": 13381712.0, | |
| "step": 4250 | |
| }, | |
| { | |
| "entropy": 0.10092639103531838, | |
| "epoch": 4.8244620611551525, | |
| "grad_norm": 1.4375232458114624, | |
| "learning_rate": 2.7686624168358766e-06, | |
| "loss": 0.0674, | |
| "mean_token_accuracy": 0.9841265827417374, | |
| "num_tokens": 13413182.0, | |
| "step": 4260 | |
| }, | |
| { | |
| "entropy": 0.10942405574023724, | |
| "epoch": 4.835787089467724, | |
| "grad_norm": 1.705937385559082, | |
| "learning_rate": 2.6958029704380995e-06, | |
| "loss": 0.0691, | |
| "mean_token_accuracy": 0.9832047790288925, | |
| "num_tokens": 13443178.0, | |
| "step": 4270 | |
| }, | |
| { | |
| "entropy": 0.10540841370821, | |
| "epoch": 4.847112117780295, | |
| "grad_norm": 1.5828317403793335, | |
| "learning_rate": 2.6238603622206427e-06, | |
| "loss": 0.0681, | |
| "mean_token_accuracy": 0.9836997509002685, | |
| "num_tokens": 13472760.0, | |
| "step": 4280 | |
| }, | |
| { | |
| "entropy": 0.10759205557405949, | |
| "epoch": 4.858437146092865, | |
| "grad_norm": 2.081759452819824, | |
| "learning_rate": 2.5528375494623446e-06, | |
| "loss": 0.0691, | |
| "mean_token_accuracy": 0.9837682038545609, | |
| "num_tokens": 13502402.0, | |
| "step": 4290 | |
| }, | |
| { | |
| "entropy": 0.10228043347597122, | |
| "epoch": 4.869762174405436, | |
| "grad_norm": 1.5169647932052612, | |
| "learning_rate": 2.482737451632877e-06, | |
| "loss": 0.0656, | |
| "mean_token_accuracy": 0.9840810686349869, | |
| "num_tokens": 13533069.0, | |
| "step": 4300 | |
| }, | |
| { | |
| "entropy": 0.10236416049301625, | |
| "epoch": 4.881087202718007, | |
| "grad_norm": 1.245417833328247, | |
| "learning_rate": 2.4135629502726825e-06, | |
| "loss": 0.0656, | |
| "mean_token_accuracy": 0.9837470918893814, | |
| "num_tokens": 13564257.0, | |
| "step": 4310 | |
| }, | |
| { | |
| "entropy": 0.10065983273088933, | |
| "epoch": 4.892412231030578, | |
| "grad_norm": 1.7363886833190918, | |
| "learning_rate": 2.34531688887458e-06, | |
| "loss": 0.0646, | |
| "mean_token_accuracy": 0.9847758024930954, | |
| "num_tokens": 13594944.0, | |
| "step": 4320 | |
| }, | |
| { | |
| "entropy": 0.09625008963048458, | |
| "epoch": 4.903737259343148, | |
| "grad_norm": 1.4529474973678589, | |
| "learning_rate": 2.2780020727668523e-06, | |
| "loss": 0.063, | |
| "mean_token_accuracy": 0.9846002101898194, | |
| "num_tokens": 13626469.0, | |
| "step": 4330 | |
| }, | |
| { | |
| "entropy": 0.10502854771912098, | |
| "epoch": 4.9150622876557195, | |
| "grad_norm": 1.4248895645141602, | |
| "learning_rate": 2.211621268997932e-06, | |
| "loss": 0.0665, | |
| "mean_token_accuracy": 0.9832880735397339, | |
| "num_tokens": 13655878.0, | |
| "step": 4340 | |
| }, | |
| { | |
| "entropy": 0.09599109441041946, | |
| "epoch": 4.92638731596829, | |
| "grad_norm": 1.3314168453216553, | |
| "learning_rate": 2.1461772062226744e-06, | |
| "loss": 0.0618, | |
| "mean_token_accuracy": 0.9860332638025284, | |
| "num_tokens": 13689056.0, | |
| "step": 4350 | |
| }, | |
| { | |
| "entropy": 0.10332580804824829, | |
| "epoch": 4.937712344280861, | |
| "grad_norm": 2.152930498123169, | |
| "learning_rate": 2.0816725745901598e-06, | |
| "loss": 0.0649, | |
| "mean_token_accuracy": 0.983710652589798, | |
| "num_tokens": 13720715.0, | |
| "step": 4360 | |
| }, | |
| { | |
| "entropy": 0.10360825695097446, | |
| "epoch": 4.949037372593431, | |
| "grad_norm": 2.080280065536499, | |
| "learning_rate": 2.01811002563316e-06, | |
| "loss": 0.0649, | |
| "mean_token_accuracy": 0.9848357111215591, | |
| "num_tokens": 13751367.0, | |
| "step": 4370 | |
| }, | |
| { | |
| "entropy": 0.10016910135746002, | |
| "epoch": 4.9603624009060026, | |
| "grad_norm": 1.3621736764907837, | |
| "learning_rate": 1.955492172159096e-06, | |
| "loss": 0.0641, | |
| "mean_token_accuracy": 0.9845564514398575, | |
| "num_tokens": 13782327.0, | |
| "step": 4380 | |
| }, | |
| { | |
| "entropy": 0.09491119757294655, | |
| "epoch": 4.971687429218573, | |
| "grad_norm": 1.176149845123291, | |
| "learning_rate": 1.8938215881426773e-06, | |
| "loss": 0.0618, | |
| "mean_token_accuracy": 0.9852659702301025, | |
| "num_tokens": 13814598.0, | |
| "step": 4390 | |
| }, | |
| { | |
| "entropy": 0.09827233254909515, | |
| "epoch": 4.983012457531144, | |
| "grad_norm": 1.7675400972366333, | |
| "learning_rate": 1.8331008086200557e-06, | |
| "loss": 0.0599, | |
| "mean_token_accuracy": 0.9853399336338043, | |
| "num_tokens": 13847789.0, | |
| "step": 4400 | |
| }, | |
| { | |
| "entropy": 0.10404907390475274, | |
| "epoch": 4.994337485843714, | |
| "grad_norm": 1.3572454452514648, | |
| "learning_rate": 1.7733323295846483e-06, | |
| "loss": 0.0694, | |
| "mean_token_accuracy": 0.9828858345746994, | |
| "num_tokens": 13878498.0, | |
| "step": 4410 | |
| }, | |
| { | |
| "entropy": 0.08685027938336135, | |
| "epoch": 5.005662514156286, | |
| "grad_norm": 0.5327439904212952, | |
| "learning_rate": 1.714518607884541e-06, | |
| "loss": 0.0552, | |
| "mean_token_accuracy": 0.9876868307590485, | |
| "num_tokens": 13909014.0, | |
| "step": 4420 | |
| }, | |
| { | |
| "entropy": 0.08384513482451439, | |
| "epoch": 5.016987542468856, | |
| "grad_norm": 0.6326966285705566, | |
| "learning_rate": 1.6566620611214722e-06, | |
| "loss": 0.0448, | |
| "mean_token_accuracy": 0.9899709314107895, | |
| "num_tokens": 13940990.0, | |
| "step": 4430 | |
| }, | |
| { | |
| "entropy": 0.08707914762198925, | |
| "epoch": 5.028312570781427, | |
| "grad_norm": 0.6900177001953125, | |
| "learning_rate": 1.5997650675514703e-06, | |
| "loss": 0.0475, | |
| "mean_token_accuracy": 0.9897072285413742, | |
| "num_tokens": 13971095.0, | |
| "step": 4440 | |
| }, | |
| { | |
| "entropy": 0.0787117276340723, | |
| "epoch": 5.0396375990939974, | |
| "grad_norm": 0.6966872811317444, | |
| "learning_rate": 1.5438299659870897e-06, | |
| "loss": 0.0415, | |
| "mean_token_accuracy": 0.9909714490175248, | |
| "num_tokens": 14006355.0, | |
| "step": 4450 | |
| }, | |
| { | |
| "entropy": 0.0797785248607397, | |
| "epoch": 5.050962627406569, | |
| "grad_norm": 0.8368032574653625, | |
| "learning_rate": 1.4888590557012722e-06, | |
| "loss": 0.0425, | |
| "mean_token_accuracy": 0.9908244729042053, | |
| "num_tokens": 14039617.0, | |
| "step": 4460 | |
| }, | |
| { | |
| "entropy": 0.07591100987046957, | |
| "epoch": 5.062287655719139, | |
| "grad_norm": 0.6300851702690125, | |
| "learning_rate": 1.4348545963328326e-06, | |
| "loss": 0.0425, | |
| "mean_token_accuracy": 0.9909938126802444, | |
| "num_tokens": 14073268.0, | |
| "step": 4470 | |
| }, | |
| { | |
| "entropy": 0.08444061297923326, | |
| "epoch": 5.07361268403171, | |
| "grad_norm": 0.9781507253646851, | |
| "learning_rate": 1.3818188077935807e-06, | |
| "loss": 0.0479, | |
| "mean_token_accuracy": 0.988967090845108, | |
| "num_tokens": 14104311.0, | |
| "step": 4480 | |
| }, | |
| { | |
| "entropy": 0.08250515647232533, | |
| "epoch": 5.0849377123442805, | |
| "grad_norm": 0.8466204404830933, | |
| "learning_rate": 1.3297538701770501e-06, | |
| "loss": 0.0448, | |
| "mean_token_accuracy": 0.9896146982908249, | |
| "num_tokens": 14134612.0, | |
| "step": 4490 | |
| }, | |
| { | |
| "entropy": 0.07659826464951039, | |
| "epoch": 5.096262740656852, | |
| "grad_norm": 0.804344117641449, | |
| "learning_rate": 1.2786619236689002e-06, | |
| "loss": 0.0441, | |
| "mean_token_accuracy": 0.9899546831846238, | |
| "num_tokens": 14167795.0, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 5.096262740656852, | |
| "eval_entropy": 0.1306759690893675, | |
| "eval_loss": 0.3403843939304352, | |
| "eval_mean_token_accuracy": 0.9495465515264824, | |
| "eval_num_tokens": 14167795.0, | |
| "eval_runtime": 22.7882, | |
| "eval_samples_per_second": 46.91, | |
| "eval_steps_per_second": 5.88, | |
| "step": 4500 | |
| }, | |
| { | |
| "entropy": 0.07452163267880678, | |
| "epoch": 5.107587768969422, | |
| "grad_norm": 0.6283188462257385, | |
| "learning_rate": 1.2285450684589444e-06, | |
| "loss": 0.0425, | |
| "mean_token_accuracy": 0.9906815826892853, | |
| "num_tokens": 14201908.0, | |
| "step": 4510 | |
| }, | |
| { | |
| "entropy": 0.07540215756744147, | |
| "epoch": 5.118912797281993, | |
| "grad_norm": 0.8341459035873413, | |
| "learning_rate": 1.1794053646548038e-06, | |
| "loss": 0.0424, | |
| "mean_token_accuracy": 0.9904711753129959, | |
| "num_tokens": 14235583.0, | |
| "step": 4520 | |
| }, | |
| { | |
| "entropy": 0.07949476931244134, | |
| "epoch": 5.130237825594564, | |
| "grad_norm": 0.7760495543479919, | |
| "learning_rate": 1.1312448321972313e-06, | |
| "loss": 0.0459, | |
| "mean_token_accuracy": 0.9900894790887833, | |
| "num_tokens": 14266482.0, | |
| "step": 4530 | |
| }, | |
| { | |
| "entropy": 0.07272812370210886, | |
| "epoch": 5.141562853907135, | |
| "grad_norm": 0.8201942443847656, | |
| "learning_rate": 1.0840654507770915e-06, | |
| "loss": 0.0422, | |
| "mean_token_accuracy": 0.9906651735305786, | |
| "num_tokens": 14300188.0, | |
| "step": 4540 | |
| }, | |
| { | |
| "entropy": 0.08043113667517901, | |
| "epoch": 5.152887882219706, | |
| "grad_norm": 1.054805040359497, | |
| "learning_rate": 1.0378691597539598e-06, | |
| "loss": 0.045, | |
| "mean_token_accuracy": 0.9899501115083694, | |
| "num_tokens": 14330803.0, | |
| "step": 4550 | |
| }, | |
| { | |
| "entropy": 0.08116988949477673, | |
| "epoch": 5.164212910532276, | |
| "grad_norm": 0.7149173021316528, | |
| "learning_rate": 9.926578580764234e-07, | |
| "loss": 0.0465, | |
| "mean_token_accuracy": 0.9893332690000534, | |
| "num_tokens": 14362375.0, | |
| "step": 4560 | |
| }, | |
| { | |
| "entropy": 0.07761353813111782, | |
| "epoch": 5.1755379388448475, | |
| "grad_norm": 0.6609951257705688, | |
| "learning_rate": 9.484334042040161e-07, | |
| "loss": 0.0443, | |
| "mean_token_accuracy": 0.9897470384836197, | |
| "num_tokens": 14394138.0, | |
| "step": 4570 | |
| }, | |
| { | |
| "entropy": 0.08467995468527079, | |
| "epoch": 5.186862967157418, | |
| "grad_norm": 0.9895986318588257, | |
| "learning_rate": 9.051976160308173e-07, | |
| "loss": 0.0492, | |
| "mean_token_accuracy": 0.9889312475919724, | |
| "num_tokens": 14422968.0, | |
| "step": 4580 | |
| }, | |
| { | |
| "entropy": 0.07399331219494343, | |
| "epoch": 5.198187995469989, | |
| "grad_norm": 0.5765529870986938, | |
| "learning_rate": 8.629522708107352e-07, | |
| "loss": 0.0427, | |
| "mean_token_accuracy": 0.9902592331171036, | |
| "num_tokens": 14456688.0, | |
| "step": 4590 | |
| }, | |
| { | |
| "entropy": 0.07157274391502141, | |
| "epoch": 5.209513023782559, | |
| "grad_norm": 0.6822951436042786, | |
| "learning_rate": 8.21699105084453e-07, | |
| "loss": 0.0399, | |
| "mean_token_accuracy": 0.9916774541139602, | |
| "num_tokens": 14491708.0, | |
| "step": 4600 | |
| }, | |
| { | |
| "entropy": 0.08627440240234137, | |
| "epoch": 5.2208380520951305, | |
| "grad_norm": 1.024498701095581, | |
| "learning_rate": 7.814398146080315e-07, | |
| "loss": 0.0508, | |
| "mean_token_accuracy": 0.9888632267713546, | |
| "num_tokens": 14519565.0, | |
| "step": 4610 | |
| }, | |
| { | |
| "entropy": 0.0863100489601493, | |
| "epoch": 5.232163080407701, | |
| "grad_norm": 1.332257866859436, | |
| "learning_rate": 7.421760542832223e-07, | |
| "loss": 0.0497, | |
| "mean_token_accuracy": 0.9890684664249421, | |
| "num_tokens": 14548908.0, | |
| "step": 4620 | |
| }, | |
| { | |
| "entropy": 0.07913930304348468, | |
| "epoch": 5.243488108720272, | |
| "grad_norm": 0.6243993043899536, | |
| "learning_rate": 7.039094380894201e-07, | |
| "loss": 0.0459, | |
| "mean_token_accuracy": 0.9895853340625763, | |
| "num_tokens": 14580672.0, | |
| "step": 4630 | |
| }, | |
| { | |
| "entropy": 0.0805983567610383, | |
| "epoch": 5.254813137032842, | |
| "grad_norm": 1.0783846378326416, | |
| "learning_rate": 6.66641539017343e-07, | |
| "loss": 0.0466, | |
| "mean_token_accuracy": 0.9895779520273209, | |
| "num_tokens": 14610845.0, | |
| "step": 4640 | |
| }, | |
| { | |
| "entropy": 0.07961287405341863, | |
| "epoch": 5.266138165345414, | |
| "grad_norm": 0.718805730342865, | |
| "learning_rate": 6.303738890043486e-07, | |
| "loss": 0.0461, | |
| "mean_token_accuracy": 0.9900570958852768, | |
| "num_tokens": 14641210.0, | |
| "step": 4650 | |
| }, | |
| { | |
| "entropy": 0.07430734038352967, | |
| "epoch": 5.277463193657984, | |
| "grad_norm": 0.6966316103935242, | |
| "learning_rate": 5.951079788714813e-07, | |
| "loss": 0.0429, | |
| "mean_token_accuracy": 0.9904509067535401, | |
| "num_tokens": 14674054.0, | |
| "step": 4660 | |
| }, | |
| { | |
| "entropy": 0.07951049022376537, | |
| "epoch": 5.288788221970555, | |
| "grad_norm": 0.772232174873352, | |
| "learning_rate": 5.608452582621771e-07, | |
| "loss": 0.0461, | |
| "mean_token_accuracy": 0.9892933934926986, | |
| "num_tokens": 14705152.0, | |
| "step": 4670 | |
| }, | |
| { | |
| "entropy": 0.08211956918239594, | |
| "epoch": 5.300113250283125, | |
| "grad_norm": 0.787604033946991, | |
| "learning_rate": 5.275871355826828e-07, | |
| "loss": 0.0469, | |
| "mean_token_accuracy": 0.9894729733467102, | |
| "num_tokens": 14734245.0, | |
| "step": 4680 | |
| }, | |
| { | |
| "entropy": 0.07479157857596874, | |
| "epoch": 5.311438278595697, | |
| "grad_norm": 0.6638301014900208, | |
| "learning_rate": 4.953349779441619e-07, | |
| "loss": 0.0438, | |
| "mean_token_accuracy": 0.9901529848575592, | |
| "num_tokens": 14766102.0, | |
| "step": 4690 | |
| }, | |
| { | |
| "entropy": 0.0770484933629632, | |
| "epoch": 5.322763306908267, | |
| "grad_norm": 0.6229594945907593, | |
| "learning_rate": 4.6409011110648824e-07, | |
| "loss": 0.0446, | |
| "mean_token_accuracy": 0.9904583126306534, | |
| "num_tokens": 14797209.0, | |
| "step": 4700 | |
| }, | |
| { | |
| "entropy": 0.07668529693037271, | |
| "epoch": 5.334088335220838, | |
| "grad_norm": 0.8131431937217712, | |
| "learning_rate": 4.338538194237657e-07, | |
| "loss": 0.0441, | |
| "mean_token_accuracy": 0.9898177713155747, | |
| "num_tokens": 14829020.0, | |
| "step": 4710 | |
| }, | |
| { | |
| "entropy": 0.0804054843261838, | |
| "epoch": 5.3454133635334085, | |
| "grad_norm": 1.0860034227371216, | |
| "learning_rate": 4.046273457915084e-07, | |
| "loss": 0.0465, | |
| "mean_token_accuracy": 0.989600470662117, | |
| "num_tokens": 14859012.0, | |
| "step": 4720 | |
| }, | |
| { | |
| "entropy": 0.07926053572446108, | |
| "epoch": 5.35673839184598, | |
| "grad_norm": 0.7582584023475647, | |
| "learning_rate": 3.7641189159557946e-07, | |
| "loss": 0.0465, | |
| "mean_token_accuracy": 0.9896339029073715, | |
| "num_tokens": 14888725.0, | |
| "step": 4730 | |
| }, | |
| { | |
| "entropy": 0.07964552585035563, | |
| "epoch": 5.36806342015855, | |
| "grad_norm": 0.9673193097114563, | |
| "learning_rate": 3.4920861666279116e-07, | |
| "loss": 0.0448, | |
| "mean_token_accuracy": 0.9897122889757156, | |
| "num_tokens": 14920550.0, | |
| "step": 4740 | |
| }, | |
| { | |
| "entropy": 0.0836305595934391, | |
| "epoch": 5.379388448471121, | |
| "grad_norm": 0.9790941476821899, | |
| "learning_rate": 3.2301863921322395e-07, | |
| "loss": 0.0485, | |
| "mean_token_accuracy": 0.9896321624517441, | |
| "num_tokens": 14950387.0, | |
| "step": 4750 | |
| }, | |
| { | |
| "entropy": 0.07453758120536805, | |
| "epoch": 5.3907134767836915, | |
| "grad_norm": 0.7324961423873901, | |
| "learning_rate": 2.97843035814277e-07, | |
| "loss": 0.0427, | |
| "mean_token_accuracy": 0.9902149707078933, | |
| "num_tokens": 14984224.0, | |
| "step": 4760 | |
| }, | |
| { | |
| "entropy": 0.07159966733306647, | |
| "epoch": 5.402038505096263, | |
| "grad_norm": 0.5410106182098389, | |
| "learning_rate": 2.7368284133639234e-07, | |
| "loss": 0.0424, | |
| "mean_token_accuracy": 0.9906870782375335, | |
| "num_tokens": 15017637.0, | |
| "step": 4770 | |
| }, | |
| { | |
| "entropy": 0.07854281235486268, | |
| "epoch": 5.413363533408834, | |
| "grad_norm": 0.5609409809112549, | |
| "learning_rate": 2.505390489105419e-07, | |
| "loss": 0.0439, | |
| "mean_token_accuracy": 0.9902636379003524, | |
| "num_tokens": 15049480.0, | |
| "step": 4780 | |
| }, | |
| { | |
| "entropy": 0.08379605785012245, | |
| "epoch": 5.424688561721404, | |
| "grad_norm": 0.8252092003822327, | |
| "learning_rate": 2.2841260988738232e-07, | |
| "loss": 0.0483, | |
| "mean_token_accuracy": 0.9893587976694107, | |
| "num_tokens": 15078517.0, | |
| "step": 4790 | |
| }, | |
| { | |
| "entropy": 0.08447619508951902, | |
| "epoch": 5.436013590033975, | |
| "grad_norm": 1.0486787557601929, | |
| "learning_rate": 2.0730443379815835e-07, | |
| "loss": 0.0486, | |
| "mean_token_accuracy": 0.9892659068107605, | |
| "num_tokens": 15108019.0, | |
| "step": 4800 | |
| }, | |
| { | |
| "entropy": 0.07685894444584847, | |
| "epoch": 5.447338618346546, | |
| "grad_norm": 0.5011602640151978, | |
| "learning_rate": 1.8721538831731334e-07, | |
| "loss": 0.0434, | |
| "mean_token_accuracy": 0.9901338279247284, | |
| "num_tokens": 15140919.0, | |
| "step": 4810 | |
| }, | |
| { | |
| "entropy": 0.08181709069758654, | |
| "epoch": 5.458663646659117, | |
| "grad_norm": 0.6206848621368408, | |
| "learning_rate": 1.6814629922682624e-07, | |
| "loss": 0.0473, | |
| "mean_token_accuracy": 0.9891577929258346, | |
| "num_tokens": 15170772.0, | |
| "step": 4820 | |
| }, | |
| { | |
| "entropy": 0.08480710126459598, | |
| "epoch": 5.469988674971687, | |
| "grad_norm": 0.8081274032592773, | |
| "learning_rate": 1.5009795038225806e-07, | |
| "loss": 0.0488, | |
| "mean_token_accuracy": 0.989034104347229, | |
| "num_tokens": 15199368.0, | |
| "step": 4830 | |
| }, | |
| { | |
| "entropy": 0.08092885501682759, | |
| "epoch": 5.4813137032842585, | |
| "grad_norm": 1.0287882089614868, | |
| "learning_rate": 1.3307108368054155e-07, | |
| "loss": 0.0472, | |
| "mean_token_accuracy": 0.9893998026847839, | |
| "num_tokens": 15228987.0, | |
| "step": 4840 | |
| }, | |
| { | |
| "entropy": 0.0767004294320941, | |
| "epoch": 5.492638731596829, | |
| "grad_norm": 0.712293267250061, | |
| "learning_rate": 1.1706639902947791e-07, | |
| "loss": 0.0451, | |
| "mean_token_accuracy": 0.9901446044445038, | |
| "num_tokens": 15260156.0, | |
| "step": 4850 | |
| }, | |
| { | |
| "entropy": 0.08494163490831852, | |
| "epoch": 5.5039637599094, | |
| "grad_norm": 1.0310022830963135, | |
| "learning_rate": 1.0208455431896247e-07, | |
| "loss": 0.0471, | |
| "mean_token_accuracy": 0.9887308567762375, | |
| "num_tokens": 15289396.0, | |
| "step": 4860 | |
| }, | |
| { | |
| "entropy": 0.07497126292437314, | |
| "epoch": 5.51528878822197, | |
| "grad_norm": 0.7754374146461487, | |
| "learning_rate": 8.812616539395358e-08, | |
| "loss": 0.0428, | |
| "mean_token_accuracy": 0.9900702744722366, | |
| "num_tokens": 15322332.0, | |
| "step": 4870 | |
| }, | |
| { | |
| "entropy": 0.07183474581688643, | |
| "epoch": 5.5266138165345415, | |
| "grad_norm": 0.8449347615242004, | |
| "learning_rate": 7.519180602915121e-08, | |
| "loss": 0.0424, | |
| "mean_token_accuracy": 0.9906940549612046, | |
| "num_tokens": 15356004.0, | |
| "step": 4880 | |
| }, | |
| { | |
| "entropy": 0.0772839292883873, | |
| "epoch": 5.537938844847112, | |
| "grad_norm": 0.7060112357139587, | |
| "learning_rate": 6.328200790540472e-08, | |
| "loss": 0.0435, | |
| "mean_token_accuracy": 0.9895416975021363, | |
| "num_tokens": 15388730.0, | |
| "step": 4890 | |
| }, | |
| { | |
| "entropy": 0.08096686322242022, | |
| "epoch": 5.549263873159683, | |
| "grad_norm": 0.6333571672439575, | |
| "learning_rate": 5.239726058787198e-08, | |
| "loss": 0.0453, | |
| "mean_token_accuracy": 0.9899547636508942, | |
| "num_tokens": 15420424.0, | |
| "step": 4900 | |
| }, | |
| { | |
| "entropy": 0.0785508582368493, | |
| "epoch": 5.560588901472253, | |
| "grad_norm": 0.7323933839797974, | |
| "learning_rate": 4.253801150587711e-08, | |
| "loss": 0.0464, | |
| "mean_token_accuracy": 0.9895732134580613, | |
| "num_tokens": 15451624.0, | |
| "step": 4910 | |
| }, | |
| { | |
| "entropy": 0.07821750827133656, | |
| "epoch": 5.571913929784825, | |
| "grad_norm": 0.94260174036026, | |
| "learning_rate": 3.370466593453914e-08, | |
| "loss": 0.0444, | |
| "mean_token_accuracy": 0.989855831861496, | |
| "num_tokens": 15482992.0, | |
| "step": 4920 | |
| }, | |
| { | |
| "entropy": 0.09697492402046919, | |
| "epoch": 5.583238958097395, | |
| "grad_norm": 0.9922671318054199, | |
| "learning_rate": 2.589758697809086e-08, | |
| "loss": 0.0612, | |
| "mean_token_accuracy": 0.9866875171661377, | |
| "num_tokens": 15513562.0, | |
| "step": 4930 | |
| }, | |
| { | |
| "entropy": 0.0751221977174282, | |
| "epoch": 5.594563986409966, | |
| "grad_norm": 0.5793846845626831, | |
| "learning_rate": 1.9117095554974097e-08, | |
| "loss": 0.044, | |
| "mean_token_accuracy": 0.9899530529975891, | |
| "num_tokens": 15545720.0, | |
| "step": 4940 | |
| }, | |
| { | |
| "entropy": 0.07994864359498025, | |
| "epoch": 5.605889014722536, | |
| "grad_norm": 0.6867619752883911, | |
| "learning_rate": 1.3363470384636367e-08, | |
| "loss": 0.0458, | |
| "mean_token_accuracy": 0.9898412346839904, | |
| "num_tokens": 15577126.0, | |
| "step": 4950 | |
| }, | |
| { | |
| "entropy": 0.0812272697687149, | |
| "epoch": 5.617214043035108, | |
| "grad_norm": 0.8053082227706909, | |
| "learning_rate": 8.636947976065069e-09, | |
| "loss": 0.0473, | |
| "mean_token_accuracy": 0.9890479534864426, | |
| "num_tokens": 15607165.0, | |
| "step": 4960 | |
| }, | |
| { | |
| "entropy": 0.09132701512426138, | |
| "epoch": 5.628539071347678, | |
| "grad_norm": 0.8204180598258972, | |
| "learning_rate": 4.9377226180924444e-09, | |
| "loss": 0.0562, | |
| "mean_token_accuracy": 0.9869016110897064, | |
| "num_tokens": 15637782.0, | |
| "step": 4970 | |
| }, | |
| { | |
| "entropy": 0.08176330700516701, | |
| "epoch": 5.639864099660249, | |
| "grad_norm": 1.1686699390411377, | |
| "learning_rate": 2.2659463713770036e-09, | |
| "loss": 0.0464, | |
| "mean_token_accuracy": 0.9895883351564407, | |
| "num_tokens": 15668276.0, | |
| "step": 4980 | |
| }, | |
| { | |
| "entropy": 0.07596621736884117, | |
| "epoch": 5.65118912797282, | |
| "grad_norm": 0.7290496230125427, | |
| "learning_rate": 6.217290621807204e-10, | |
| "loss": 0.0446, | |
| "mean_token_accuracy": 0.9898297399282455, | |
| "num_tokens": 15700845.0, | |
| "step": 4990 | |
| }, | |
| { | |
| "entropy": 0.07350182663649321, | |
| "epoch": 5.662514156285391, | |
| "grad_norm": 0.7328366041183472, | |
| "learning_rate": 5.138277833771632e-12, | |
| "loss": 0.0444, | |
| "mean_token_accuracy": 0.9903534233570099, | |
| "num_tokens": 15733342.0, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 5.662514156285391, | |
| "eval_entropy": 0.12777303962676384, | |
| "eval_loss": 0.3470225930213928, | |
| "eval_mean_token_accuracy": 0.9495740079168064, | |
| "eval_num_tokens": 15733342.0, | |
| "eval_runtime": 22.7824, | |
| "eval_samples_per_second": 46.922, | |
| "eval_steps_per_second": 5.882, | |
| "step": 5000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 5000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 6, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.869577281465498e+16, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
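
The object above matches the `trainer_state.json` layout that Hugging Face `Trainer` writes into each checkpoint directory: `log_history` holds the per-`logging_steps` training records (those with a `loss` key) interleaved with the per-`eval_steps` evaluation records (those with an `eval_loss` key), followed by run-level settings such as `max_steps` and `save_steps`. As a minimal sketch of consuming this file (assuming it is saved under its conventional name `trainer_state.json` and that `matplotlib` is installed; both the path and the plotting choices are illustrative, not part of the format), the snippet below separates the two record types and plots them on one axis:

```python
import json

import matplotlib.pyplot as plt

# "trainer_state.json" is the conventional filename written by
# Hugging Face Trainer inside each checkpoint directory; adjust as needed.
with open("trainer_state.json") as f:
    state = json.load(f)

# Training records carry "loss"; evaluation records carry "eval_loss" instead.
train_records = [r for r in state["log_history"] if "loss" in r]
eval_records = [r for r in state["log_history"] if "eval_loss" in r]

plt.plot(
    [r["step"] for r in train_records],
    [r["loss"] for r in train_records],
    label="train loss",
)
plt.plot(
    [r["step"] for r in eval_records],
    [r["eval_loss"] for r in eval_records],
    marker="o",
    label="eval loss",
)
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.savefig("loss_curve.png")
```

With this log, the plot makes the divergence between the two series easy to read off: training loss keeps falling through step 5000 while `eval_loss` drifts upward across the step-4000, 4500, and 5000 evaluations, which is visible directly in the records above.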