{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.662514156285391, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.2195617377758026, "epoch": 0.011325028312570781, "grad_norm": 41.854122161865234, "learning_rate": 4.5e-06, "loss": 1.4649, "mean_token_accuracy": 0.7436085730791092, "num_tokens": 33232.0, "step": 10 }, { "entropy": 0.8825877517461777, "epoch": 0.022650056625141562, "grad_norm": 10.884156227111816, "learning_rate": 9.5e-06, "loss": 0.8606, "mean_token_accuracy": 0.8266362160444259, "num_tokens": 62241.0, "step": 20 }, { "entropy": 0.7232772573828697, "epoch": 0.03397508493771234, "grad_norm": 9.259407997131348, "learning_rate": 1.45e-05, "loss": 0.6503, "mean_token_accuracy": 0.8444810211658478, "num_tokens": 92125.0, "step": 30 }, { "entropy": 0.6392747581005096, "epoch": 0.045300113250283124, "grad_norm": 7.45441198348999, "learning_rate": 1.9500000000000003e-05, "loss": 0.5675, "mean_token_accuracy": 0.8603695809841156, "num_tokens": 125345.0, "step": 40 }, { "entropy": 0.7545703947544098, "epoch": 0.056625141562853906, "grad_norm": 7.38155460357666, "learning_rate": 2.45e-05, "loss": 0.6309, "mean_token_accuracy": 0.8437724113464355, "num_tokens": 155506.0, "step": 50 }, { "entropy": 0.7073849737644196, "epoch": 0.06795016987542468, "grad_norm": 8.455394744873047, "learning_rate": 2.95e-05, "loss": 0.5849, "mean_token_accuracy": 0.8386865645647049, "num_tokens": 186173.0, "step": 60 }, { "entropy": 0.6777370646595955, "epoch": 0.07927519818799547, "grad_norm": 10.703876495361328, "learning_rate": 3.45e-05, "loss": 0.6124, "mean_token_accuracy": 0.823593533039093, "num_tokens": 218694.0, "step": 70 }, { "entropy": 0.6481794014573097, "epoch": 0.09060022650056625, "grad_norm": 7.329402923583984, "learning_rate": 3.9500000000000005e-05, "loss": 0.6196, "mean_token_accuracy": 0.8281102895736694, "num_tokens": 251166.0, "step": 80 }, { "entropy": 0.6877432465553284, "epoch": 0.10192525481313704, "grad_norm": 7.263290882110596, "learning_rate": 4.4500000000000004e-05, "loss": 0.6643, "mean_token_accuracy": 0.81614009141922, "num_tokens": 282032.0, "step": 90 }, { "entropy": 0.7204893633723259, "epoch": 0.11325028312570781, "grad_norm": 7.8037590980529785, "learning_rate": 4.9500000000000004e-05, "loss": 0.6983, "mean_token_accuracy": 0.8097896903753281, "num_tokens": 313148.0, "step": 100 }, { "entropy": 0.760354095697403, "epoch": 0.1245753114382786, "grad_norm": 5.422664642333984, "learning_rate": 4.999958380063603e-05, "loss": 0.7468, "mean_token_accuracy": 0.799429202079773, "num_tokens": 343247.0, "step": 110 }, { "entropy": 0.7298643738031387, "epoch": 0.13590033975084936, "grad_norm": 6.69228458404541, "learning_rate": 4.999814510457652e-05, "loss": 0.7326, "mean_token_accuracy": 0.8030451983213425, "num_tokens": 375581.0, "step": 120 }, { "entropy": 0.768428310751915, "epoch": 0.14722536806342015, "grad_norm": 5.626662254333496, "learning_rate": 4.99956788326828e-05, "loss": 0.7484, "mean_token_accuracy": 0.7996043682098388, "num_tokens": 408107.0, "step": 130 }, { "entropy": 0.7457070380449295, "epoch": 0.15855039637599094, "grad_norm": 6.562707901000977, "learning_rate": 4.9992185086333655e-05, "loss": 0.7464, "mean_token_accuracy": 0.8001674145460129, "num_tokens": 438643.0, "step": 140 }, { "entropy": 0.8232061445713044, "epoch": 0.16987542468856173, "grad_norm": 5.214873790740967, "learning_rate": 4.998766400914329e-05, "loss": 0.795, "mean_token_accuracy": 0.7896791487932205, "num_tokens": 469183.0, "step": 150 }, { "entropy": 0.7592632710933686, "epoch": 0.1812004530011325, "grad_norm": 4.8459625244140625, "learning_rate": 4.99821157869555e-05, "loss": 0.7495, "mean_token_accuracy": 0.8009798556566239, "num_tokens": 499213.0, "step": 160 }, { "entropy": 0.7950803458690643, "epoch": 0.19252548131370328, "grad_norm": 4.592363357543945, "learning_rate": 4.9975540647835964e-05, "loss": 0.7862, "mean_token_accuracy": 0.7938760071992874, "num_tokens": 529845.0, "step": 170 }, { "entropy": 0.7862876445055008, "epoch": 0.20385050962627407, "grad_norm": 4.092002868652344, "learning_rate": 4.99679388620629e-05, "loss": 0.7826, "mean_token_accuracy": 0.7916485607624054, "num_tokens": 564300.0, "step": 180 }, { "entropy": 0.7688385576009751, "epoch": 0.21517553793884484, "grad_norm": 4.930974006652832, "learning_rate": 4.995931074211594e-05, "loss": 0.7503, "mean_token_accuracy": 0.8021332859992981, "num_tokens": 598731.0, "step": 190 }, { "entropy": 0.7965670645236969, "epoch": 0.22650056625141562, "grad_norm": 4.959285736083984, "learning_rate": 4.994965664266331e-05, "loss": 0.7702, "mean_token_accuracy": 0.7963653445243836, "num_tokens": 629368.0, "step": 200 }, { "entropy": 0.7948065817356109, "epoch": 0.23782559456398641, "grad_norm": 4.833596229553223, "learning_rate": 4.993897696054722e-05, "loss": 0.7944, "mean_token_accuracy": 0.789614400267601, "num_tokens": 660377.0, "step": 210 }, { "entropy": 0.7758382678031921, "epoch": 0.2491506228765572, "grad_norm": 4.224271774291992, "learning_rate": 4.9927272134767566e-05, "loss": 0.7741, "mean_token_accuracy": 0.795179745554924, "num_tokens": 690586.0, "step": 220 }, { "entropy": 0.7900628983974457, "epoch": 0.26047565118912797, "grad_norm": 3.947178602218628, "learning_rate": 4.991454264646391e-05, "loss": 0.766, "mean_token_accuracy": 0.794954827427864, "num_tokens": 725572.0, "step": 230 }, { "entropy": 0.8080787748098374, "epoch": 0.2718006795016987, "grad_norm": 3.5941126346588135, "learning_rate": 4.9900789018895636e-05, "loss": 0.7964, "mean_token_accuracy": 0.7924022555351258, "num_tokens": 756763.0, "step": 240 }, { "entropy": 0.8200718402862549, "epoch": 0.28312570781426954, "grad_norm": 3.9727747440338135, "learning_rate": 4.9886011817420526e-05, "loss": 0.7799, "mean_token_accuracy": 0.7895867377519608, "num_tokens": 787596.0, "step": 250 }, { "entropy": 0.8339032739400863, "epoch": 0.2944507361268403, "grad_norm": 4.92995548248291, "learning_rate": 4.9870211649471436e-05, "loss": 0.8152, "mean_token_accuracy": 0.7852619111537933, "num_tokens": 817469.0, "step": 260 }, { "entropy": 1.0795388698577881, "epoch": 0.3057757644394111, "grad_norm": 9.842768669128418, "learning_rate": 4.9853389164531396e-05, "loss": 1.0836, "mean_token_accuracy": 0.744838410615921, "num_tokens": 850001.0, "step": 270 }, { "entropy": 1.0132719367742538, "epoch": 0.3171007927519819, "grad_norm": 5.305619239807129, "learning_rate": 4.983554505410687e-05, "loss": 1.01, "mean_token_accuracy": 0.7551202327013016, "num_tokens": 880532.0, "step": 280 }, { "entropy": 0.8525327175855637, "epoch": 0.32842582106455265, "grad_norm": 3.3658530712127686, "learning_rate": 4.981668005169934e-05, "loss": 0.8566, "mean_token_accuracy": 0.7789583891630173, "num_tokens": 915441.0, "step": 290 }, { "entropy": 0.841290682554245, "epoch": 0.33975084937712347, "grad_norm": 4.01895809173584, "learning_rate": 4.979679493277518e-05, "loss": 0.795, "mean_token_accuracy": 0.7931433916091919, "num_tokens": 949771.0, "step": 300 }, { "entropy": 0.7752810955047608, "epoch": 0.3510758776896942, "grad_norm": 3.522529125213623, "learning_rate": 4.977589051473373e-05, "loss": 0.7492, "mean_token_accuracy": 0.8046811252832413, "num_tokens": 982523.0, "step": 310 }, { "entropy": 0.8144816100597382, "epoch": 0.362400906002265, "grad_norm": 3.3812482357025146, "learning_rate": 4.975396765687375e-05, "loss": 0.7877, "mean_token_accuracy": 0.7939431577920913, "num_tokens": 1016280.0, "step": 320 }, { "entropy": 0.8689843416213989, "epoch": 0.3737259343148358, "grad_norm": 4.172185897827148, "learning_rate": 4.9731027260358056e-05, "loss": 0.8494, "mean_token_accuracy": 0.7813519924879074, "num_tokens": 1048322.0, "step": 330 }, { "entropy": 0.7746345907449722, "epoch": 0.38505096262740657, "grad_norm": 3.5320262908935547, "learning_rate": 4.9707070268176494e-05, "loss": 0.7661, "mean_token_accuracy": 0.7966699630022049, "num_tokens": 1082053.0, "step": 340 }, { "entropy": 0.8384666532278061, "epoch": 0.39637599093997733, "grad_norm": 3.9420437812805176, "learning_rate": 4.9682097665107185e-05, "loss": 0.8145, "mean_token_accuracy": 0.7866115182638168, "num_tokens": 1113077.0, "step": 350 }, { "entropy": 0.8227119207382202, "epoch": 0.40770101925254815, "grad_norm": 3.9070684909820557, "learning_rate": 4.965611047767602e-05, "loss": 0.8011, "mean_token_accuracy": 0.7920049339532852, "num_tokens": 1141483.0, "step": 360 }, { "entropy": 0.8064505487680436, "epoch": 0.4190260475651189, "grad_norm": 3.753575563430786, "learning_rate": 4.962910977411451e-05, "loss": 0.798, "mean_token_accuracy": 0.7958554178476334, "num_tokens": 1173773.0, "step": 370 }, { "entropy": 0.80653215944767, "epoch": 0.43035107587768967, "grad_norm": 4.219732284545898, "learning_rate": 4.96010966643158e-05, "loss": 0.7906, "mean_token_accuracy": 0.7969932436943055, "num_tokens": 1204159.0, "step": 380 }, { "entropy": 0.8062887847423553, "epoch": 0.4416761041902605, "grad_norm": 3.3941314220428467, "learning_rate": 4.957207229978913e-05, "loss": 0.7938, "mean_token_accuracy": 0.7924901843070984, "num_tokens": 1236691.0, "step": 390 }, { "entropy": 0.7844055652618408, "epoch": 0.45300113250283125, "grad_norm": 3.865077018737793, "learning_rate": 4.9542037873612444e-05, "loss": 0.7839, "mean_token_accuracy": 0.7974178075790406, "num_tokens": 1272447.0, "step": 400 }, { "entropy": 0.8854492843151093, "epoch": 0.464326160815402, "grad_norm": 3.856250047683716, "learning_rate": 4.9510994620383354e-05, "loss": 0.8433, "mean_token_accuracy": 0.7830950766801834, "num_tokens": 1303921.0, "step": 410 }, { "entropy": 0.805621936917305, "epoch": 0.47565118912797283, "grad_norm": 3.9170212745666504, "learning_rate": 4.9478943816168424e-05, "loss": 0.8045, "mean_token_accuracy": 0.7906429260969162, "num_tokens": 1335991.0, "step": 420 }, { "entropy": 0.7835706263780594, "epoch": 0.4869762174405436, "grad_norm": 4.020387172698975, "learning_rate": 4.944588677845068e-05, "loss": 0.7576, "mean_token_accuracy": 0.8036289840936661, "num_tokens": 1369249.0, "step": 430 }, { "entropy": 0.7757609993219375, "epoch": 0.4983012457531144, "grad_norm": 3.977161407470703, "learning_rate": 4.941182486607546e-05, "loss": 0.7724, "mean_token_accuracy": 0.801279491186142, "num_tokens": 1399744.0, "step": 440 }, { "entropy": 0.8368120342493057, "epoch": 0.5096262740656852, "grad_norm": 3.2676987648010254, "learning_rate": 4.937675947919457e-05, "loss": 0.8203, "mean_token_accuracy": 0.7870727032423019, "num_tokens": 1431385.0, "step": 450 }, { "entropy": 0.8545293360948563, "epoch": 0.5209513023782559, "grad_norm": 4.302636623382568, "learning_rate": 4.9340692059208726e-05, "loss": 0.8189, "mean_token_accuracy": 0.7853404134511948, "num_tokens": 1460138.0, "step": 460 }, { "entropy": 0.8571904718875885, "epoch": 0.5322763306908267, "grad_norm": 3.3016743659973145, "learning_rate": 4.93036240887083e-05, "loss": 0.8428, "mean_token_accuracy": 0.7849946826696396, "num_tokens": 1492180.0, "step": 470 }, { "entropy": 0.7961454778909683, "epoch": 0.5436013590033975, "grad_norm": 3.5592470169067383, "learning_rate": 4.926555709141238e-05, "loss": 0.7875, "mean_token_accuracy": 0.7980852603912354, "num_tokens": 1524445.0, "step": 480 }, { "entropy": 0.8381227642297745, "epoch": 0.5549263873159683, "grad_norm": 3.524243116378784, "learning_rate": 4.9226492632106115e-05, "loss": 0.8016, "mean_token_accuracy": 0.789078775048256, "num_tokens": 1556464.0, "step": 490 }, { "entropy": 0.8132199555635452, "epoch": 0.5662514156285391, "grad_norm": 3.4961235523223877, "learning_rate": 4.918643231657642e-05, "loss": 0.8021, "mean_token_accuracy": 0.7943854779005051, "num_tokens": 1586084.0, "step": 500 }, { "epoch": 0.5662514156285391, "eval_entropy": 0.7580640996570018, "eval_loss": 0.7161204218864441, "eval_mean_token_accuracy": 0.8122646586218877, "eval_num_tokens": 1586084.0, "eval_runtime": 23.0676, "eval_samples_per_second": 46.342, "eval_steps_per_second": 5.809, "step": 500 }, { "entropy": 0.7857061624526978, "epoch": 0.5775764439411099, "grad_norm": 3.8294427394866943, "learning_rate": 4.914537779154596e-05, "loss": 0.7603, "mean_token_accuracy": 0.79789317548275, "num_tokens": 1619063.0, "step": 510 }, { "entropy": 0.7763534516096116, "epoch": 0.5889014722536806, "grad_norm": 3.318204402923584, "learning_rate": 4.910333074460548e-05, "loss": 0.7836, "mean_token_accuracy": 0.8022702991962433, "num_tokens": 1649064.0, "step": 520 }, { "entropy": 0.7938951075077056, "epoch": 0.6002265005662514, "grad_norm": 3.7849605083465576, "learning_rate": 4.906029290414438e-05, "loss": 0.7402, "mean_token_accuracy": 0.803567036986351, "num_tokens": 1679545.0, "step": 530 }, { "entropy": 0.9141301155090332, "epoch": 0.6115515288788222, "grad_norm": 4.45604944229126, "learning_rate": 4.9016266039279703e-05, "loss": 0.8793, "mean_token_accuracy": 0.7735334932804108, "num_tokens": 1708450.0, "step": 540 }, { "entropy": 0.8632586270570755, "epoch": 0.622876557191393, "grad_norm": 3.9162728786468506, "learning_rate": 4.897125195978344e-05, "loss": 0.8436, "mean_token_accuracy": 0.784154525399208, "num_tokens": 1739828.0, "step": 550 }, { "entropy": 0.7791282266378403, "epoch": 0.6342015855039638, "grad_norm": 3.5997514724731445, "learning_rate": 4.8925252516008066e-05, "loss": 0.7647, "mean_token_accuracy": 0.7994277358055115, "num_tokens": 1771291.0, "step": 560 }, { "entropy": 0.80836041867733, "epoch": 0.6455266138165345, "grad_norm": 16.00943946838379, "learning_rate": 4.887826959881058e-05, "loss": 0.7846, "mean_token_accuracy": 0.7990806043148041, "num_tokens": 1803223.0, "step": 570 }, { "entropy": 0.7901756346225739, "epoch": 0.6568516421291053, "grad_norm": 3.633241653442383, "learning_rate": 4.883030513947466e-05, "loss": 0.7681, "mean_token_accuracy": 0.8016709625720978, "num_tokens": 1835151.0, "step": 580 }, { "entropy": 0.8144267559051513, "epoch": 0.6681766704416761, "grad_norm": 3.5593814849853516, "learning_rate": 4.878136110963139e-05, "loss": 0.7784, "mean_token_accuracy": 0.7987326920032501, "num_tokens": 1869253.0, "step": 590 }, { "entropy": 0.7699689626693725, "epoch": 0.6795016987542469, "grad_norm": 3.547506093978882, "learning_rate": 4.873143952117812e-05, "loss": 0.7678, "mean_token_accuracy": 0.7996489554643631, "num_tokens": 1901972.0, "step": 600 }, { "entropy": 0.8226570546627044, "epoch": 0.6908267270668177, "grad_norm": 3.7377145290374756, "learning_rate": 4.868054242619582e-05, "loss": 0.7953, "mean_token_accuracy": 0.7935166180133819, "num_tokens": 1934251.0, "step": 610 }, { "entropy": 0.8604654908180237, "epoch": 0.7021517553793885, "grad_norm": 4.307958126068115, "learning_rate": 4.862867191686473e-05, "loss": 0.83, "mean_token_accuracy": 0.7884718626737595, "num_tokens": 1963277.0, "step": 620 }, { "entropy": 0.8304067760705948, "epoch": 0.7134767836919592, "grad_norm": 3.559305429458618, "learning_rate": 4.857583012537831e-05, "loss": 0.8245, "mean_token_accuracy": 0.7876487731933594, "num_tokens": 1994596.0, "step": 630 }, { "entropy": 0.7983928799629212, "epoch": 0.72480181200453, "grad_norm": 3.0747625827789307, "learning_rate": 4.852201922385564e-05, "loss": 0.7524, "mean_token_accuracy": 0.8012862592935562, "num_tokens": 2024407.0, "step": 640 }, { "entropy": 0.891300618648529, "epoch": 0.7361268403171007, "grad_norm": 4.086672306060791, "learning_rate": 4.846724142425213e-05, "loss": 0.8883, "mean_token_accuracy": 0.7746320635080337, "num_tokens": 2052217.0, "step": 650 }, { "entropy": 0.8403537899255753, "epoch": 0.7474518686296716, "grad_norm": 3.159459352493286, "learning_rate": 4.841149897826856e-05, "loss": 0.8149, "mean_token_accuracy": 0.7901445269584656, "num_tokens": 2083702.0, "step": 660 }, { "entropy": 0.761424508690834, "epoch": 0.7587768969422424, "grad_norm": 3.523196220397949, "learning_rate": 4.8354794177258575e-05, "loss": 0.7393, "mean_token_accuracy": 0.8060346513986587, "num_tokens": 2117175.0, "step": 670 }, { "entropy": 0.864872795343399, "epoch": 0.7701019252548131, "grad_norm": 3.8052961826324463, "learning_rate": 4.829712935213443e-05, "loss": 0.8493, "mean_token_accuracy": 0.7827917844057083, "num_tokens": 2145885.0, "step": 680 }, { "entropy": 0.8192390084266663, "epoch": 0.7814269535673839, "grad_norm": 3.475280284881592, "learning_rate": 4.823850687327124e-05, "loss": 0.8014, "mean_token_accuracy": 0.793234434723854, "num_tokens": 2178940.0, "step": 690 }, { "entropy": 0.8399118930101395, "epoch": 0.7927519818799547, "grad_norm": 4.192668914794922, "learning_rate": 4.817892915040948e-05, "loss": 0.8282, "mean_token_accuracy": 0.7871042519807816, "num_tokens": 2208885.0, "step": 700 }, { "entropy": 0.7586694777011871, "epoch": 0.8040770101925255, "grad_norm": 3.1814303398132324, "learning_rate": 4.811839863255602e-05, "loss": 0.73, "mean_token_accuracy": 0.8088628321886062, "num_tokens": 2240222.0, "step": 710 }, { "entropy": 0.8313590168952942, "epoch": 0.8154020385050963, "grad_norm": 3.2918899059295654, "learning_rate": 4.805691780788334e-05, "loss": 0.7996, "mean_token_accuracy": 0.795934408903122, "num_tokens": 2270756.0, "step": 720 }, { "entropy": 0.8529336482286454, "epoch": 0.8267270668176671, "grad_norm": 3.0882325172424316, "learning_rate": 4.799448920362734e-05, "loss": 0.8301, "mean_token_accuracy": 0.7841264367103576, "num_tokens": 2300237.0, "step": 730 }, { "entropy": 0.8395129233598709, "epoch": 0.8380520951302378, "grad_norm": 2.9842989444732666, "learning_rate": 4.793111538598345e-05, "loss": 0.7959, "mean_token_accuracy": 0.7929731100797653, "num_tokens": 2330775.0, "step": 740 }, { "entropy": 0.8623131781816482, "epoch": 0.8493771234428086, "grad_norm": 4.138679504394531, "learning_rate": 4.786679896000107e-05, "loss": 0.8596, "mean_token_accuracy": 0.7795057654380798, "num_tokens": 2360008.0, "step": 750 }, { "entropy": 0.8062419712543487, "epoch": 0.8607021517553793, "grad_norm": 3.422121286392212, "learning_rate": 4.7801542569476576e-05, "loss": 0.7658, "mean_token_accuracy": 0.8022167205810546, "num_tokens": 2390046.0, "step": 760 }, { "entropy": 0.8502861768007278, "epoch": 0.8720271800679502, "grad_norm": 3.099006175994873, "learning_rate": 4.773534889684458e-05, "loss": 0.8283, "mean_token_accuracy": 0.7852615088224411, "num_tokens": 2421387.0, "step": 770 }, { "entropy": 0.782235860824585, "epoch": 0.883352208380521, "grad_norm": 3.4044156074523926, "learning_rate": 4.7668220663067705e-05, "loss": 0.7658, "mean_token_accuracy": 0.8062999725341797, "num_tokens": 2450799.0, "step": 780 }, { "entropy": 0.8354818016290665, "epoch": 0.8946772366930917, "grad_norm": 4.402913570404053, "learning_rate": 4.7600160627524714e-05, "loss": 0.7934, "mean_token_accuracy": 0.796307036280632, "num_tokens": 2483454.0, "step": 790 }, { "entropy": 0.8112906068563461, "epoch": 0.9060022650056625, "grad_norm": 4.405174255371094, "learning_rate": 4.753117158789711e-05, "loss": 0.7988, "mean_token_accuracy": 0.7941813617944717, "num_tokens": 2514692.0, "step": 800 }, { "entropy": 0.792055967450142, "epoch": 0.9173272933182333, "grad_norm": 3.911775827407837, "learning_rate": 4.746125638005409e-05, "loss": 0.7967, "mean_token_accuracy": 0.7978548645973206, "num_tokens": 2546588.0, "step": 810 }, { "entropy": 0.8198580473661423, "epoch": 0.928652321630804, "grad_norm": 2.8875856399536133, "learning_rate": 4.7390417877936e-05, "loss": 0.8022, "mean_token_accuracy": 0.7922891557216645, "num_tokens": 2579566.0, "step": 820 }, { "entropy": 0.8724555522203445, "epoch": 0.9399773499433749, "grad_norm": 4.037559986114502, "learning_rate": 4.7318658993436226e-05, "loss": 0.8366, "mean_token_accuracy": 0.7826345145702363, "num_tokens": 2610767.0, "step": 830 }, { "entropy": 0.7823762893676758, "epoch": 0.9513023782559457, "grad_norm": 3.327577590942383, "learning_rate": 4.724598267628143e-05, "loss": 0.7769, "mean_token_accuracy": 0.7997746199369431, "num_tokens": 2642792.0, "step": 840 }, { "entropy": 0.7697542518377304, "epoch": 0.9626274065685164, "grad_norm": 3.0890369415283203, "learning_rate": 4.717239191391038e-05, "loss": 0.741, "mean_token_accuracy": 0.8050399273633957, "num_tokens": 2676964.0, "step": 850 }, { "entropy": 0.763307249546051, "epoch": 0.9739524348810872, "grad_norm": 4.2226643562316895, "learning_rate": 4.709788973135106e-05, "loss": 0.7547, "mean_token_accuracy": 0.8071206361055374, "num_tokens": 2708988.0, "step": 860 }, { "entropy": 0.7996066987514496, "epoch": 0.9852774631936579, "grad_norm": 3.804412603378296, "learning_rate": 4.70224791910964e-05, "loss": 0.7901, "mean_token_accuracy": 0.7994960457086563, "num_tokens": 2738235.0, "step": 870 }, { "entropy": 0.815873098373413, "epoch": 0.9966024915062288, "grad_norm": 3.5629003047943115, "learning_rate": 4.694616339297835e-05, "loss": 0.785, "mean_token_accuracy": 0.7961396902799607, "num_tokens": 2769182.0, "step": 880 }, { "entropy": 0.7122308641672135, "epoch": 1.0079275198187996, "grad_norm": 3.0285568237304688, "learning_rate": 4.686894547404045e-05, "loss": 0.6199, "mean_token_accuracy": 0.8262323826551438, "num_tokens": 2799895.0, "step": 890 }, { "entropy": 0.5145712837576866, "epoch": 1.0192525481313703, "grad_norm": 3.19512939453125, "learning_rate": 4.679082860840891e-05, "loss": 0.5098, "mean_token_accuracy": 0.851035150885582, "num_tokens": 2830787.0, "step": 900 }, { "entropy": 0.5525638118386269, "epoch": 1.030577576443941, "grad_norm": 2.943143367767334, "learning_rate": 4.6711816007162124e-05, "loss": 0.53, "mean_token_accuracy": 0.851621863245964, "num_tokens": 2861630.0, "step": 910 }, { "entropy": 0.5268796980381012, "epoch": 1.0419026047565119, "grad_norm": 3.5483663082122803, "learning_rate": 4.6631910918198654e-05, "loss": 0.4995, "mean_token_accuracy": 0.8567488580942154, "num_tokens": 2895795.0, "step": 920 }, { "entropy": 0.5167795568704605, "epoch": 1.0532276330690826, "grad_norm": 3.683345317840576, "learning_rate": 4.655111662610373e-05, "loss": 0.4901, "mean_token_accuracy": 0.8578720778226853, "num_tokens": 2928474.0, "step": 930 }, { "entropy": 0.5409791812300682, "epoch": 1.0645526613816534, "grad_norm": 2.885199546813965, "learning_rate": 4.646943645201426e-05, "loss": 0.5262, "mean_token_accuracy": 0.8517296850681305, "num_tokens": 2959541.0, "step": 940 }, { "entropy": 0.57862998098135, "epoch": 1.0758776896942241, "grad_norm": 4.052005767822266, "learning_rate": 4.638687375348227e-05, "loss": 0.5375, "mean_token_accuracy": 0.8474970668554306, "num_tokens": 2989964.0, "step": 950 }, { "entropy": 0.5536176413297653, "epoch": 1.087202718006795, "grad_norm": 3.0179316997528076, "learning_rate": 4.6303431924336934e-05, "loss": 0.5525, "mean_token_accuracy": 0.8437703341245651, "num_tokens": 3021104.0, "step": 960 }, { "entropy": 0.5437994614243508, "epoch": 1.098527746319366, "grad_norm": 3.4563395977020264, "learning_rate": 4.621911439454504e-05, "loss": 0.5062, "mean_token_accuracy": 0.8552743196487427, "num_tokens": 3054000.0, "step": 970 }, { "entropy": 0.5517205700278283, "epoch": 1.1098527746319367, "grad_norm": 3.2511813640594482, "learning_rate": 4.613392463006995e-05, "loss": 0.5222, "mean_token_accuracy": 0.8511393010616303, "num_tokens": 3082595.0, "step": 980 }, { "entropy": 0.537626887857914, "epoch": 1.1211778029445074, "grad_norm": 2.921431303024292, "learning_rate": 4.6047866132729253e-05, "loss": 0.5062, "mean_token_accuracy": 0.852451679110527, "num_tokens": 3113095.0, "step": 990 }, { "entropy": 0.5493237912654877, "epoch": 1.1325028312570782, "grad_norm": 3.2696175575256348, "learning_rate": 4.596094244005069e-05, "loss": 0.5319, "mean_token_accuracy": 0.852485916018486, "num_tokens": 3143581.0, "step": 1000 }, { "epoch": 1.1325028312570782, "eval_entropy": 0.575189925174215, "eval_loss": 0.6139051914215088, "eval_mean_token_accuracy": 0.8391392471185372, "eval_num_tokens": 3143581.0, "eval_runtime": 22.7589, "eval_samples_per_second": 46.971, "eval_steps_per_second": 5.888, "step": 1000 }, { "entropy": 0.5281848132610321, "epoch": 1.143827859569649, "grad_norm": 3.3432345390319824, "learning_rate": 4.5873157125126806e-05, "loss": 0.5143, "mean_token_accuracy": 0.8539972066879272, "num_tokens": 3176671.0, "step": 1010 }, { "entropy": 0.5357129707932472, "epoch": 1.1551528878822197, "grad_norm": 2.681213617324829, "learning_rate": 4.578451379646808e-05, "loss": 0.5176, "mean_token_accuracy": 0.8533397912979126, "num_tokens": 3207113.0, "step": 1020 }, { "entropy": 0.5573878586292267, "epoch": 1.1664779161947905, "grad_norm": 2.8207201957702637, "learning_rate": 4.569501609785455e-05, "loss": 0.5267, "mean_token_accuracy": 0.8482785791158676, "num_tokens": 3242168.0, "step": 1030 }, { "entropy": 0.5452752098441124, "epoch": 1.1778029445073612, "grad_norm": 3.3376107215881348, "learning_rate": 4.560466770818608e-05, "loss": 0.5172, "mean_token_accuracy": 0.8519764631986618, "num_tokens": 3273260.0, "step": 1040 }, { "entropy": 0.5476395100355148, "epoch": 1.189127972819932, "grad_norm": 3.6320974826812744, "learning_rate": 4.5513472341331076e-05, "loss": 0.5439, "mean_token_accuracy": 0.8460662096738816, "num_tokens": 3302219.0, "step": 1050 }, { "entropy": 0.5614733591675758, "epoch": 1.2004530011325028, "grad_norm": 2.790870428085327, "learning_rate": 4.542143374597391e-05, "loss": 0.5218, "mean_token_accuracy": 0.8489633470773696, "num_tokens": 3336388.0, "step": 1060 }, { "entropy": 0.5463294044137001, "epoch": 1.2117780294450737, "grad_norm": 3.163102388381958, "learning_rate": 4.5328555705460716e-05, "loss": 0.5324, "mean_token_accuracy": 0.847563111782074, "num_tokens": 3368580.0, "step": 1070 }, { "entropy": 0.5477871373295784, "epoch": 1.2231030577576445, "grad_norm": 2.990779161453247, "learning_rate": 4.5234842037643985e-05, "loss": 0.5314, "mean_token_accuracy": 0.8470851451158523, "num_tokens": 3399370.0, "step": 1080 }, { "entropy": 0.6027134999632835, "epoch": 1.2344280860702153, "grad_norm": 3.5813815593719482, "learning_rate": 4.5140296594725544e-05, "loss": 0.5768, "mean_token_accuracy": 0.8372454822063446, "num_tokens": 3433838.0, "step": 1090 }, { "entropy": 0.5187774360179901, "epoch": 1.245753114382786, "grad_norm": 3.008913993835449, "learning_rate": 4.504492326309824e-05, "loss": 0.5077, "mean_token_accuracy": 0.8522711008787155, "num_tokens": 3465362.0, "step": 1100 }, { "entropy": 0.5649975180625916, "epoch": 1.2570781426953568, "grad_norm": 3.3389503955841064, "learning_rate": 4.494872596318618e-05, "loss": 0.5394, "mean_token_accuracy": 0.8493940323591233, "num_tokens": 3497415.0, "step": 1110 }, { "entropy": 0.6041379794478416, "epoch": 1.2684031710079275, "grad_norm": 4.550158977508545, "learning_rate": 4.48517086492836e-05, "loss": 0.5791, "mean_token_accuracy": 0.8398642212152481, "num_tokens": 3528071.0, "step": 1120 }, { "entropy": 0.5308591544628143, "epoch": 1.2797281993204983, "grad_norm": 3.043844223022461, "learning_rate": 4.4753875309392266e-05, "loss": 0.5014, "mean_token_accuracy": 0.8557395815849305, "num_tokens": 3562511.0, "step": 1130 }, { "entropy": 0.5395269095897675, "epoch": 1.291053227633069, "grad_norm": 3.781155586242676, "learning_rate": 4.46552299650576e-05, "loss": 0.5364, "mean_token_accuracy": 0.8473162204027176, "num_tokens": 3592113.0, "step": 1140 }, { "entropy": 0.5485832586884498, "epoch": 1.3023782559456398, "grad_norm": 3.388087511062622, "learning_rate": 4.455577667120333e-05, "loss": 0.5232, "mean_token_accuracy": 0.8495394617319107, "num_tokens": 3623419.0, "step": 1150 }, { "entropy": 0.5457410931587219, "epoch": 1.3137032842582106, "grad_norm": 2.7439608573913574, "learning_rate": 4.445551951596486e-05, "loss": 0.5239, "mean_token_accuracy": 0.8516563266515732, "num_tokens": 3655712.0, "step": 1160 }, { "entropy": 0.5817072510719299, "epoch": 1.3250283125707814, "grad_norm": 3.6495678424835205, "learning_rate": 4.4354462620521144e-05, "loss": 0.552, "mean_token_accuracy": 0.8431410670280457, "num_tokens": 3687624.0, "step": 1170 }, { "entropy": 0.5500650435686112, "epoch": 1.3363533408833521, "grad_norm": 2.702406406402588, "learning_rate": 4.425261013892534e-05, "loss": 0.5376, "mean_token_accuracy": 0.8446941554546357, "num_tokens": 3718974.0, "step": 1180 }, { "entropy": 0.5515587076544761, "epoch": 1.3476783691959229, "grad_norm": 3.3534793853759766, "learning_rate": 4.414996625793404e-05, "loss": 0.5358, "mean_token_accuracy": 0.8478793174028396, "num_tokens": 3749961.0, "step": 1190 }, { "entropy": 0.6033241957426071, "epoch": 1.3590033975084936, "grad_norm": 4.399415016174316, "learning_rate": 4.404653519683517e-05, "loss": 0.568, "mean_token_accuracy": 0.838269567489624, "num_tokens": 3779674.0, "step": 1200 }, { "entropy": 0.5632871612906456, "epoch": 1.3703284258210646, "grad_norm": 3.128950834274292, "learning_rate": 4.394232120727453e-05, "loss": 0.5478, "mean_token_accuracy": 0.8447149246931076, "num_tokens": 3811144.0, "step": 1210 }, { "entropy": 0.5675986737012864, "epoch": 1.3816534541336354, "grad_norm": 3.912665843963623, "learning_rate": 4.3837328573081057e-05, "loss": 0.551, "mean_token_accuracy": 0.8420376777648926, "num_tokens": 3841697.0, "step": 1220 }, { "entropy": 0.5417166292667389, "epoch": 1.3929784824462061, "grad_norm": 3.0909736156463623, "learning_rate": 4.373156161009071e-05, "loss": 0.5136, "mean_token_accuracy": 0.8509470343589782, "num_tokens": 3875026.0, "step": 1230 }, { "entropy": 0.6006052121520042, "epoch": 1.404303510758777, "grad_norm": 3.4114279747009277, "learning_rate": 4.3625024665969086e-05, "loss": 0.5766, "mean_token_accuracy": 0.8366666853427887, "num_tokens": 3905166.0, "step": 1240 }, { "entropy": 0.5631272241473197, "epoch": 1.4156285390713477, "grad_norm": 3.365401029586792, "learning_rate": 4.351772212003268e-05, "loss": 0.5295, "mean_token_accuracy": 0.8479926735162735, "num_tokens": 3936113.0, "step": 1250 }, { "entropy": 0.560055273771286, "epoch": 1.4269535673839184, "grad_norm": 3.087405204772949, "learning_rate": 4.3409658383068865e-05, "loss": 0.5347, "mean_token_accuracy": 0.8461541920900345, "num_tokens": 3966883.0, "step": 1260 }, { "entropy": 0.5575806692242622, "epoch": 1.4382785956964892, "grad_norm": 3.507297992706299, "learning_rate": 4.3300837897154636e-05, "loss": 0.5417, "mean_token_accuracy": 0.8473523437976838, "num_tokens": 3997355.0, "step": 1270 }, { "entropy": 0.6279172241687775, "epoch": 1.44960362400906, "grad_norm": 4.649575710296631, "learning_rate": 4.3191265135473926e-05, "loss": 0.5937, "mean_token_accuracy": 0.8347241580486298, "num_tokens": 4026260.0, "step": 1280 }, { "entropy": 0.5787097245454789, "epoch": 1.460928652321631, "grad_norm": 3.5137453079223633, "learning_rate": 4.3080944602133804e-05, "loss": 0.5253, "mean_token_accuracy": 0.8458025574684143, "num_tokens": 4056398.0, "step": 1290 }, { "entropy": 0.5460534125566483, "epoch": 1.4722536806342017, "grad_norm": 3.8779032230377197, "learning_rate": 4.296988083197932e-05, "loss": 0.553, "mean_token_accuracy": 0.8447301775217056, "num_tokens": 4087702.0, "step": 1300 }, { "entropy": 0.6065227225422859, "epoch": 1.4835787089467725, "grad_norm": 3.0295608043670654, "learning_rate": 4.285807839040702e-05, "loss": 0.5682, "mean_token_accuracy": 0.8381103157997132, "num_tokens": 4119831.0, "step": 1310 }, { "entropy": 0.5128992781043052, "epoch": 1.4949037372593432, "grad_norm": 2.5592479705810547, "learning_rate": 4.274554187317742e-05, "loss": 0.4974, "mean_token_accuracy": 0.8573563575744629, "num_tokens": 4153546.0, "step": 1320 }, { "entropy": 0.5610502302646637, "epoch": 1.506228765571914, "grad_norm": 2.9809346199035645, "learning_rate": 4.263227590622594e-05, "loss": 0.5337, "mean_token_accuracy": 0.8465662121772766, "num_tokens": 4184159.0, "step": 1330 }, { "entropy": 0.5405657783150672, "epoch": 1.5175537938844847, "grad_norm": 3.1333119869232178, "learning_rate": 4.251828514547285e-05, "loss": 0.5222, "mean_token_accuracy": 0.851176279783249, "num_tokens": 4216675.0, "step": 1340 }, { "entropy": 0.566189019382, "epoch": 1.5288788221970555, "grad_norm": 3.00138783454895, "learning_rate": 4.2403574276631864e-05, "loss": 0.5444, "mean_token_accuracy": 0.8445883154869079, "num_tokens": 4248386.0, "step": 1350 }, { "entropy": 0.527710796892643, "epoch": 1.5402038505096263, "grad_norm": 3.269314765930176, "learning_rate": 4.2288148015017505e-05, "loss": 0.5053, "mean_token_accuracy": 0.8536709517240524, "num_tokens": 4282402.0, "step": 1360 }, { "entropy": 0.5625803858041764, "epoch": 1.551528878822197, "grad_norm": 3.079254388809204, "learning_rate": 4.2172011105351294e-05, "loss": 0.5384, "mean_token_accuracy": 0.8477725654840469, "num_tokens": 4313169.0, "step": 1370 }, { "entropy": 0.5694461062550544, "epoch": 1.5628539071347678, "grad_norm": 3.1012649536132812, "learning_rate": 4.205516832156671e-05, "loss": 0.5568, "mean_token_accuracy": 0.8427411139011383, "num_tokens": 4344614.0, "step": 1380 }, { "entropy": 0.5670416623353958, "epoch": 1.5741789354473386, "grad_norm": 3.6018662452697754, "learning_rate": 4.193762446661294e-05, "loss": 0.5314, "mean_token_accuracy": 0.8488269418478012, "num_tokens": 4375036.0, "step": 1390 }, { "entropy": 0.5953057199716568, "epoch": 1.5855039637599093, "grad_norm": 3.029942274093628, "learning_rate": 4.18193843722575e-05, "loss": 0.5799, "mean_token_accuracy": 0.837462404370308, "num_tokens": 4404701.0, "step": 1400 }, { "entropy": 0.5928519412875175, "epoch": 1.59682899207248, "grad_norm": 3.283315896987915, "learning_rate": 4.170045289888753e-05, "loss": 0.5624, "mean_token_accuracy": 0.8420351535081864, "num_tokens": 4435592.0, "step": 1410 }, { "entropy": 0.5582237809896469, "epoch": 1.6081540203850508, "grad_norm": 3.4651055335998535, "learning_rate": 4.15808349353101e-05, "loss": 0.5491, "mean_token_accuracy": 0.8469580680131912, "num_tokens": 4466796.0, "step": 1420 }, { "entropy": 0.5469602465629577, "epoch": 1.6194790486976216, "grad_norm": 3.3069710731506348, "learning_rate": 4.1460535398551156e-05, "loss": 0.5196, "mean_token_accuracy": 0.8506799221038819, "num_tokens": 4500213.0, "step": 1430 }, { "entropy": 0.5610076829791069, "epoch": 1.6308040770101924, "grad_norm": 2.9331612586975098, "learning_rate": 4.1339559233653484e-05, "loss": 0.5464, "mean_token_accuracy": 0.8456663846969604, "num_tokens": 4533383.0, "step": 1440 }, { "entropy": 0.5389099940657616, "epoch": 1.6421291053227633, "grad_norm": 3.7289347648620605, "learning_rate": 4.12179114134734e-05, "loss": 0.5173, "mean_token_accuracy": 0.855185616016388, "num_tokens": 4566354.0, "step": 1450 }, { "entropy": 0.5230394601821899, "epoch": 1.6534541336353341, "grad_norm": 3.2918195724487305, "learning_rate": 4.1095596938476316e-05, "loss": 0.5053, "mean_token_accuracy": 0.8556439846754074, "num_tokens": 4598001.0, "step": 1460 }, { "entropy": 0.5553015500307084, "epoch": 1.6647791619479049, "grad_norm": 3.6326143741607666, "learning_rate": 4.097262083653123e-05, "loss": 0.5369, "mean_token_accuracy": 0.8448810696601867, "num_tokens": 4630452.0, "step": 1470 }, { "entropy": 0.6212641701102257, "epoch": 1.6761041902604756, "grad_norm": 3.491703748703003, "learning_rate": 4.084898816270405e-05, "loss": 0.5945, "mean_token_accuracy": 0.8339376747608185, "num_tokens": 4660517.0, "step": 1480 }, { "entropy": 0.5631924226880074, "epoch": 1.6874292185730464, "grad_norm": 2.8656742572784424, "learning_rate": 4.072470399904972e-05, "loss": 0.5362, "mean_token_accuracy": 0.8484214007854461, "num_tokens": 4693572.0, "step": 1490 }, { "entropy": 0.5927946478128433, "epoch": 1.6987542468856174, "grad_norm": 3.262650489807129, "learning_rate": 4.059977345440345e-05, "loss": 0.5631, "mean_token_accuracy": 0.8415592044591904, "num_tokens": 4725726.0, "step": 1500 }, { "epoch": 1.6987542468856174, "eval_entropy": 0.5341141203890986, "eval_loss": 0.5169265270233154, "eval_mean_token_accuracy": 0.8661515663808851, "eval_num_tokens": 4725726.0, "eval_runtime": 22.9682, "eval_samples_per_second": 46.543, "eval_steps_per_second": 5.834, "step": 1500 }, { "entropy": 0.5506062388420105, "epoch": 1.7100792751981881, "grad_norm": 3.5445897579193115, "learning_rate": 4.0474201664170605e-05, "loss": 0.5315, "mean_token_accuracy": 0.8496404081583023, "num_tokens": 4755710.0, "step": 1510 }, { "entropy": 0.5794235303997993, "epoch": 1.721404303510759, "grad_norm": 3.481981039047241, "learning_rate": 4.034799379011565e-05, "loss": 0.5681, "mean_token_accuracy": 0.8396411448717117, "num_tokens": 4788007.0, "step": 1520 }, { "entropy": 0.5977949738502503, "epoch": 1.7327293318233297, "grad_norm": 3.0650320053100586, "learning_rate": 4.0221155020149956e-05, "loss": 0.5602, "mean_token_accuracy": 0.838371816277504, "num_tokens": 4823110.0, "step": 1530 }, { "entropy": 0.5333872765302659, "epoch": 1.7440543601359004, "grad_norm": 3.4268224239349365, "learning_rate": 4.009369056811857e-05, "loss": 0.5284, "mean_token_accuracy": 0.8493491917848587, "num_tokens": 4854122.0, "step": 1540 }, { "entropy": 0.5541011542081833, "epoch": 1.7553793884484712, "grad_norm": 2.7358953952789307, "learning_rate": 3.9965605673585885e-05, "loss": 0.5322, "mean_token_accuracy": 0.8492937862873078, "num_tokens": 4885040.0, "step": 1550 }, { "entropy": 0.5873281508684158, "epoch": 1.766704416761042, "grad_norm": 3.24727463722229, "learning_rate": 3.983690560162021e-05, "loss": 0.5406, "mean_token_accuracy": 0.8487812489271164, "num_tokens": 4913565.0, "step": 1560 }, { "entropy": 0.5366435587406159, "epoch": 1.7780294450736127, "grad_norm": 2.3538382053375244, "learning_rate": 3.970759564257744e-05, "loss": 0.5122, "mean_token_accuracy": 0.852169793844223, "num_tokens": 4945547.0, "step": 1570 }, { "entropy": 0.5801732629537583, "epoch": 1.7893544733861835, "grad_norm": 3.008432388305664, "learning_rate": 3.9577681111883525e-05, "loss": 0.5498, "mean_token_accuracy": 0.8420356780290603, "num_tokens": 4975227.0, "step": 1580 }, { "entropy": 0.5545943319797516, "epoch": 1.8006795016987542, "grad_norm": 3.050703525543213, "learning_rate": 3.9447167349815975e-05, "loss": 0.5202, "mean_token_accuracy": 0.8499834507703781, "num_tokens": 5006740.0, "step": 1590 }, { "entropy": 0.614229929447174, "epoch": 1.812004530011325, "grad_norm": 3.1485025882720947, "learning_rate": 3.931605972128434e-05, "loss": 0.5971, "mean_token_accuracy": 0.8335285931825638, "num_tokens": 5038134.0, "step": 1600 }, { "entropy": 0.5501451089978218, "epoch": 1.8233295583238958, "grad_norm": 3.1998586654663086, "learning_rate": 3.918436361560975e-05, "loss": 0.5239, "mean_token_accuracy": 0.8538520008325576, "num_tokens": 5069253.0, "step": 1610 }, { "entropy": 0.5649801835417747, "epoch": 1.8346545866364665, "grad_norm": 3.609856128692627, "learning_rate": 3.905208444630327e-05, "loss": 0.544, "mean_token_accuracy": 0.8445982128381729, "num_tokens": 5099796.0, "step": 1620 }, { "entropy": 0.6116504520177841, "epoch": 1.8459796149490373, "grad_norm": 2.742194175720215, "learning_rate": 3.8919227650843446e-05, "loss": 0.5889, "mean_token_accuracy": 0.8342977851629257, "num_tokens": 5132503.0, "step": 1630 }, { "entropy": 0.5255660384893417, "epoch": 1.857304643261608, "grad_norm": 2.977027654647827, "learning_rate": 3.878579869045279e-05, "loss": 0.4961, "mean_token_accuracy": 0.8565999537706375, "num_tokens": 5162678.0, "step": 1640 }, { "entropy": 0.5009402737021447, "epoch": 1.8686296715741788, "grad_norm": 2.497317314147949, "learning_rate": 3.865180304987325e-05, "loss": 0.4848, "mean_token_accuracy": 0.8591432988643646, "num_tokens": 5196737.0, "step": 1650 }, { "entropy": 0.5639937236905098, "epoch": 1.8799546998867496, "grad_norm": 3.1093459129333496, "learning_rate": 3.851724623714078e-05, "loss": 0.5568, "mean_token_accuracy": 0.845026895403862, "num_tokens": 5227400.0, "step": 1660 }, { "entropy": 0.5589576527476311, "epoch": 1.8912797281993206, "grad_norm": 3.0768072605133057, "learning_rate": 3.838213378335893e-05, "loss": 0.5291, "mean_token_accuracy": 0.8501736044883728, "num_tokens": 5259004.0, "step": 1670 }, { "entropy": 0.5750891670584679, "epoch": 1.9026047565118913, "grad_norm": 2.6023709774017334, "learning_rate": 3.824647124247149e-05, "loss": 0.5517, "mean_token_accuracy": 0.8456086486577987, "num_tokens": 5290222.0, "step": 1680 }, { "entropy": 0.5511097982525826, "epoch": 1.913929784824462, "grad_norm": 2.9224021434783936, "learning_rate": 3.811026419103415e-05, "loss": 0.5325, "mean_token_accuracy": 0.8483414232730866, "num_tokens": 5322612.0, "step": 1690 }, { "entropy": 0.5870863348245621, "epoch": 1.9252548131370328, "grad_norm": 3.0950560569763184, "learning_rate": 3.79735182279853e-05, "loss": 0.5522, "mean_token_accuracy": 0.8423091471195221, "num_tokens": 5352812.0, "step": 1700 }, { "entropy": 0.5567023307085037, "epoch": 1.9365798414496036, "grad_norm": 2.833249092102051, "learning_rate": 3.78362389744159e-05, "loss": 0.5457, "mean_token_accuracy": 0.8451368689537049, "num_tokens": 5383796.0, "step": 1710 }, { "entropy": 0.55447788387537, "epoch": 1.9479048697621744, "grad_norm": 3.5825366973876953, "learning_rate": 3.769843207333837e-05, "loss": 0.5214, "mean_token_accuracy": 0.8480161488056183, "num_tokens": 5416796.0, "step": 1720 }, { "entropy": 0.5370402306318283, "epoch": 1.9592298980747453, "grad_norm": 2.9841582775115967, "learning_rate": 3.7560103189454654e-05, "loss": 0.5232, "mean_token_accuracy": 0.8511360079050064, "num_tokens": 5447812.0, "step": 1730 }, { "entropy": 0.5704680263996125, "epoch": 1.970554926387316, "grad_norm": 3.3857421875, "learning_rate": 3.742125800892339e-05, "loss": 0.5528, "mean_token_accuracy": 0.8471181511878967, "num_tokens": 5477860.0, "step": 1740 }, { "entropy": 0.5869046375155449, "epoch": 1.9818799546998869, "grad_norm": 3.149844169616699, "learning_rate": 3.7281902239126114e-05, "loss": 0.5758, "mean_token_accuracy": 0.8392861634492874, "num_tokens": 5508399.0, "step": 1750 }, { "entropy": 0.5820975095033646, "epoch": 1.9932049830124576, "grad_norm": 3.422454833984375, "learning_rate": 3.714204160843271e-05, "loss": 0.5391, "mean_token_accuracy": 0.8477762788534164, "num_tokens": 5538408.0, "step": 1760 }, { "entropy": 0.5504972025752067, "epoch": 2.0045300113250284, "grad_norm": 2.6110424995422363, "learning_rate": 3.700168186596591e-05, "loss": 0.4606, "mean_token_accuracy": 0.864372169971466, "num_tokens": 5568168.0, "step": 1770 }, { "entropy": 0.33086139261722564, "epoch": 2.015855039637599, "grad_norm": 3.0384321212768555, "learning_rate": 3.686082878136497e-05, "loss": 0.2968, "mean_token_accuracy": 0.9110626548528671, "num_tokens": 5600301.0, "step": 1780 }, { "entropy": 0.3262224726378918, "epoch": 2.02718006795017, "grad_norm": 3.177510976791382, "learning_rate": 3.671948814454852e-05, "loss": 0.3034, "mean_token_accuracy": 0.9086451709270478, "num_tokens": 5631080.0, "step": 1790 }, { "entropy": 0.33181734979152677, "epoch": 2.0385050962627407, "grad_norm": 2.8883979320526123, "learning_rate": 3.6577665765476534e-05, "loss": 0.2977, "mean_token_accuracy": 0.9095875382423401, "num_tokens": 5663677.0, "step": 1800 }, { "entropy": 0.34994775205850603, "epoch": 2.0498301245753114, "grad_norm": 2.915252208709717, "learning_rate": 3.643536747391155e-05, "loss": 0.3191, "mean_token_accuracy": 0.9045411765575408, "num_tokens": 5692514.0, "step": 1810 }, { "entropy": 0.33512948006391524, "epoch": 2.061155152887882, "grad_norm": 3.1287646293640137, "learning_rate": 3.629259911917898e-05, "loss": 0.3079, "mean_token_accuracy": 0.9076024353504181, "num_tokens": 5723520.0, "step": 1820 }, { "entropy": 0.3023697704076767, "epoch": 2.072480181200453, "grad_norm": 2.952409029006958, "learning_rate": 3.614936656992671e-05, "loss": 0.2774, "mean_token_accuracy": 0.9159202218055725, "num_tokens": 5756911.0, "step": 1830 }, { "entropy": 0.3192523382604122, "epoch": 2.0838052095130237, "grad_norm": 2.7479248046875, "learning_rate": 3.600567571388384e-05, "loss": 0.2952, "mean_token_accuracy": 0.9110053300857544, "num_tokens": 5789223.0, "step": 1840 }, { "entropy": 0.3362482935190201, "epoch": 2.0951302378255945, "grad_norm": 2.9288883209228516, "learning_rate": 3.5861532457618647e-05, "loss": 0.3001, "mean_token_accuracy": 0.910579189658165, "num_tokens": 5820267.0, "step": 1850 }, { "entropy": 0.3218920275568962, "epoch": 2.1064552661381652, "grad_norm": 2.9838762283325195, "learning_rate": 3.571694272629583e-05, "loss": 0.3028, "mean_token_accuracy": 0.9099289119243622, "num_tokens": 5851637.0, "step": 1860 }, { "entropy": 0.33703851103782656, "epoch": 2.117780294450736, "grad_norm": 2.6930618286132812, "learning_rate": 3.557191246343294e-05, "loss": 0.3005, "mean_token_accuracy": 0.9076809674501419, "num_tokens": 5884035.0, "step": 1870 }, { "entropy": 0.33338933885097505, "epoch": 2.1291053227633068, "grad_norm": 2.9287595748901367, "learning_rate": 3.5426447630656015e-05, "loss": 0.2996, "mean_token_accuracy": 0.9076695948839187, "num_tokens": 5915289.0, "step": 1880 }, { "entropy": 0.3408766373991966, "epoch": 2.1404303510758775, "grad_norm": 3.0822606086730957, "learning_rate": 3.528055420745461e-05, "loss": 0.302, "mean_token_accuracy": 0.9080609738826751, "num_tokens": 5945986.0, "step": 1890 }, { "entropy": 0.33647180199623106, "epoch": 2.1517553793884483, "grad_norm": 2.4581234455108643, "learning_rate": 3.51342381909359e-05, "loss": 0.3117, "mean_token_accuracy": 0.9036801576614379, "num_tokens": 5979688.0, "step": 1900 }, { "entropy": 0.3434984043240547, "epoch": 2.163080407701019, "grad_norm": 3.238070487976074, "learning_rate": 3.498750559557827e-05, "loss": 0.3049, "mean_token_accuracy": 0.908153846859932, "num_tokens": 6010581.0, "step": 1910 }, { "entropy": 0.3294851824641228, "epoch": 2.17440543601359, "grad_norm": 3.0652968883514404, "learning_rate": 3.484036245298398e-05, "loss": 0.3024, "mean_token_accuracy": 0.9057962894439697, "num_tokens": 6042553.0, "step": 1920 }, { "entropy": 0.3511190369725227, "epoch": 2.185730464326161, "grad_norm": 2.9219870567321777, "learning_rate": 3.469281481163131e-05, "loss": 0.3258, "mean_token_accuracy": 0.9027796924114228, "num_tokens": 6073935.0, "step": 1930 }, { "entropy": 0.32454402297735213, "epoch": 2.197055492638732, "grad_norm": 2.8523194789886475, "learning_rate": 3.45448687366259e-05, "loss": 0.2839, "mean_token_accuracy": 0.9147208243608475, "num_tokens": 6106596.0, "step": 1940 }, { "entropy": 0.3268588811159134, "epoch": 2.2083805209513026, "grad_norm": 3.46616530418396, "learning_rate": 3.4396530309451416e-05, "loss": 0.3023, "mean_token_accuracy": 0.9075274288654327, "num_tokens": 6137408.0, "step": 1950 }, { "entropy": 0.34170345067977903, "epoch": 2.2197055492638733, "grad_norm": 2.899691104888916, "learning_rate": 3.4247805627719605e-05, "loss": 0.3087, "mean_token_accuracy": 0.9050837367773056, "num_tokens": 6169270.0, "step": 1960 }, { "entropy": 0.33029789179563523, "epoch": 2.231030577576444, "grad_norm": 3.0167901515960693, "learning_rate": 3.4098700804919614e-05, "loss": 0.2938, "mean_token_accuracy": 0.9097273588180542, "num_tokens": 6200234.0, "step": 1970 }, { "entropy": 0.3392042815685272, "epoch": 2.242355605889015, "grad_norm": 3.2007789611816406, "learning_rate": 3.394922197016671e-05, "loss": 0.308, "mean_token_accuracy": 0.9080634534358978, "num_tokens": 6229177.0, "step": 1980 }, { "entropy": 0.3530780151486397, "epoch": 2.2536806342015856, "grad_norm": 2.546180009841919, "learning_rate": 3.379937526795031e-05, "loss": 0.3062, "mean_token_accuracy": 0.9073823064565658, "num_tokens": 6257509.0, "step": 1990 }, { "entropy": 0.31698922738432883, "epoch": 2.2650056625141564, "grad_norm": 2.573667049407959, "learning_rate": 3.3649166857881444e-05, "loss": 0.2812, "mean_token_accuracy": 0.9117646932601928, "num_tokens": 6292493.0, "step": 2000 }, { "epoch": 2.2650056625141564, "eval_entropy": 0.3538621710752373, "eval_loss": 0.461500346660614, "eval_mean_token_accuracy": 0.8896430148117578, "eval_num_tokens": 6292493.0, "eval_runtime": 22.9271, "eval_samples_per_second": 46.626, "eval_steps_per_second": 5.845, "step": 2000 }, { "entropy": 0.3301647327840328, "epoch": 2.276330690826727, "grad_norm": 3.6626837253570557, "learning_rate": 3.349860291443952e-05, "loss": 0.3122, "mean_token_accuracy": 0.9060505300760269, "num_tokens": 6326995.0, "step": 2010 }, { "entropy": 0.3390480265021324, "epoch": 2.287655719139298, "grad_norm": 3.048062801361084, "learning_rate": 3.3347689626718535e-05, "loss": 0.301, "mean_token_accuracy": 0.9094172894954682, "num_tokens": 6358142.0, "step": 2020 }, { "entropy": 0.33652088344097136, "epoch": 2.2989807474518686, "grad_norm": 2.5861592292785645, "learning_rate": 3.319643319817266e-05, "loss": 0.3007, "mean_token_accuracy": 0.9065529972314834, "num_tokens": 6391665.0, "step": 2030 }, { "entropy": 0.34212576597929, "epoch": 2.3103057757644394, "grad_norm": 3.1473755836486816, "learning_rate": 3.304483984636124e-05, "loss": 0.3088, "mean_token_accuracy": 0.9068757712841033, "num_tokens": 6420978.0, "step": 2040 }, { "entropy": 0.3290658682584763, "epoch": 2.32163080407701, "grad_norm": 2.8407304286956787, "learning_rate": 3.289291580269322e-05, "loss": 0.2956, "mean_token_accuracy": 0.9091124355792999, "num_tokens": 6452000.0, "step": 2050 }, { "entropy": 0.32331828474998475, "epoch": 2.332955832389581, "grad_norm": 2.915709972381592, "learning_rate": 3.2740667312170984e-05, "loss": 0.3011, "mean_token_accuracy": 0.9096018075942993, "num_tokens": 6482800.0, "step": 2060 }, { "entropy": 0.35195091664791106, "epoch": 2.3442808607021517, "grad_norm": 2.9953255653381348, "learning_rate": 3.258810063313367e-05, "loss": 0.3126, "mean_token_accuracy": 0.9040073782205582, "num_tokens": 6516807.0, "step": 2070 }, { "entropy": 0.34647743552923205, "epoch": 2.3556058890147225, "grad_norm": 2.6879382133483887, "learning_rate": 3.243522203699988e-05, "loss": 0.3188, "mean_token_accuracy": 0.9036970168352128, "num_tokens": 6548271.0, "step": 2080 }, { "entropy": 0.32736130654811857, "epoch": 2.366930917327293, "grad_norm": 3.3962409496307373, "learning_rate": 3.228203780800993e-05, "loss": 0.2937, "mean_token_accuracy": 0.91224045753479, "num_tokens": 6577404.0, "step": 2090 }, { "entropy": 0.34083098769187925, "epoch": 2.378255945639864, "grad_norm": 2.9646759033203125, "learning_rate": 3.212855424296748e-05, "loss": 0.3093, "mean_token_accuracy": 0.9033702582120895, "num_tokens": 6610996.0, "step": 2100 }, { "entropy": 0.33483326584100725, "epoch": 2.3895809739524347, "grad_norm": 3.603712558746338, "learning_rate": 3.1974777650980735e-05, "loss": 0.3052, "mean_token_accuracy": 0.9083625108003617, "num_tokens": 6641408.0, "step": 2110 }, { "entropy": 0.36678697168827057, "epoch": 2.4009060022650055, "grad_norm": 5.152080535888672, "learning_rate": 3.182071435320309e-05, "loss": 0.3386, "mean_token_accuracy": 0.8961910039186478, "num_tokens": 6671255.0, "step": 2120 }, { "entropy": 0.3381674662232399, "epoch": 2.4122310305775763, "grad_norm": 2.51043438911438, "learning_rate": 3.1666370682573305e-05, "loss": 0.3018, "mean_token_accuracy": 0.9092491447925568, "num_tokens": 6701042.0, "step": 2130 }, { "entropy": 0.340181964635849, "epoch": 2.4235560588901475, "grad_norm": 2.988914966583252, "learning_rate": 3.151175298355515e-05, "loss": 0.3186, "mean_token_accuracy": 0.9035066097974778, "num_tokens": 6731513.0, "step": 2140 }, { "entropy": 0.3381286084651947, "epoch": 2.434881087202718, "grad_norm": 3.1499576568603516, "learning_rate": 3.1356867611876626e-05, "loss": 0.3063, "mean_token_accuracy": 0.906991371512413, "num_tokens": 6763329.0, "step": 2150 }, { "entropy": 0.34378741681575775, "epoch": 2.446206115515289, "grad_norm": 3.016859292984009, "learning_rate": 3.120172093426873e-05, "loss": 0.3184, "mean_token_accuracy": 0.9061174720525742, "num_tokens": 6794777.0, "step": 2160 }, { "entropy": 0.336453452706337, "epoch": 2.4575311438278598, "grad_norm": 3.0973525047302246, "learning_rate": 3.10463193282037e-05, "loss": 0.304, "mean_token_accuracy": 0.9055917888879776, "num_tokens": 6826289.0, "step": 2170 }, { "entropy": 0.35441608279943465, "epoch": 2.4688561721404305, "grad_norm": 3.2018167972564697, "learning_rate": 3.08906691816329e-05, "loss": 0.3213, "mean_token_accuracy": 0.9018235206604004, "num_tokens": 6856735.0, "step": 2180 }, { "entropy": 0.3426896780729294, "epoch": 2.4801812004530013, "grad_norm": 3.111391067504883, "learning_rate": 3.073477689272422e-05, "loss": 0.3085, "mean_token_accuracy": 0.9053992748260498, "num_tokens": 6887751.0, "step": 2190 }, { "entropy": 0.3537640705704689, "epoch": 2.491506228765572, "grad_norm": 3.6062848567962646, "learning_rate": 3.057864886959907e-05, "loss": 0.3161, "mean_token_accuracy": 0.9045758932828903, "num_tokens": 6916652.0, "step": 2200 }, { "entropy": 0.3475446105003357, "epoch": 2.502831257078143, "grad_norm": 2.8011586666107178, "learning_rate": 3.0422291530068953e-05, "loss": 0.3139, "mean_token_accuracy": 0.9058610677719117, "num_tokens": 6947964.0, "step": 2210 }, { "entropy": 0.33603944927453994, "epoch": 2.5141562853907136, "grad_norm": 2.7381720542907715, "learning_rate": 3.02657113013717e-05, "loss": 0.311, "mean_token_accuracy": 0.9063718646764756, "num_tokens": 6979191.0, "step": 2220 }, { "entropy": 0.33636826276779175, "epoch": 2.5254813137032843, "grad_norm": 2.717543363571167, "learning_rate": 3.0108914619907236e-05, "loss": 0.2989, "mean_token_accuracy": 0.9079161524772644, "num_tokens": 7011313.0, "step": 2230 }, { "entropy": 0.32530431896448136, "epoch": 2.536806342015855, "grad_norm": 3.0896191596984863, "learning_rate": 2.9951907930972994e-05, "loss": 0.3001, "mean_token_accuracy": 0.9083738714456558, "num_tokens": 7042592.0, "step": 2240 }, { "entropy": 0.3297998860478401, "epoch": 2.548131370328426, "grad_norm": 2.8318963050842285, "learning_rate": 2.9794697688499e-05, "loss": 0.2948, "mean_token_accuracy": 0.9093282163143158, "num_tokens": 7074945.0, "step": 2250 }, { "entropy": 0.34250654131174085, "epoch": 2.5594563986409966, "grad_norm": 3.3961706161499023, "learning_rate": 2.9637290354782587e-05, "loss": 0.3157, "mean_token_accuracy": 0.904097443819046, "num_tokens": 7107633.0, "step": 2260 }, { "entropy": 0.3470340445637703, "epoch": 2.5707814269535674, "grad_norm": 3.1472079753875732, "learning_rate": 2.9479692400222715e-05, "loss": 0.3189, "mean_token_accuracy": 0.9026219546794891, "num_tokens": 7140772.0, "step": 2270 }, { "entropy": 0.33536320328712466, "epoch": 2.582106455266138, "grad_norm": 3.536257266998291, "learning_rate": 2.932191030305401e-05, "loss": 0.3074, "mean_token_accuracy": 0.9077878296375275, "num_tokens": 7174565.0, "step": 2280 }, { "entropy": 0.34011308997869494, "epoch": 2.593431483578709, "grad_norm": 2.7522521018981934, "learning_rate": 2.916395054908052e-05, "loss": 0.3063, "mean_token_accuracy": 0.9075266391038894, "num_tokens": 7205947.0, "step": 2290 }, { "entropy": 0.3416082814335823, "epoch": 2.6047565118912797, "grad_norm": 3.5005111694335938, "learning_rate": 2.9005819631409043e-05, "loss": 0.3097, "mean_token_accuracy": 0.9054687231779098, "num_tokens": 7239660.0, "step": 2300 }, { "entropy": 0.330567429959774, "epoch": 2.6160815402038504, "grad_norm": 2.752882480621338, "learning_rate": 2.8847524050182233e-05, "loss": 0.2994, "mean_token_accuracy": 0.9095361143350601, "num_tokens": 7271351.0, "step": 2310 }, { "entropy": 0.3262020036578178, "epoch": 2.627406568516421, "grad_norm": 2.702531337738037, "learning_rate": 2.8689070312311443e-05, "loss": 0.307, "mean_token_accuracy": 0.9070776045322418, "num_tokens": 7306146.0, "step": 2320 }, { "entropy": 0.33680719435214995, "epoch": 2.638731596828992, "grad_norm": 2.79473614692688, "learning_rate": 2.853046493120921e-05, "loss": 0.3065, "mean_token_accuracy": 0.9064619988203049, "num_tokens": 7338021.0, "step": 2330 }, { "entropy": 0.3301373228430748, "epoch": 2.6500566251415627, "grad_norm": 2.9453022480010986, "learning_rate": 2.837171442652155e-05, "loss": 0.2878, "mean_token_accuracy": 0.9134915202856064, "num_tokens": 7367317.0, "step": 2340 }, { "entropy": 0.34509565234184264, "epoch": 2.661381653454134, "grad_norm": 2.921013832092285, "learning_rate": 2.821282532385991e-05, "loss": 0.3178, "mean_token_accuracy": 0.9036821633577347, "num_tokens": 7399316.0, "step": 2350 }, { "entropy": 0.33235773593187334, "epoch": 2.6727066817667042, "grad_norm": 3.50290846824646, "learning_rate": 2.8053804154532992e-05, "loss": 0.3025, "mean_token_accuracy": 0.9082023203372955, "num_tokens": 7429163.0, "step": 2360 }, { "entropy": 0.3544440522789955, "epoch": 2.6840317100792754, "grad_norm": 3.421252489089966, "learning_rate": 2.78946574552782e-05, "loss": 0.3255, "mean_token_accuracy": 0.9013722956180572, "num_tokens": 7459357.0, "step": 2370 }, { "entropy": 0.3445164501667023, "epoch": 2.6953567383918458, "grad_norm": 2.7874948978424072, "learning_rate": 2.7735391767993024e-05, "loss": 0.3056, "mean_token_accuracy": 0.9070187360048294, "num_tokens": 7491443.0, "step": 2380 }, { "entropy": 0.3235954880714417, "epoch": 2.706681766704417, "grad_norm": 2.596618413925171, "learning_rate": 2.7576013639466064e-05, "loss": 0.3016, "mean_token_accuracy": 0.9080851078033447, "num_tokens": 7525034.0, "step": 2390 }, { "entropy": 0.33873881548643114, "epoch": 2.7180067950169873, "grad_norm": 2.730522394180298, "learning_rate": 2.741652962110794e-05, "loss": 0.3098, "mean_token_accuracy": 0.9070809602737426, "num_tokens": 7556123.0, "step": 2400 }, { "entropy": 0.3413001626729965, "epoch": 2.7293318233295585, "grad_norm": 2.565798759460449, "learning_rate": 2.7256946268681997e-05, "loss": 0.3029, "mean_token_accuracy": 0.9063649326562881, "num_tokens": 7587382.0, "step": 2410 }, { "entropy": 0.32414872050285337, "epoch": 2.7406568516421292, "grad_norm": 2.687298536300659, "learning_rate": 2.7097270142034798e-05, "loss": 0.3039, "mean_token_accuracy": 0.90952388048172, "num_tokens": 7620486.0, "step": 2420 }, { "entropy": 0.33378091007471083, "epoch": 2.7519818799547, "grad_norm": 2.9899933338165283, "learning_rate": 2.6937507804826505e-05, "loss": 0.3099, "mean_token_accuracy": 0.9065448194742203, "num_tokens": 7647800.0, "step": 2430 }, { "entropy": 0.32358061224222184, "epoch": 2.7633069082672708, "grad_norm": 3.958669424057007, "learning_rate": 2.6777665824261056e-05, "loss": 0.2932, "mean_token_accuracy": 0.9114756107330322, "num_tokens": 7681853.0, "step": 2440 }, { "entropy": 0.35349722802639005, "epoch": 2.7746319365798415, "grad_norm": 2.954362154006958, "learning_rate": 2.661775077081621e-05, "loss": 0.3166, "mean_token_accuracy": 0.9041797786951065, "num_tokens": 7711414.0, "step": 2450 }, { "entropy": 0.3320110633969307, "epoch": 2.7859569648924123, "grad_norm": 3.5596938133239746, "learning_rate": 2.645776921797347e-05, "loss": 0.3035, "mean_token_accuracy": 0.9083330810070038, "num_tokens": 7745429.0, "step": 2460 }, { "entropy": 0.331499008834362, "epoch": 2.797281993204983, "grad_norm": 2.9514622688293457, "learning_rate": 2.6297727741947876e-05, "loss": 0.2946, "mean_token_accuracy": 0.9112407445907593, "num_tokens": 7776539.0, "step": 2470 }, { "entropy": 0.34212442487478256, "epoch": 2.808607021517554, "grad_norm": 4.607098579406738, "learning_rate": 2.613763292141766e-05, "loss": 0.3161, "mean_token_accuracy": 0.9037576377391815, "num_tokens": 7807498.0, "step": 2480 }, { "entropy": 0.326346854865551, "epoch": 2.8199320498301246, "grad_norm": 2.47886323928833, "learning_rate": 2.5977491337253844e-05, "loss": 0.2944, "mean_token_accuracy": 0.911178269982338, "num_tokens": 7838306.0, "step": 2490 }, { "entropy": 0.33608004450798035, "epoch": 2.8312570781426953, "grad_norm": 3.0412232875823975, "learning_rate": 2.5817309572249736e-05, "loss": 0.3037, "mean_token_accuracy": 0.9075497031211853, "num_tokens": 7872016.0, "step": 2500 }, { "epoch": 2.8312570781426953, "eval_entropy": 0.327717344151504, "eval_loss": 0.38485008478164673, "eval_mean_token_accuracy": 0.9145353396437061, "eval_num_tokens": 7872016.0, "eval_runtime": 22.6936, "eval_samples_per_second": 47.106, "eval_steps_per_second": 5.905, "step": 2500 }, { "entropy": 0.3390058234333992, "epoch": 2.842582106455266, "grad_norm": 2.7344987392425537, "learning_rate": 2.565709421085028e-05, "loss": 0.3032, "mean_token_accuracy": 0.9075493305921555, "num_tokens": 7904470.0, "step": 2510 }, { "entropy": 0.3146300859749317, "epoch": 2.853907134767837, "grad_norm": 3.110588788986206, "learning_rate": 2.549685183888149e-05, "loss": 0.2803, "mean_token_accuracy": 0.9147897362709045, "num_tokens": 7935280.0, "step": 2520 }, { "entropy": 0.3277345307171345, "epoch": 2.8652321630804076, "grad_norm": 3.927215099334717, "learning_rate": 2.5336589043279634e-05, "loss": 0.315, "mean_token_accuracy": 0.9082329630851745, "num_tokens": 7968973.0, "step": 2530 }, { "entropy": 0.31817393004894257, "epoch": 2.8765571913929784, "grad_norm": 3.023855447769165, "learning_rate": 2.5176312411820545e-05, "loss": 0.2923, "mean_token_accuracy": 0.9107245028018951, "num_tokens": 8001676.0, "step": 2540 }, { "entropy": 0.33554808497428895, "epoch": 2.887882219705549, "grad_norm": 3.3250324726104736, "learning_rate": 2.5016028532848768e-05, "loss": 0.3107, "mean_token_accuracy": 0.9068995088338851, "num_tokens": 8034188.0, "step": 2550 }, { "entropy": 0.33894127756357195, "epoch": 2.89920724801812, "grad_norm": 2.6111295223236084, "learning_rate": 2.4855743995006785e-05, "loss": 0.3057, "mean_token_accuracy": 0.9076811999082566, "num_tokens": 8064782.0, "step": 2560 }, { "entropy": 0.34306282699108126, "epoch": 2.9105322763306907, "grad_norm": 2.726879596710205, "learning_rate": 2.4695465386964154e-05, "loss": 0.3149, "mean_token_accuracy": 0.9052368462085724, "num_tokens": 8095683.0, "step": 2570 }, { "entropy": 0.32450498044490816, "epoch": 2.921857304643262, "grad_norm": 2.7551498413085938, "learning_rate": 2.4535199297146692e-05, "loss": 0.2934, "mean_token_accuracy": 0.9113465547561646, "num_tokens": 8128181.0, "step": 2580 }, { "entropy": 0.31809509843587874, "epoch": 2.933182332955832, "grad_norm": 3.248033046722412, "learning_rate": 2.4374952313465618e-05, "loss": 0.2929, "mean_token_accuracy": 0.9110888838768005, "num_tokens": 8161222.0, "step": 2590 }, { "entropy": 0.3428458094596863, "epoch": 2.9445073612684034, "grad_norm": 3.0945913791656494, "learning_rate": 2.4214731023046793e-05, "loss": 0.3094, "mean_token_accuracy": 0.9040931612253189, "num_tokens": 8191149.0, "step": 2600 }, { "entropy": 0.3347173988819122, "epoch": 2.9558323895809737, "grad_norm": 3.0713417530059814, "learning_rate": 2.4054542011959925e-05, "loss": 0.3085, "mean_token_accuracy": 0.9067859917879104, "num_tokens": 8223169.0, "step": 2610 }, { "entropy": 0.33130600601434707, "epoch": 2.967157417893545, "grad_norm": 2.971564531326294, "learning_rate": 2.3894391864947845e-05, "loss": 0.3003, "mean_token_accuracy": 0.9123528331518174, "num_tokens": 8251958.0, "step": 2620 }, { "entropy": 0.3486244469881058, "epoch": 2.9784824462061152, "grad_norm": 3.0552313327789307, "learning_rate": 2.3734287165155814e-05, "loss": 0.3088, "mean_token_accuracy": 0.9068525344133377, "num_tokens": 8280414.0, "step": 2630 }, { "entropy": 0.3378919124603271, "epoch": 2.9898074745186864, "grad_norm": 3.0992133617401123, "learning_rate": 2.357423449386097e-05, "loss": 0.308, "mean_token_accuracy": 0.908593600988388, "num_tokens": 8309410.0, "step": 2640 }, { "entropy": 0.32522587329149244, "epoch": 3.001132502831257, "grad_norm": 1.8476426601409912, "learning_rate": 2.341424043020174e-05, "loss": 0.2861, "mean_token_accuracy": 0.9121149241924286, "num_tokens": 8339500.0, "step": 2650 }, { "entropy": 0.2401589497923851, "epoch": 3.012457531143828, "grad_norm": 2.254667282104492, "learning_rate": 2.3254311550907432e-05, "loss": 0.158, "mean_token_accuracy": 0.9545799106359482, "num_tokens": 8368545.0, "step": 2660 }, { "entropy": 0.1963147647678852, "epoch": 3.0237825594563987, "grad_norm": 3.1326498985290527, "learning_rate": 2.309445443002788e-05, "loss": 0.1527, "mean_token_accuracy": 0.9554174333810806, "num_tokens": 8398109.0, "step": 2670 }, { "entropy": 0.18867359235882758, "epoch": 3.0351075877689695, "grad_norm": 2.721315860748291, "learning_rate": 2.293467563866319e-05, "loss": 0.1467, "mean_token_accuracy": 0.9568633317947388, "num_tokens": 8427150.0, "step": 2680 }, { "entropy": 0.17226183749735355, "epoch": 3.0464326160815403, "grad_norm": 2.6865651607513428, "learning_rate": 2.2774981744693667e-05, "loss": 0.1333, "mean_token_accuracy": 0.9603831499814988, "num_tokens": 8459804.0, "step": 2690 }, { "entropy": 0.17720401659607887, "epoch": 3.057757644394111, "grad_norm": 2.5850725173950195, "learning_rate": 2.2615379312509827e-05, "loss": 0.1482, "mean_token_accuracy": 0.9568986982107163, "num_tokens": 8496137.0, "step": 2700 }, { "entropy": 0.19257365614175798, "epoch": 3.069082672706682, "grad_norm": 2.543994188308716, "learning_rate": 2.245587490274253e-05, "loss": 0.1511, "mean_token_accuracy": 0.9563712388277054, "num_tokens": 8527404.0, "step": 2710 }, { "entropy": 0.1776495523750782, "epoch": 3.0804077010192525, "grad_norm": 2.6783289909362793, "learning_rate": 2.229647507199332e-05, "loss": 0.1394, "mean_token_accuracy": 0.9598089069128036, "num_tokens": 8561000.0, "step": 2720 }, { "entropy": 0.17914639115333558, "epoch": 3.0917327293318233, "grad_norm": 2.805384635925293, "learning_rate": 2.2137186372564918e-05, "loss": 0.1462, "mean_token_accuracy": 0.956651571393013, "num_tokens": 8593390.0, "step": 2730 }, { "entropy": 0.19363198950886726, "epoch": 3.103057757644394, "grad_norm": 3.3500022888183594, "learning_rate": 2.1978015352191864e-05, "loss": 0.1506, "mean_token_accuracy": 0.9557105898857117, "num_tokens": 8622236.0, "step": 2740 }, { "entropy": 0.18671710416674614, "epoch": 3.114382785956965, "grad_norm": 2.2808239459991455, "learning_rate": 2.181896855377138e-05, "loss": 0.1435, "mean_token_accuracy": 0.9574358344078064, "num_tokens": 8654502.0, "step": 2750 }, { "entropy": 0.1893085241317749, "epoch": 3.1257078142695356, "grad_norm": 2.6378250122070312, "learning_rate": 2.1660052515094405e-05, "loss": 0.1526, "mean_token_accuracy": 0.9564388334751129, "num_tokens": 8685392.0, "step": 2760 }, { "entropy": 0.18678954988718033, "epoch": 3.1370328425821064, "grad_norm": 2.5639476776123047, "learning_rate": 2.1501273768576852e-05, "loss": 0.1448, "mean_token_accuracy": 0.9556661039590836, "num_tokens": 8718903.0, "step": 2770 }, { "entropy": 0.1747233547270298, "epoch": 3.148357870894677, "grad_norm": 2.5726349353790283, "learning_rate": 2.1342638840991097e-05, "loss": 0.1401, "mean_token_accuracy": 0.9590637445449829, "num_tokens": 8751352.0, "step": 2780 }, { "entropy": 0.18045611083507537, "epoch": 3.159682899207248, "grad_norm": 2.1656455993652344, "learning_rate": 2.1184154253197684e-05, "loss": 0.1461, "mean_token_accuracy": 0.9568846583366394, "num_tokens": 8784384.0, "step": 2790 }, { "entropy": 0.1883378468453884, "epoch": 3.1710079275198186, "grad_norm": 2.388462781906128, "learning_rate": 2.1025826519877282e-05, "loss": 0.144, "mean_token_accuracy": 0.9586382806301117, "num_tokens": 8813243.0, "step": 2800 }, { "entropy": 0.18703141212463378, "epoch": 3.1823329558323894, "grad_norm": 2.7605926990509033, "learning_rate": 2.086766214926288e-05, "loss": 0.1506, "mean_token_accuracy": 0.956416517496109, "num_tokens": 8842657.0, "step": 2810 }, { "entropy": 0.18549679666757585, "epoch": 3.19365798414496, "grad_norm": 2.453981399536133, "learning_rate": 2.0709667642872256e-05, "loss": 0.1456, "mean_token_accuracy": 0.9563432067632676, "num_tokens": 8874598.0, "step": 2820 }, { "entropy": 0.18915204033255578, "epoch": 3.2049830124575314, "grad_norm": 2.598982810974121, "learning_rate": 2.055184949524075e-05, "loss": 0.1502, "mean_token_accuracy": 0.9555239349603653, "num_tokens": 8905326.0, "step": 2830 }, { "entropy": 0.18085768818855286, "epoch": 3.216308040770102, "grad_norm": 2.4260501861572266, "learning_rate": 2.039421419365427e-05, "loss": 0.1458, "mean_token_accuracy": 0.9582718342542649, "num_tokens": 8938848.0, "step": 2840 }, { "entropy": 0.183878605812788, "epoch": 3.227633069082673, "grad_norm": 2.386054039001465, "learning_rate": 2.0236768217882683e-05, "loss": 0.153, "mean_token_accuracy": 0.956705066561699, "num_tokens": 8969858.0, "step": 2850 }, { "entropy": 0.18992681950330734, "epoch": 3.2389580973952437, "grad_norm": 3.0777018070220947, "learning_rate": 2.0079518039913343e-05, "loss": 0.1498, "mean_token_accuracy": 0.9585150212049485, "num_tokens": 8999393.0, "step": 2860 }, { "entropy": 0.18626221343874932, "epoch": 3.2502831257078144, "grad_norm": 2.023559808731079, "learning_rate": 1.992247012368517e-05, "loss": 0.1492, "mean_token_accuracy": 0.9551093459129334, "num_tokens": 9031528.0, "step": 2870 }, { "entropy": 0.18020688593387604, "epoch": 3.261608154020385, "grad_norm": 2.096430540084839, "learning_rate": 1.9765630924822886e-05, "loss": 0.1421, "mean_token_accuracy": 0.9582108706235886, "num_tokens": 9063549.0, "step": 2880 }, { "entropy": 0.18335785791277887, "epoch": 3.272933182332956, "grad_norm": 2.4209272861480713, "learning_rate": 1.9609006890371667e-05, "loss": 0.1493, "mean_token_accuracy": 0.9564682513475418, "num_tokens": 9095204.0, "step": 2890 }, { "entropy": 0.1816806599497795, "epoch": 3.2842582106455267, "grad_norm": 2.4786605834960938, "learning_rate": 1.9452604458532113e-05, "loss": 0.1396, "mean_token_accuracy": 0.9589683145284653, "num_tokens": 9128871.0, "step": 2900 }, { "entropy": 0.18295636475086213, "epoch": 3.2955832389580975, "grad_norm": 2.6071975231170654, "learning_rate": 1.92964300583956e-05, "loss": 0.1412, "mean_token_accuracy": 0.959463146328926, "num_tokens": 9159877.0, "step": 2910 }, { "entropy": 0.1954453021287918, "epoch": 3.3069082672706682, "grad_norm": 2.99585223197937, "learning_rate": 1.914049010968003e-05, "loss": 0.155, "mean_token_accuracy": 0.9559677958488464, "num_tokens": 9191639.0, "step": 2920 }, { "entropy": 0.1915322221815586, "epoch": 3.318233295583239, "grad_norm": 2.6830849647521973, "learning_rate": 1.898479102246592e-05, "loss": 0.1487, "mean_token_accuracy": 0.9566442102193833, "num_tokens": 9222953.0, "step": 2930 }, { "entropy": 0.18220141232013704, "epoch": 3.3295583238958097, "grad_norm": 2.3461086750030518, "learning_rate": 1.8829339196932927e-05, "loss": 0.1421, "mean_token_accuracy": 0.9585667610168457, "num_tokens": 9255460.0, "step": 2940 }, { "entropy": 0.1813945546746254, "epoch": 3.3408833522083805, "grad_norm": 2.318178415298462, "learning_rate": 1.867414102309671e-05, "loss": 0.148, "mean_token_accuracy": 0.9563045471906662, "num_tokens": 9287765.0, "step": 2950 }, { "entropy": 0.18687178418040276, "epoch": 3.3522083805209513, "grad_norm": 2.7838022708892822, "learning_rate": 1.8519202880546337e-05, "loss": 0.1443, "mean_token_accuracy": 0.9579327285289765, "num_tokens": 9320472.0, "step": 2960 }, { "entropy": 0.18875156193971634, "epoch": 3.363533408833522, "grad_norm": 2.69187068939209, "learning_rate": 1.8364531138182002e-05, "loss": 0.1471, "mean_token_accuracy": 0.9574691027402877, "num_tokens": 9351036.0, "step": 2970 }, { "entropy": 0.179339037835598, "epoch": 3.374858437146093, "grad_norm": 2.6196610927581787, "learning_rate": 1.821013215395322e-05, "loss": 0.1389, "mean_token_accuracy": 0.9601082682609559, "num_tokens": 9382386.0, "step": 2980 }, { "entropy": 0.18361469805240632, "epoch": 3.3861834654586636, "grad_norm": 2.4154880046844482, "learning_rate": 1.805601227459751e-05, "loss": 0.146, "mean_token_accuracy": 0.957899197936058, "num_tokens": 9414364.0, "step": 2990 }, { "entropy": 0.18562110662460327, "epoch": 3.3975084937712343, "grad_norm": 1.9397906064987183, "learning_rate": 1.7902177835379437e-05, "loss": 0.1424, "mean_token_accuracy": 0.9570330619812012, "num_tokens": 9446533.0, "step": 3000 }, { "epoch": 3.3975084937712343, "eval_entropy": 0.2167282881799029, "eval_loss": 0.3606521189212799, "eval_mean_token_accuracy": 0.9296634993446407, "eval_num_tokens": 9446533.0, "eval_runtime": 22.8577, "eval_samples_per_second": 46.768, "eval_steps_per_second": 5.862, "step": 3000 }, { "entropy": 0.19131319522857665, "epoch": 3.408833522083805, "grad_norm": 2.902683973312378, "learning_rate": 1.7748635159830286e-05, "loss": 0.1543, "mean_token_accuracy": 0.9540064781904221, "num_tokens": 9478106.0, "step": 3010 }, { "entropy": 0.1933427281677723, "epoch": 3.420158550396376, "grad_norm": 3.3666422367095947, "learning_rate": 1.759539055948806e-05, "loss": 0.1498, "mean_token_accuracy": 0.9571747660636902, "num_tokens": 9511243.0, "step": 3020 }, { "entropy": 0.1907684937119484, "epoch": 3.4314835787089466, "grad_norm": 2.353508710861206, "learning_rate": 1.7442450333638073e-05, "loss": 0.1544, "mean_token_accuracy": 0.9548930257558823, "num_tokens": 9540974.0, "step": 3030 }, { "entropy": 0.2047928489744663, "epoch": 3.4428086070215174, "grad_norm": 2.5164899826049805, "learning_rate": 1.728982076905396e-05, "loss": 0.1651, "mean_token_accuracy": 0.9520281046628952, "num_tokens": 9571441.0, "step": 3040 }, { "entropy": 0.18354908376932144, "epoch": 3.454133635334088, "grad_norm": 2.7729098796844482, "learning_rate": 1.7137508139739327e-05, "loss": 0.1409, "mean_token_accuracy": 0.9593415051698685, "num_tokens": 9601231.0, "step": 3050 }, { "entropy": 0.1844599574804306, "epoch": 3.4654586636466593, "grad_norm": 2.361435890197754, "learning_rate": 1.6985518706669794e-05, "loss": 0.147, "mean_token_accuracy": 0.9563383430242538, "num_tokens": 9632314.0, "step": 3060 }, { "entropy": 0.18727928251028061, "epoch": 3.4767836919592296, "grad_norm": 3.096991539001465, "learning_rate": 1.6833858717535657e-05, "loss": 0.1489, "mean_token_accuracy": 0.9559879064559936, "num_tokens": 9664358.0, "step": 3070 }, { "entropy": 0.18697959557175636, "epoch": 3.488108720271801, "grad_norm": 2.7007927894592285, "learning_rate": 1.6682534406485055e-05, "loss": 0.1435, "mean_token_accuracy": 0.9573837488889694, "num_tokens": 9694084.0, "step": 3080 }, { "entropy": 0.17963240146636963, "epoch": 3.4994337485843716, "grad_norm": 2.9009249210357666, "learning_rate": 1.6531551993867717e-05, "loss": 0.1488, "mean_token_accuracy": 0.957426118850708, "num_tokens": 9726062.0, "step": 3090 }, { "entropy": 0.18775674030184747, "epoch": 3.5107587768969424, "grad_norm": 2.6308300495147705, "learning_rate": 1.638091768597927e-05, "loss": 0.1468, "mean_token_accuracy": 0.9567855209112167, "num_tokens": 9757366.0, "step": 3100 }, { "entropy": 0.18580029234290124, "epoch": 3.522083805209513, "grad_norm": 2.537184715270996, "learning_rate": 1.623063767480611e-05, "loss": 0.1451, "mean_token_accuracy": 0.9578706949949265, "num_tokens": 9789224.0, "step": 3110 }, { "entropy": 0.184904894977808, "epoch": 3.533408833522084, "grad_norm": 2.5414226055145264, "learning_rate": 1.6080718137770908e-05, "loss": 0.1492, "mean_token_accuracy": 0.9576172918081284, "num_tokens": 9820220.0, "step": 3120 }, { "entropy": 0.1862082712352276, "epoch": 3.5447338618346547, "grad_norm": 2.7060234546661377, "learning_rate": 1.5931165237478618e-05, "loss": 0.1465, "mean_token_accuracy": 0.9575734734535217, "num_tokens": 9852382.0, "step": 3130 }, { "entropy": 0.19045321121811867, "epoch": 3.5560588901472254, "grad_norm": 2.51277756690979, "learning_rate": 1.5781985121463215e-05, "loss": 0.1529, "mean_token_accuracy": 0.9558474659919739, "num_tokens": 9882680.0, "step": 3140 }, { "entropy": 0.1901462681591511, "epoch": 3.567383918459796, "grad_norm": 3.2841107845306396, "learning_rate": 1.5633183921934975e-05, "loss": 0.1494, "mean_token_accuracy": 0.9553068786859512, "num_tokens": 9912907.0, "step": 3150 }, { "entropy": 0.20788209587335588, "epoch": 3.578708946772367, "grad_norm": 2.5944294929504395, "learning_rate": 1.5484767755528396e-05, "loss": 0.1695, "mean_token_accuracy": 0.9538979262113572, "num_tokens": 9943382.0, "step": 3160 }, { "entropy": 0.19017338678240775, "epoch": 3.5900339750849377, "grad_norm": 2.511800765991211, "learning_rate": 1.5336742723050773e-05, "loss": 0.1508, "mean_token_accuracy": 0.9550507336854934, "num_tokens": 9974102.0, "step": 3170 }, { "entropy": 0.18529783561825752, "epoch": 3.6013590033975085, "grad_norm": 2.57311749458313, "learning_rate": 1.5189114909231417e-05, "loss": 0.1468, "mean_token_accuracy": 0.9569410651922226, "num_tokens": 10006688.0, "step": 3180 }, { "entropy": 0.17686643823981285, "epoch": 3.6126840317100792, "grad_norm": 2.5054280757904053, "learning_rate": 1.5041890382471529e-05, "loss": 0.1447, "mean_token_accuracy": 0.9577518045902252, "num_tokens": 10041266.0, "step": 3190 }, { "entropy": 0.17823027074337006, "epoch": 3.62400906002265, "grad_norm": 2.448424816131592, "learning_rate": 1.4895075194594772e-05, "loss": 0.1432, "mean_token_accuracy": 0.9584116190671921, "num_tokens": 10073211.0, "step": 3200 }, { "entropy": 0.18234781473875045, "epoch": 3.6353340883352208, "grad_norm": 2.9412801265716553, "learning_rate": 1.4748675380598485e-05, "loss": 0.1437, "mean_token_accuracy": 0.958428630232811, "num_tokens": 10103170.0, "step": 3210 }, { "entropy": 0.18727488964796066, "epoch": 3.6466591166477915, "grad_norm": 2.0503034591674805, "learning_rate": 1.4602696958405604e-05, "loss": 0.1461, "mean_token_accuracy": 0.9569843918085098, "num_tokens": 10133784.0, "step": 3220 }, { "entropy": 0.18094087094068528, "epoch": 3.6579841449603623, "grad_norm": 2.4768571853637695, "learning_rate": 1.4457145928617318e-05, "loss": 0.1436, "mean_token_accuracy": 0.956751012802124, "num_tokens": 10165215.0, "step": 3230 }, { "entropy": 0.1724256232380867, "epoch": 3.669309173272933, "grad_norm": 2.701741933822632, "learning_rate": 1.4312028274266368e-05, "loss": 0.1358, "mean_token_accuracy": 0.9607989430427551, "num_tokens": 10198970.0, "step": 3240 }, { "entropy": 0.1781797580420971, "epoch": 3.680634201585504, "grad_norm": 2.8996741771698, "learning_rate": 1.4167349960571139e-05, "loss": 0.1473, "mean_token_accuracy": 0.9563544005155563, "num_tokens": 10231477.0, "step": 3250 }, { "entropy": 0.18036175966262818, "epoch": 3.6919592298980746, "grad_norm": 2.777268409729004, "learning_rate": 1.4023116934690444e-05, "loss": 0.1398, "mean_token_accuracy": 0.9592943549156189, "num_tokens": 10264052.0, "step": 3260 }, { "entropy": 0.17828406393527985, "epoch": 3.7032842582106458, "grad_norm": 2.637403964996338, "learning_rate": 1.3879335125479054e-05, "loss": 0.136, "mean_token_accuracy": 0.9595444172620773, "num_tokens": 10295601.0, "step": 3270 }, { "entropy": 0.18349251225590707, "epoch": 3.714609286523216, "grad_norm": 2.5384583473205566, "learning_rate": 1.3736010443243987e-05, "loss": 0.1452, "mean_token_accuracy": 0.9566281735897064, "num_tokens": 10328327.0, "step": 3280 }, { "entropy": 0.1727997176349163, "epoch": 3.7259343148357873, "grad_norm": 2.7597897052764893, "learning_rate": 1.3593148779501575e-05, "loss": 0.1383, "mean_token_accuracy": 0.9598427444696427, "num_tokens": 10363214.0, "step": 3290 }, { "entropy": 0.17277977019548416, "epoch": 3.7372593431483576, "grad_norm": 2.6197760105133057, "learning_rate": 1.345075600673526e-05, "loss": 0.1351, "mean_token_accuracy": 0.9603388220071792, "num_tokens": 10394380.0, "step": 3300 }, { "entropy": 0.18125973120331765, "epoch": 3.748584371460929, "grad_norm": 2.668290853500366, "learning_rate": 1.330883797815421e-05, "loss": 0.1351, "mean_token_accuracy": 0.9600670874118805, "num_tokens": 10426589.0, "step": 3310 }, { "entropy": 0.17329150810837746, "epoch": 3.7599093997734996, "grad_norm": 2.1576309204101562, "learning_rate": 1.3167400527452722e-05, "loss": 0.1354, "mean_token_accuracy": 0.9584385722875595, "num_tokens": 10459861.0, "step": 3320 }, { "entropy": 0.18031825870275497, "epoch": 3.7712344280860703, "grad_norm": 2.3556900024414062, "learning_rate": 1.3026449468570434e-05, "loss": 0.1431, "mean_token_accuracy": 0.9587339907884598, "num_tokens": 10490225.0, "step": 3330 }, { "entropy": 0.17993075400590897, "epoch": 3.782559456398641, "grad_norm": 2.7811639308929443, "learning_rate": 1.28859905954533e-05, "loss": 0.1413, "mean_token_accuracy": 0.9577191978693008, "num_tokens": 10520675.0, "step": 3340 }, { "entropy": 0.17930143624544143, "epoch": 3.793884484711212, "grad_norm": 2.998231887817383, "learning_rate": 1.2746029681815468e-05, "loss": 0.1378, "mean_token_accuracy": 0.9599625527858734, "num_tokens": 10551792.0, "step": 3350 }, { "entropy": 0.17830280363559722, "epoch": 3.8052095130237826, "grad_norm": 2.379882574081421, "learning_rate": 1.2606572480901887e-05, "loss": 0.1405, "mean_token_accuracy": 0.9589870750904084, "num_tokens": 10582017.0, "step": 3360 }, { "entropy": 0.18163416758179665, "epoch": 3.8165345413363534, "grad_norm": 2.8061420917510986, "learning_rate": 1.2467624725251881e-05, "loss": 0.1398, "mean_token_accuracy": 0.9601597607135772, "num_tokens": 10612802.0, "step": 3370 }, { "entropy": 0.1735350415110588, "epoch": 3.827859569648924, "grad_norm": 2.3078725337982178, "learning_rate": 1.2329192126463466e-05, "loss": 0.1426, "mean_token_accuracy": 0.95964674949646, "num_tokens": 10644168.0, "step": 3380 }, { "entropy": 0.18496472463011743, "epoch": 3.839184597961495, "grad_norm": 2.1106555461883545, "learning_rate": 1.2191280374958558e-05, "loss": 0.1397, "mean_token_accuracy": 0.9582046836614608, "num_tokens": 10675513.0, "step": 3390 }, { "entropy": 0.18191742673516273, "epoch": 3.8505096262740657, "grad_norm": 2.747044324874878, "learning_rate": 1.2053895139749136e-05, "loss": 0.1507, "mean_token_accuracy": 0.9582120895385742, "num_tokens": 10705985.0, "step": 3400 }, { "entropy": 0.17193833813071252, "epoch": 3.8618346545866364, "grad_norm": 2.384255886077881, "learning_rate": 1.1917042068204082e-05, "loss": 0.1298, "mean_token_accuracy": 0.9630542248487473, "num_tokens": 10737566.0, "step": 3410 }, { "entropy": 0.17727019563317298, "epoch": 3.873159682899207, "grad_norm": 2.8619980812072754, "learning_rate": 1.1780726785817161e-05, "loss": 0.1448, "mean_token_accuracy": 0.9592967689037323, "num_tokens": 10767754.0, "step": 3420 }, { "entropy": 0.1765161231160164, "epoch": 3.884484711211778, "grad_norm": 1.9828704595565796, "learning_rate": 1.1644954895975726e-05, "loss": 0.1391, "mean_token_accuracy": 0.9608483374118805, "num_tokens": 10799308.0, "step": 3430 }, { "entropy": 0.17770369723439217, "epoch": 3.8958097395243487, "grad_norm": 2.346670150756836, "learning_rate": 1.1509731979730393e-05, "loss": 0.1373, "mean_token_accuracy": 0.9591909050941467, "num_tokens": 10831057.0, "step": 3440 }, { "entropy": 0.18057307675480844, "epoch": 3.9071347678369195, "grad_norm": 2.6184489727020264, "learning_rate": 1.1375063595565597e-05, "loss": 0.1389, "mean_token_accuracy": 0.9603639721870423, "num_tokens": 10862132.0, "step": 3450 }, { "entropy": 0.17516846358776092, "epoch": 3.9184597961494902, "grad_norm": 2.5211005210876465, "learning_rate": 1.1240955279171161e-05, "loss": 0.1355, "mean_token_accuracy": 0.9610599786043167, "num_tokens": 10892085.0, "step": 3460 }, { "entropy": 0.17097302675247192, "epoch": 3.929784824462061, "grad_norm": 2.9019627571105957, "learning_rate": 1.1107412543214707e-05, "loss": 0.1368, "mean_token_accuracy": 0.9616718500852585, "num_tokens": 10924497.0, "step": 3470 }, { "entropy": 0.17731274515390397, "epoch": 3.9411098527746318, "grad_norm": 2.090160369873047, "learning_rate": 1.0974440877115059e-05, "loss": 0.1429, "mean_token_accuracy": 0.9593401104211807, "num_tokens": 10954535.0, "step": 3480 }, { "entropy": 0.174043457955122, "epoch": 3.9524348810872025, "grad_norm": 2.19345760345459, "learning_rate": 1.0842045746816604e-05, "loss": 0.1334, "mean_token_accuracy": 0.9610940903425217, "num_tokens": 10984773.0, "step": 3490 }, { "entropy": 0.1764768786728382, "epoch": 3.9637599093997737, "grad_norm": 2.4305057525634766, "learning_rate": 1.0710232594564573e-05, "loss": 0.1395, "mean_token_accuracy": 0.9584390819072723, "num_tokens": 11016942.0, "step": 3500 }, { "epoch": 3.9637599093997737, "eval_entropy": 0.19162503318555318, "eval_loss": 0.3246709704399109, "eval_mean_token_accuracy": 0.9435568692079231, "eval_num_tokens": 11016942.0, "eval_runtime": 22.7597, "eval_samples_per_second": 46.969, "eval_steps_per_second": 5.888, "step": 3500 }, { "entropy": 0.1761431761085987, "epoch": 3.975084937712344, "grad_norm": 2.183934450149536, "learning_rate": 1.057900683868138e-05, "loss": 0.1371, "mean_token_accuracy": 0.9605765461921691, "num_tokens": 11047956.0, "step": 3510 }, { "entropy": 0.18299319073557854, "epoch": 3.9864099660249153, "grad_norm": 2.257868766784668, "learning_rate": 1.0448373873343892e-05, "loss": 0.1442, "mean_token_accuracy": 0.9576489955186844, "num_tokens": 11077078.0, "step": 3520 }, { "entropy": 0.1684267781674862, "epoch": 3.9977349943374856, "grad_norm": 2.2900640964508057, "learning_rate": 1.0318339068361657e-05, "loss": 0.1332, "mean_token_accuracy": 0.961523225903511, "num_tokens": 11110785.0, "step": 3530 }, { "entropy": 0.13921927139163018, "epoch": 4.009060022650057, "grad_norm": 1.109343409538269, "learning_rate": 1.018890776895618e-05, "loss": 0.0855, "mean_token_accuracy": 0.9792260736227035, "num_tokens": 11140853.0, "step": 3540 }, { "entropy": 0.12029264867305756, "epoch": 4.020385050962627, "grad_norm": 2.0965583324432373, "learning_rate": 1.0060085295541242e-05, "loss": 0.0692, "mean_token_accuracy": 0.982733103632927, "num_tokens": 11171432.0, "step": 3550 }, { "entropy": 0.1098148312419653, "epoch": 4.031710079275198, "grad_norm": 1.9380742311477661, "learning_rate": 9.931876943504159e-06, "loss": 0.0706, "mean_token_accuracy": 0.9828455358743667, "num_tokens": 11202059.0, "step": 3560 }, { "entropy": 0.10720174796879292, "epoch": 4.043035107587769, "grad_norm": 1.4595756530761719, "learning_rate": 9.804287982988128e-06, "loss": 0.0699, "mean_token_accuracy": 0.9834076315164566, "num_tokens": 11233844.0, "step": 3570 }, { "entropy": 0.0977464810013771, "epoch": 4.05436013590034, "grad_norm": 1.4825645685195923, "learning_rate": 9.677323658675594e-06, "loss": 0.0644, "mean_token_accuracy": 0.9842707842588425, "num_tokens": 11268276.0, "step": 3580 }, { "entropy": 0.10380081199109555, "epoch": 4.06568516421291, "grad_norm": 1.125681757926941, "learning_rate": 9.550989189572623e-06, "loss": 0.0664, "mean_token_accuracy": 0.983523941040039, "num_tokens": 11300315.0, "step": 3590 }, { "entropy": 0.11478501930832863, "epoch": 4.077010192525481, "grad_norm": 2.136237144470215, "learning_rate": 9.425289768794433e-06, "loss": 0.0741, "mean_token_accuracy": 0.9814123988151551, "num_tokens": 11328482.0, "step": 3600 }, { "entropy": 0.10670767351984978, "epoch": 4.088335220838052, "grad_norm": 1.1652253866195679, "learning_rate": 9.300230563351877e-06, "loss": 0.0666, "mean_token_accuracy": 0.9832827031612397, "num_tokens": 11359860.0, "step": 3610 }, { "entropy": 0.10554791763424873, "epoch": 4.099660249150623, "grad_norm": 1.7422910928726196, "learning_rate": 9.17581671393908e-06, "loss": 0.0663, "mean_token_accuracy": 0.9834140241146088, "num_tokens": 11391326.0, "step": 3620 }, { "entropy": 0.09950958117842675, "epoch": 4.110985277463194, "grad_norm": 1.6092437505722046, "learning_rate": 9.052053334722075e-06, "loss": 0.0665, "mean_token_accuracy": 0.983272585272789, "num_tokens": 11424746.0, "step": 3630 }, { "entropy": 0.10113044157624244, "epoch": 4.122310305775764, "grad_norm": 1.7780877351760864, "learning_rate": 8.928945513128648e-06, "loss": 0.0679, "mean_token_accuracy": 0.9828346908092499, "num_tokens": 11456151.0, "step": 3640 }, { "entropy": 0.10017617046833038, "epoch": 4.133635334088336, "grad_norm": 1.337113857269287, "learning_rate": 8.806498309639161e-06, "loss": 0.0653, "mean_token_accuracy": 0.9838932573795318, "num_tokens": 11489218.0, "step": 3650 }, { "entropy": 0.09929984286427498, "epoch": 4.144960362400906, "grad_norm": 1.4660295248031616, "learning_rate": 8.684716757578559e-06, "loss": 0.0625, "mean_token_accuracy": 0.9850300490856171, "num_tokens": 11522506.0, "step": 3660 }, { "entropy": 0.10483775176107883, "epoch": 4.156285390713477, "grad_norm": 1.3711936473846436, "learning_rate": 8.563605862909466e-06, "loss": 0.0674, "mean_token_accuracy": 0.983280673623085, "num_tokens": 11554172.0, "step": 3670 }, { "entropy": 0.1068756926804781, "epoch": 4.1676104190260475, "grad_norm": 1.3237987756729126, "learning_rate": 8.443170604026388e-06, "loss": 0.069, "mean_token_accuracy": 0.9830775171518326, "num_tokens": 11586956.0, "step": 3680 }, { "entropy": 0.10014188215136528, "epoch": 4.178935447338619, "grad_norm": 1.6192259788513184, "learning_rate": 8.323415931551114e-06, "loss": 0.0634, "mean_token_accuracy": 0.9846415996551514, "num_tokens": 11619759.0, "step": 3690 }, { "entropy": 0.10651820972561836, "epoch": 4.190260475651189, "grad_norm": 1.3025074005126953, "learning_rate": 8.204346768129182e-06, "loss": 0.0694, "mean_token_accuracy": 0.9827026188373565, "num_tokens": 11652472.0, "step": 3700 }, { "entropy": 0.10732679478824139, "epoch": 4.20158550396376, "grad_norm": 2.4277567863464355, "learning_rate": 8.085968008227545e-06, "loss": 0.0701, "mean_token_accuracy": 0.9824820995330811, "num_tokens": 11685051.0, "step": 3710 }, { "entropy": 0.10409895218908786, "epoch": 4.2129105322763305, "grad_norm": 1.18539559841156, "learning_rate": 7.96828451793335e-06, "loss": 0.0642, "mean_token_accuracy": 0.9833295792341232, "num_tokens": 11717668.0, "step": 3720 }, { "entropy": 0.1043463621288538, "epoch": 4.224235560588902, "grad_norm": 1.7125917673110962, "learning_rate": 7.851301134753958e-06, "loss": 0.0693, "mean_token_accuracy": 0.9828597515821457, "num_tokens": 11748692.0, "step": 3730 }, { "entropy": 0.10716413073241711, "epoch": 4.235560588901472, "grad_norm": 1.7997198104858398, "learning_rate": 7.735022667418054e-06, "loss": 0.0666, "mean_token_accuracy": 0.9837399870157242, "num_tokens": 11779854.0, "step": 3740 }, { "entropy": 0.10388663597404957, "epoch": 4.246885617214043, "grad_norm": 1.4692825078964233, "learning_rate": 7.619453895678002e-06, "loss": 0.0704, "mean_token_accuracy": 0.982433345913887, "num_tokens": 11811159.0, "step": 3750 }, { "entropy": 0.09520137794315815, "epoch": 4.2582106455266135, "grad_norm": 1.4710146188735962, "learning_rate": 7.504599570113355e-06, "loss": 0.0639, "mean_token_accuracy": 0.9837541997432708, "num_tokens": 11845982.0, "step": 3760 }, { "entropy": 0.1049416147172451, "epoch": 4.269535673839185, "grad_norm": 1.5879381895065308, "learning_rate": 7.3904644119355856e-06, "loss": 0.0672, "mean_token_accuracy": 0.9831987112760544, "num_tokens": 11877055.0, "step": 3770 }, { "entropy": 0.11051239259541035, "epoch": 4.280860702151755, "grad_norm": 1.4687776565551758, "learning_rate": 7.277053112794008e-06, "loss": 0.0698, "mean_token_accuracy": 0.9820594847202301, "num_tokens": 11908759.0, "step": 3780 }, { "entropy": 0.09960917495191098, "epoch": 4.292185730464326, "grad_norm": 1.4004406929016113, "learning_rate": 7.164370334582929e-06, "loss": 0.0638, "mean_token_accuracy": 0.984798014163971, "num_tokens": 11941140.0, "step": 3790 }, { "entropy": 0.11478472761809826, "epoch": 4.303510758776897, "grad_norm": 1.6815623044967651, "learning_rate": 7.052420709250018e-06, "loss": 0.0728, "mean_token_accuracy": 0.9819641202688217, "num_tokens": 11969922.0, "step": 3800 }, { "entropy": 0.1214354656636715, "epoch": 4.314835787089468, "grad_norm": 1.6286342144012451, "learning_rate": 6.941208838605884e-06, "loss": 0.0783, "mean_token_accuracy": 0.9797982454299927, "num_tokens": 11997173.0, "step": 3810 }, { "entropy": 0.09785076789557934, "epoch": 4.326160815402038, "grad_norm": 1.5718897581100464, "learning_rate": 6.830739294134944e-06, "loss": 0.0642, "mean_token_accuracy": 0.9843629568815231, "num_tokens": 12030969.0, "step": 3820 }, { "entropy": 0.09950311519205571, "epoch": 4.337485843714609, "grad_norm": 1.7603079080581665, "learning_rate": 6.721016616807499e-06, "loss": 0.0655, "mean_token_accuracy": 0.9839122086763382, "num_tokens": 12062700.0, "step": 3830 }, { "entropy": 0.10633566826581956, "epoch": 4.34881087202718, "grad_norm": 1.8313571214675903, "learning_rate": 6.6120453168930565e-06, "loss": 0.0667, "mean_token_accuracy": 0.9825680136680603, "num_tokens": 12094124.0, "step": 3840 }, { "entropy": 0.10049457848072052, "epoch": 4.360135900339751, "grad_norm": 1.9124850034713745, "learning_rate": 6.50382987377495e-06, "loss": 0.064, "mean_token_accuracy": 0.9843475848436356, "num_tokens": 12126923.0, "step": 3850 }, { "entropy": 0.09917602241039276, "epoch": 4.371460928652322, "grad_norm": 2.690685510635376, "learning_rate": 6.396374735766181e-06, "loss": 0.0654, "mean_token_accuracy": 0.984014829993248, "num_tokens": 12162914.0, "step": 3860 }, { "entropy": 0.10630933344364166, "epoch": 4.382785956964892, "grad_norm": 1.5558531284332275, "learning_rate": 6.289684319926609e-06, "loss": 0.0688, "mean_token_accuracy": 0.9832538574934006, "num_tokens": 12193386.0, "step": 3870 }, { "entropy": 0.10831660889089108, "epoch": 4.394110985277464, "grad_norm": 1.1746289730072021, "learning_rate": 6.183763011881349e-06, "loss": 0.0678, "mean_token_accuracy": 0.9832908064126968, "num_tokens": 12224466.0, "step": 3880 }, { "entropy": 0.09632682017982006, "epoch": 4.405436013590034, "grad_norm": 1.4813034534454346, "learning_rate": 6.078615165640514e-06, "loss": 0.0625, "mean_token_accuracy": 0.9853335320949554, "num_tokens": 12258416.0, "step": 3890 }, { "entropy": 0.10236595608294011, "epoch": 4.416761041902605, "grad_norm": 1.622856616973877, "learning_rate": 5.9742451034202225e-06, "loss": 0.0669, "mean_token_accuracy": 0.9836599975824356, "num_tokens": 12291050.0, "step": 3900 }, { "entropy": 0.09877747371792793, "epoch": 4.428086070215175, "grad_norm": 1.5723583698272705, "learning_rate": 5.870657115464945e-06, "loss": 0.0649, "mean_token_accuracy": 0.9841223329305648, "num_tokens": 12323596.0, "step": 3910 }, { "entropy": 0.1077471561729908, "epoch": 4.439411098527747, "grad_norm": 1.604130744934082, "learning_rate": 5.767855459871138e-06, "loss": 0.0692, "mean_token_accuracy": 0.9839894771575928, "num_tokens": 12353006.0, "step": 3920 }, { "entropy": 0.11380970776081085, "epoch": 4.450736126840317, "grad_norm": 1.3762873411178589, "learning_rate": 5.66584436241222e-06, "loss": 0.0722, "mean_token_accuracy": 0.9826073527336121, "num_tokens": 12381072.0, "step": 3930 }, { "entropy": 0.1162498328834772, "epoch": 4.462061155152888, "grad_norm": 1.0174576044082642, "learning_rate": 5.564628016364862e-06, "loss": 0.0812, "mean_token_accuracy": 0.9817124873399734, "num_tokens": 12411917.0, "step": 3940 }, { "entropy": 0.11888325586915016, "epoch": 4.4733861834654585, "grad_norm": 1.0936172008514404, "learning_rate": 5.464210582336595e-06, "loss": 0.0778, "mean_token_accuracy": 0.9785851925611496, "num_tokens": 12444726.0, "step": 3950 }, { "entropy": 0.1004635065793991, "epoch": 4.48471121177803, "grad_norm": 1.5305367708206177, "learning_rate": 5.364596188094839e-06, "loss": 0.0646, "mean_token_accuracy": 0.9839898198843002, "num_tokens": 12478621.0, "step": 3960 }, { "entropy": 0.11077578589320183, "epoch": 4.4960362400906, "grad_norm": 1.8867307901382446, "learning_rate": 5.265788928397172e-06, "loss": 0.0737, "mean_token_accuracy": 0.981790030002594, "num_tokens": 12506545.0, "step": 3970 }, { "entropy": 0.10919649675488471, "epoch": 4.507361268403171, "grad_norm": 1.829324722290039, "learning_rate": 5.16779286482304e-06, "loss": 0.0693, "mean_token_accuracy": 0.9827600866556168, "num_tokens": 12536367.0, "step": 3980 }, { "entropy": 0.10490477122366429, "epoch": 4.5186862967157415, "grad_norm": 1.3586397171020508, "learning_rate": 5.070612025606799e-06, "loss": 0.0689, "mean_token_accuracy": 0.9822495877742767, "num_tokens": 12567694.0, "step": 3990 }, { "entropy": 0.10469603314995765, "epoch": 4.530011325028313, "grad_norm": 2.1284432411193848, "learning_rate": 4.974250405472109e-06, "loss": 0.0655, "mean_token_accuracy": 0.9836343795061111, "num_tokens": 12599121.0, "step": 4000 }, { "epoch": 4.530011325028313, "eval_entropy": 0.14545185185635268, "eval_loss": 0.33652952313423157, "eval_mean_token_accuracy": 0.947298993815237, "eval_num_tokens": 12599121.0, "eval_runtime": 22.7646, "eval_samples_per_second": 46.959, "eval_steps_per_second": 5.886, "step": 4000 }, { "entropy": 0.09632386714220047, "epoch": 4.541336353340883, "grad_norm": 1.4618823528289795, "learning_rate": 4.87871196546775e-06, "loss": 0.0634, "mean_token_accuracy": 0.9848664104938507, "num_tokens": 12633049.0, "step": 4010 }, { "entropy": 0.09856323637068272, "epoch": 4.552661381653454, "grad_norm": 1.6553484201431274, "learning_rate": 4.784000632804797e-06, "loss": 0.0653, "mean_token_accuracy": 0.9834588289260864, "num_tokens": 12667038.0, "step": 4020 }, { "entropy": 0.1045151598751545, "epoch": 4.563986409966025, "grad_norm": 1.4048621654510498, "learning_rate": 4.690120300695175e-06, "loss": 0.0669, "mean_token_accuracy": 0.9840848416090011, "num_tokens": 12698493.0, "step": 4030 }, { "entropy": 0.09922835528850556, "epoch": 4.575311438278596, "grad_norm": 1.3776944875717163, "learning_rate": 4.597074828191633e-06, "loss": 0.0649, "mean_token_accuracy": 0.9843086212873459, "num_tokens": 12731001.0, "step": 4040 }, { "entropy": 0.10786725878715515, "epoch": 4.586636466591166, "grad_norm": 1.3582957983016968, "learning_rate": 4.504868040029112e-06, "loss": 0.0704, "mean_token_accuracy": 0.9824901282787323, "num_tokens": 12761747.0, "step": 4050 }, { "entropy": 0.10388052202761174, "epoch": 4.597961494903737, "grad_norm": 1.312872052192688, "learning_rate": 4.4135037264675244e-06, "loss": 0.0665, "mean_token_accuracy": 0.9838407725095749, "num_tokens": 12791779.0, "step": 4060 }, { "entropy": 0.0987605456262827, "epoch": 4.6092865232163085, "grad_norm": 1.818518042564392, "learning_rate": 4.322985643135952e-06, "loss": 0.0655, "mean_token_accuracy": 0.9834914088249207, "num_tokens": 12823785.0, "step": 4070 }, { "entropy": 0.09871327206492424, "epoch": 4.620611551528879, "grad_norm": 1.7639421224594116, "learning_rate": 4.233317510878271e-06, "loss": 0.0631, "mean_token_accuracy": 0.9850008130073548, "num_tokens": 12856087.0, "step": 4080 }, { "entropy": 0.11288162134587765, "epoch": 4.631936579841449, "grad_norm": 1.7146990299224854, "learning_rate": 4.1445030156001765e-06, "loss": 0.0714, "mean_token_accuracy": 0.9824142724275589, "num_tokens": 12885222.0, "step": 4090 }, { "entropy": 0.11302966885268688, "epoch": 4.64326160815402, "grad_norm": 1.8406291007995605, "learning_rate": 4.0565458081177174e-06, "loss": 0.0713, "mean_token_accuracy": 0.9816466063261032, "num_tokens": 12915084.0, "step": 4100 }, { "entropy": 0.10371216647326946, "epoch": 4.6545866364665915, "grad_norm": 1.4874554872512817, "learning_rate": 3.969449504007189e-06, "loss": 0.0655, "mean_token_accuracy": 0.9841427594423294, "num_tokens": 12946310.0, "step": 4110 }, { "entropy": 0.09823175929486752, "epoch": 4.665911664779162, "grad_norm": 1.4839348793029785, "learning_rate": 3.883217683456522e-06, "loss": 0.0624, "mean_token_accuracy": 0.984762093424797, "num_tokens": 12979829.0, "step": 4120 }, { "entropy": 0.09992509298026561, "epoch": 4.677236693091733, "grad_norm": 1.2307822704315186, "learning_rate": 3.7978538911181054e-06, "loss": 0.065, "mean_token_accuracy": 0.9842061281204224, "num_tokens": 13011881.0, "step": 4130 }, { "entropy": 0.10551130101084709, "epoch": 4.688561721404303, "grad_norm": 1.4577821493148804, "learning_rate": 3.7133616359630974e-06, "loss": 0.0658, "mean_token_accuracy": 0.9835035592317581, "num_tokens": 13042506.0, "step": 4140 }, { "entropy": 0.10092180483043194, "epoch": 4.699886749716875, "grad_norm": 1.5127148628234863, "learning_rate": 3.6297443911371744e-06, "loss": 0.0687, "mean_token_accuracy": 0.9826838493347168, "num_tokens": 13074275.0, "step": 4150 }, { "entropy": 0.1087297696620226, "epoch": 4.711211778029445, "grad_norm": 1.520949125289917, "learning_rate": 3.547005593817765e-06, "loss": 0.0691, "mean_token_accuracy": 0.9830662041902543, "num_tokens": 13104642.0, "step": 4160 }, { "entropy": 0.10933543741703033, "epoch": 4.722536806342016, "grad_norm": 1.3643742799758911, "learning_rate": 3.4651486450727633e-06, "loss": 0.0676, "mean_token_accuracy": 0.9833125978708267, "num_tokens": 13134253.0, "step": 4170 }, { "entropy": 0.11058039665222168, "epoch": 4.733861834654586, "grad_norm": 1.106510043144226, "learning_rate": 3.384176909720718e-06, "loss": 0.0668, "mean_token_accuracy": 0.9826488733291626, "num_tokens": 13165807.0, "step": 4180 }, { "entropy": 0.10953018330037594, "epoch": 4.745186862967158, "grad_norm": 1.3522114753723145, "learning_rate": 3.304093716192527e-06, "loss": 0.0675, "mean_token_accuracy": 0.9823530614376068, "num_tokens": 13194564.0, "step": 4190 }, { "entropy": 0.1017619101330638, "epoch": 4.756511891279728, "grad_norm": 1.3031588792800903, "learning_rate": 3.22490235639461e-06, "loss": 0.0655, "mean_token_accuracy": 0.9847893178462982, "num_tokens": 13227659.0, "step": 4200 }, { "entropy": 0.11424098871648311, "epoch": 4.767836919592299, "grad_norm": 1.2343190908432007, "learning_rate": 3.146606085573603e-06, "loss": 0.0712, "mean_token_accuracy": 0.9818738162517547, "num_tokens": 13255783.0, "step": 4210 }, { "entropy": 0.1053063541650772, "epoch": 4.7791619479048695, "grad_norm": 1.8224282264709473, "learning_rate": 3.0692081221825235e-06, "loss": 0.0655, "mean_token_accuracy": 0.9835486084222793, "num_tokens": 13286564.0, "step": 4220 }, { "entropy": 0.10083051659166813, "epoch": 4.790486976217441, "grad_norm": 1.631349802017212, "learning_rate": 2.992711647748503e-06, "loss": 0.0626, "mean_token_accuracy": 0.9853236883878708, "num_tokens": 13319319.0, "step": 4230 }, { "entropy": 0.10127135142683982, "epoch": 4.801812004530011, "grad_norm": 1.8996741771697998, "learning_rate": 2.917119806741991e-06, "loss": 0.0642, "mean_token_accuracy": 0.9843530178070068, "num_tokens": 13351309.0, "step": 4240 }, { "entropy": 0.10405859649181366, "epoch": 4.813137032842582, "grad_norm": 1.651363730430603, "learning_rate": 2.842435706447499e-06, "loss": 0.0676, "mean_token_accuracy": 0.9830576866865158, "num_tokens": 13381712.0, "step": 4250 }, { "entropy": 0.10092639103531838, "epoch": 4.8244620611551525, "grad_norm": 1.4375232458114624, "learning_rate": 2.7686624168358766e-06, "loss": 0.0674, "mean_token_accuracy": 0.9841265827417374, "num_tokens": 13413182.0, "step": 4260 }, { "entropy": 0.10942405574023724, "epoch": 4.835787089467724, "grad_norm": 1.705937385559082, "learning_rate": 2.6958029704380995e-06, "loss": 0.0691, "mean_token_accuracy": 0.9832047790288925, "num_tokens": 13443178.0, "step": 4270 }, { "entropy": 0.10540841370821, "epoch": 4.847112117780295, "grad_norm": 1.5828317403793335, "learning_rate": 2.6238603622206427e-06, "loss": 0.0681, "mean_token_accuracy": 0.9836997509002685, "num_tokens": 13472760.0, "step": 4280 }, { "entropy": 0.10759205557405949, "epoch": 4.858437146092865, "grad_norm": 2.081759452819824, "learning_rate": 2.5528375494623446e-06, "loss": 0.0691, "mean_token_accuracy": 0.9837682038545609, "num_tokens": 13502402.0, "step": 4290 }, { "entropy": 0.10228043347597122, "epoch": 4.869762174405436, "grad_norm": 1.5169647932052612, "learning_rate": 2.482737451632877e-06, "loss": 0.0656, "mean_token_accuracy": 0.9840810686349869, "num_tokens": 13533069.0, "step": 4300 }, { "entropy": 0.10236416049301625, "epoch": 4.881087202718007, "grad_norm": 1.245417833328247, "learning_rate": 2.4135629502726825e-06, "loss": 0.0656, "mean_token_accuracy": 0.9837470918893814, "num_tokens": 13564257.0, "step": 4310 }, { "entropy": 0.10065983273088933, "epoch": 4.892412231030578, "grad_norm": 1.7363886833190918, "learning_rate": 2.34531688887458e-06, "loss": 0.0646, "mean_token_accuracy": 0.9847758024930954, "num_tokens": 13594944.0, "step": 4320 }, { "entropy": 0.09625008963048458, "epoch": 4.903737259343148, "grad_norm": 1.4529474973678589, "learning_rate": 2.2780020727668523e-06, "loss": 0.063, "mean_token_accuracy": 0.9846002101898194, "num_tokens": 13626469.0, "step": 4330 }, { "entropy": 0.10502854771912098, "epoch": 4.9150622876557195, "grad_norm": 1.4248895645141602, "learning_rate": 2.211621268997932e-06, "loss": 0.0665, "mean_token_accuracy": 0.9832880735397339, "num_tokens": 13655878.0, "step": 4340 }, { "entropy": 0.09599109441041946, "epoch": 4.92638731596829, "grad_norm": 1.3314168453216553, "learning_rate": 2.1461772062226744e-06, "loss": 0.0618, "mean_token_accuracy": 0.9860332638025284, "num_tokens": 13689056.0, "step": 4350 }, { "entropy": 0.10332580804824829, "epoch": 4.937712344280861, "grad_norm": 2.152930498123169, "learning_rate": 2.0816725745901598e-06, "loss": 0.0649, "mean_token_accuracy": 0.983710652589798, "num_tokens": 13720715.0, "step": 4360 }, { "entropy": 0.10360825695097446, "epoch": 4.949037372593431, "grad_norm": 2.080280065536499, "learning_rate": 2.01811002563316e-06, "loss": 0.0649, "mean_token_accuracy": 0.9848357111215591, "num_tokens": 13751367.0, "step": 4370 }, { "entropy": 0.10016910135746002, "epoch": 4.9603624009060026, "grad_norm": 1.3621736764907837, "learning_rate": 1.955492172159096e-06, "loss": 0.0641, "mean_token_accuracy": 0.9845564514398575, "num_tokens": 13782327.0, "step": 4380 }, { "entropy": 0.09491119757294655, "epoch": 4.971687429218573, "grad_norm": 1.176149845123291, "learning_rate": 1.8938215881426773e-06, "loss": 0.0618, "mean_token_accuracy": 0.9852659702301025, "num_tokens": 13814598.0, "step": 4390 }, { "entropy": 0.09827233254909515, "epoch": 4.983012457531144, "grad_norm": 1.7675400972366333, "learning_rate": 1.8331008086200557e-06, "loss": 0.0599, "mean_token_accuracy": 0.9853399336338043, "num_tokens": 13847789.0, "step": 4400 }, { "entropy": 0.10404907390475274, "epoch": 4.994337485843714, "grad_norm": 1.3572454452514648, "learning_rate": 1.7733323295846483e-06, "loss": 0.0694, "mean_token_accuracy": 0.9828858345746994, "num_tokens": 13878498.0, "step": 4410 }, { "entropy": 0.08685027938336135, "epoch": 5.005662514156286, "grad_norm": 0.5327439904212952, "learning_rate": 1.714518607884541e-06, "loss": 0.0552, "mean_token_accuracy": 0.9876868307590485, "num_tokens": 13909014.0, "step": 4420 }, { "entropy": 0.08384513482451439, "epoch": 5.016987542468856, "grad_norm": 0.6326966285705566, "learning_rate": 1.6566620611214722e-06, "loss": 0.0448, "mean_token_accuracy": 0.9899709314107895, "num_tokens": 13940990.0, "step": 4430 }, { "entropy": 0.08707914762198925, "epoch": 5.028312570781427, "grad_norm": 0.6900177001953125, "learning_rate": 1.5997650675514703e-06, "loss": 0.0475, "mean_token_accuracy": 0.9897072285413742, "num_tokens": 13971095.0, "step": 4440 }, { "entropy": 0.0787117276340723, "epoch": 5.0396375990939974, "grad_norm": 0.6966872811317444, "learning_rate": 1.5438299659870897e-06, "loss": 0.0415, "mean_token_accuracy": 0.9909714490175248, "num_tokens": 14006355.0, "step": 4450 }, { "entropy": 0.0797785248607397, "epoch": 5.050962627406569, "grad_norm": 0.8368032574653625, "learning_rate": 1.4888590557012722e-06, "loss": 0.0425, "mean_token_accuracy": 0.9908244729042053, "num_tokens": 14039617.0, "step": 4460 }, { "entropy": 0.07591100987046957, "epoch": 5.062287655719139, "grad_norm": 0.6300851702690125, "learning_rate": 1.4348545963328326e-06, "loss": 0.0425, "mean_token_accuracy": 0.9909938126802444, "num_tokens": 14073268.0, "step": 4470 }, { "entropy": 0.08444061297923326, "epoch": 5.07361268403171, "grad_norm": 0.9781507253646851, "learning_rate": 1.3818188077935807e-06, "loss": 0.0479, "mean_token_accuracy": 0.988967090845108, "num_tokens": 14104311.0, "step": 4480 }, { "entropy": 0.08250515647232533, "epoch": 5.0849377123442805, "grad_norm": 0.8466204404830933, "learning_rate": 1.3297538701770501e-06, "loss": 0.0448, "mean_token_accuracy": 0.9896146982908249, "num_tokens": 14134612.0, "step": 4490 }, { "entropy": 0.07659826464951039, "epoch": 5.096262740656852, "grad_norm": 0.804344117641449, "learning_rate": 1.2786619236689002e-06, "loss": 0.0441, "mean_token_accuracy": 0.9899546831846238, "num_tokens": 14167795.0, "step": 4500 }, { "epoch": 5.096262740656852, "eval_entropy": 0.1306759690893675, "eval_loss": 0.3403843939304352, "eval_mean_token_accuracy": 0.9495465515264824, "eval_num_tokens": 14167795.0, "eval_runtime": 22.7882, "eval_samples_per_second": 46.91, "eval_steps_per_second": 5.88, "step": 4500 }, { "entropy": 0.07452163267880678, "epoch": 5.107587768969422, "grad_norm": 0.6283188462257385, "learning_rate": 1.2285450684589444e-06, "loss": 0.0425, "mean_token_accuracy": 0.9906815826892853, "num_tokens": 14201908.0, "step": 4510 }, { "entropy": 0.07540215756744147, "epoch": 5.118912797281993, "grad_norm": 0.8341459035873413, "learning_rate": 1.1794053646548038e-06, "loss": 0.0424, "mean_token_accuracy": 0.9904711753129959, "num_tokens": 14235583.0, "step": 4520 }, { "entropy": 0.07949476931244134, "epoch": 5.130237825594564, "grad_norm": 0.7760495543479919, "learning_rate": 1.1312448321972313e-06, "loss": 0.0459, "mean_token_accuracy": 0.9900894790887833, "num_tokens": 14266482.0, "step": 4530 }, { "entropy": 0.07272812370210886, "epoch": 5.141562853907135, "grad_norm": 0.8201942443847656, "learning_rate": 1.0840654507770915e-06, "loss": 0.0422, "mean_token_accuracy": 0.9906651735305786, "num_tokens": 14300188.0, "step": 4540 }, { "entropy": 0.08043113667517901, "epoch": 5.152887882219706, "grad_norm": 1.054805040359497, "learning_rate": 1.0378691597539598e-06, "loss": 0.045, "mean_token_accuracy": 0.9899501115083694, "num_tokens": 14330803.0, "step": 4550 }, { "entropy": 0.08116988949477673, "epoch": 5.164212910532276, "grad_norm": 0.7149173021316528, "learning_rate": 9.926578580764234e-07, "loss": 0.0465, "mean_token_accuracy": 0.9893332690000534, "num_tokens": 14362375.0, "step": 4560 }, { "entropy": 0.07761353813111782, "epoch": 5.1755379388448475, "grad_norm": 0.6609951257705688, "learning_rate": 9.484334042040161e-07, "loss": 0.0443, "mean_token_accuracy": 0.9897470384836197, "num_tokens": 14394138.0, "step": 4570 }, { "entropy": 0.08467995468527079, "epoch": 5.186862967157418, "grad_norm": 0.9895986318588257, "learning_rate": 9.051976160308173e-07, "loss": 0.0492, "mean_token_accuracy": 0.9889312475919724, "num_tokens": 14422968.0, "step": 4580 }, { "entropy": 0.07399331219494343, "epoch": 5.198187995469989, "grad_norm": 0.5765529870986938, "learning_rate": 8.629522708107352e-07, "loss": 0.0427, "mean_token_accuracy": 0.9902592331171036, "num_tokens": 14456688.0, "step": 4590 }, { "entropy": 0.07157274391502141, "epoch": 5.209513023782559, "grad_norm": 0.6822951436042786, "learning_rate": 8.21699105084453e-07, "loss": 0.0399, "mean_token_accuracy": 0.9916774541139602, "num_tokens": 14491708.0, "step": 4600 }, { "entropy": 0.08627440240234137, "epoch": 5.2208380520951305, "grad_norm": 1.024498701095581, "learning_rate": 7.814398146080315e-07, "loss": 0.0508, "mean_token_accuracy": 0.9888632267713546, "num_tokens": 14519565.0, "step": 4610 }, { "entropy": 0.0863100489601493, "epoch": 5.232163080407701, "grad_norm": 1.332257866859436, "learning_rate": 7.421760542832223e-07, "loss": 0.0497, "mean_token_accuracy": 0.9890684664249421, "num_tokens": 14548908.0, "step": 4620 }, { "entropy": 0.07913930304348468, "epoch": 5.243488108720272, "grad_norm": 0.6243993043899536, "learning_rate": 7.039094380894201e-07, "loss": 0.0459, "mean_token_accuracy": 0.9895853340625763, "num_tokens": 14580672.0, "step": 4630 }, { "entropy": 0.0805983567610383, "epoch": 5.254813137032842, "grad_norm": 1.0783846378326416, "learning_rate": 6.66641539017343e-07, "loss": 0.0466, "mean_token_accuracy": 0.9895779520273209, "num_tokens": 14610845.0, "step": 4640 }, { "entropy": 0.07961287405341863, "epoch": 5.266138165345414, "grad_norm": 0.718805730342865, "learning_rate": 6.303738890043486e-07, "loss": 0.0461, "mean_token_accuracy": 0.9900570958852768, "num_tokens": 14641210.0, "step": 4650 }, { "entropy": 0.07430734038352967, "epoch": 5.277463193657984, "grad_norm": 0.6966316103935242, "learning_rate": 5.951079788714813e-07, "loss": 0.0429, "mean_token_accuracy": 0.9904509067535401, "num_tokens": 14674054.0, "step": 4660 }, { "entropy": 0.07951049022376537, "epoch": 5.288788221970555, "grad_norm": 0.772232174873352, "learning_rate": 5.608452582621771e-07, "loss": 0.0461, "mean_token_accuracy": 0.9892933934926986, "num_tokens": 14705152.0, "step": 4670 }, { "entropy": 0.08211956918239594, "epoch": 5.300113250283125, "grad_norm": 0.787604033946991, "learning_rate": 5.275871355826828e-07, "loss": 0.0469, "mean_token_accuracy": 0.9894729733467102, "num_tokens": 14734245.0, "step": 4680 }, { "entropy": 0.07479157857596874, "epoch": 5.311438278595697, "grad_norm": 0.6638301014900208, "learning_rate": 4.953349779441619e-07, "loss": 0.0438, "mean_token_accuracy": 0.9901529848575592, "num_tokens": 14766102.0, "step": 4690 }, { "entropy": 0.0770484933629632, "epoch": 5.322763306908267, "grad_norm": 0.6229594945907593, "learning_rate": 4.6409011110648824e-07, "loss": 0.0446, "mean_token_accuracy": 0.9904583126306534, "num_tokens": 14797209.0, "step": 4700 }, { "entropy": 0.07668529693037271, "epoch": 5.334088335220838, "grad_norm": 0.8131431937217712, "learning_rate": 4.338538194237657e-07, "loss": 0.0441, "mean_token_accuracy": 0.9898177713155747, "num_tokens": 14829020.0, "step": 4710 }, { "entropy": 0.0804054843261838, "epoch": 5.3454133635334085, "grad_norm": 1.0860034227371216, "learning_rate": 4.046273457915084e-07, "loss": 0.0465, "mean_token_accuracy": 0.989600470662117, "num_tokens": 14859012.0, "step": 4720 }, { "entropy": 0.07926053572446108, "epoch": 5.35673839184598, "grad_norm": 0.7582584023475647, "learning_rate": 3.7641189159557946e-07, "loss": 0.0465, "mean_token_accuracy": 0.9896339029073715, "num_tokens": 14888725.0, "step": 4730 }, { "entropy": 0.07964552585035563, "epoch": 5.36806342015855, "grad_norm": 0.9673193097114563, "learning_rate": 3.4920861666279116e-07, "loss": 0.0448, "mean_token_accuracy": 0.9897122889757156, "num_tokens": 14920550.0, "step": 4740 }, { "entropy": 0.0836305595934391, "epoch": 5.379388448471121, "grad_norm": 0.9790941476821899, "learning_rate": 3.2301863921322395e-07, "loss": 0.0485, "mean_token_accuracy": 0.9896321624517441, "num_tokens": 14950387.0, "step": 4750 }, { "entropy": 0.07453758120536805, "epoch": 5.3907134767836915, "grad_norm": 0.7324961423873901, "learning_rate": 2.97843035814277e-07, "loss": 0.0427, "mean_token_accuracy": 0.9902149707078933, "num_tokens": 14984224.0, "step": 4760 }, { "entropy": 0.07159966733306647, "epoch": 5.402038505096263, "grad_norm": 0.5410106182098389, "learning_rate": 2.7368284133639234e-07, "loss": 0.0424, "mean_token_accuracy": 0.9906870782375335, "num_tokens": 15017637.0, "step": 4770 }, { "entropy": 0.07854281235486268, "epoch": 5.413363533408834, "grad_norm": 0.5609409809112549, "learning_rate": 2.505390489105419e-07, "loss": 0.0439, "mean_token_accuracy": 0.9902636379003524, "num_tokens": 15049480.0, "step": 4780 }, { "entropy": 0.08379605785012245, "epoch": 5.424688561721404, "grad_norm": 0.8252092003822327, "learning_rate": 2.2841260988738232e-07, "loss": 0.0483, "mean_token_accuracy": 0.9893587976694107, "num_tokens": 15078517.0, "step": 4790 }, { "entropy": 0.08447619508951902, "epoch": 5.436013590033975, "grad_norm": 1.0486787557601929, "learning_rate": 2.0730443379815835e-07, "loss": 0.0486, "mean_token_accuracy": 0.9892659068107605, "num_tokens": 15108019.0, "step": 4800 }, { "entropy": 0.07685894444584847, "epoch": 5.447338618346546, "grad_norm": 0.5011602640151978, "learning_rate": 1.8721538831731334e-07, "loss": 0.0434, "mean_token_accuracy": 0.9901338279247284, "num_tokens": 15140919.0, "step": 4810 }, { "entropy": 0.08181709069758654, "epoch": 5.458663646659117, "grad_norm": 0.6206848621368408, "learning_rate": 1.6814629922682624e-07, "loss": 0.0473, "mean_token_accuracy": 0.9891577929258346, "num_tokens": 15170772.0, "step": 4820 }, { "entropy": 0.08480710126459598, "epoch": 5.469988674971687, "grad_norm": 0.8081274032592773, "learning_rate": 1.5009795038225806e-07, "loss": 0.0488, "mean_token_accuracy": 0.989034104347229, "num_tokens": 15199368.0, "step": 4830 }, { "entropy": 0.08092885501682759, "epoch": 5.4813137032842585, "grad_norm": 1.0287882089614868, "learning_rate": 1.3307108368054155e-07, "loss": 0.0472, "mean_token_accuracy": 0.9893998026847839, "num_tokens": 15228987.0, "step": 4840 }, { "entropy": 0.0767004294320941, "epoch": 5.492638731596829, "grad_norm": 0.712293267250061, "learning_rate": 1.1706639902947791e-07, "loss": 0.0451, "mean_token_accuracy": 0.9901446044445038, "num_tokens": 15260156.0, "step": 4850 }, { "entropy": 0.08494163490831852, "epoch": 5.5039637599094, "grad_norm": 1.0310022830963135, "learning_rate": 1.0208455431896247e-07, "loss": 0.0471, "mean_token_accuracy": 0.9887308567762375, "num_tokens": 15289396.0, "step": 4860 }, { "entropy": 0.07497126292437314, "epoch": 5.51528878822197, "grad_norm": 0.7754374146461487, "learning_rate": 8.812616539395358e-08, "loss": 0.0428, "mean_token_accuracy": 0.9900702744722366, "num_tokens": 15322332.0, "step": 4870 }, { "entropy": 0.07183474581688643, "epoch": 5.5266138165345415, "grad_norm": 0.8449347615242004, "learning_rate": 7.519180602915121e-08, "loss": 0.0424, "mean_token_accuracy": 0.9906940549612046, "num_tokens": 15356004.0, "step": 4880 }, { "entropy": 0.0772839292883873, "epoch": 5.537938844847112, "grad_norm": 0.7060112357139587, "learning_rate": 6.328200790540472e-08, "loss": 0.0435, "mean_token_accuracy": 0.9895416975021363, "num_tokens": 15388730.0, "step": 4890 }, { "entropy": 0.08096686322242022, "epoch": 5.549263873159683, "grad_norm": 0.6333571672439575, "learning_rate": 5.239726058787198e-08, "loss": 0.0453, "mean_token_accuracy": 0.9899547636508942, "num_tokens": 15420424.0, "step": 4900 }, { "entropy": 0.0785508582368493, "epoch": 5.560588901472253, "grad_norm": 0.7323933839797974, "learning_rate": 4.253801150587711e-08, "loss": 0.0464, "mean_token_accuracy": 0.9895732134580613, "num_tokens": 15451624.0, "step": 4910 }, { "entropy": 0.07821750827133656, "epoch": 5.571913929784825, "grad_norm": 0.94260174036026, "learning_rate": 3.370466593453914e-08, "loss": 0.0444, "mean_token_accuracy": 0.989855831861496, "num_tokens": 15482992.0, "step": 4920 }, { "entropy": 0.09697492402046919, "epoch": 5.583238958097395, "grad_norm": 0.9922671318054199, "learning_rate": 2.589758697809086e-08, "loss": 0.0612, "mean_token_accuracy": 0.9866875171661377, "num_tokens": 15513562.0, "step": 4930 }, { "entropy": 0.0751221977174282, "epoch": 5.594563986409966, "grad_norm": 0.5793846845626831, "learning_rate": 1.9117095554974097e-08, "loss": 0.044, "mean_token_accuracy": 0.9899530529975891, "num_tokens": 15545720.0, "step": 4940 }, { "entropy": 0.07994864359498025, "epoch": 5.605889014722536, "grad_norm": 0.6867619752883911, "learning_rate": 1.3363470384636367e-08, "loss": 0.0458, "mean_token_accuracy": 0.9898412346839904, "num_tokens": 15577126.0, "step": 4950 }, { "entropy": 0.0812272697687149, "epoch": 5.617214043035108, "grad_norm": 0.8053082227706909, "learning_rate": 8.636947976065069e-09, "loss": 0.0473, "mean_token_accuracy": 0.9890479534864426, "num_tokens": 15607165.0, "step": 4960 }, { "entropy": 0.09132701512426138, "epoch": 5.628539071347678, "grad_norm": 0.8204180598258972, "learning_rate": 4.9377226180924444e-09, "loss": 0.0562, "mean_token_accuracy": 0.9869016110897064, "num_tokens": 15637782.0, "step": 4970 }, { "entropy": 0.08176330700516701, "epoch": 5.639864099660249, "grad_norm": 1.1686699390411377, "learning_rate": 2.2659463713770036e-09, "loss": 0.0464, "mean_token_accuracy": 0.9895883351564407, "num_tokens": 15668276.0, "step": 4980 }, { "entropy": 0.07596621736884117, "epoch": 5.65118912797282, "grad_norm": 0.7290496230125427, "learning_rate": 6.217290621807204e-10, "loss": 0.0446, "mean_token_accuracy": 0.9898297399282455, "num_tokens": 15700845.0, "step": 4990 }, { "entropy": 0.07350182663649321, "epoch": 5.662514156285391, "grad_norm": 0.7328366041183472, "learning_rate": 5.138277833771632e-12, "loss": 0.0444, "mean_token_accuracy": 0.9903534233570099, "num_tokens": 15733342.0, "step": 5000 }, { "epoch": 5.662514156285391, "eval_entropy": 0.12777303962676384, "eval_loss": 0.3470225930213928, "eval_mean_token_accuracy": 0.9495740079168064, "eval_num_tokens": 15733342.0, "eval_runtime": 22.7824, "eval_samples_per_second": 46.922, "eval_steps_per_second": 5.882, "step": 5000 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.869577281465498e+16, "train_batch_size": 6, "trial_name": null, "trial_params": null }