Safetensors
qwen2
qwen-0.5b-reasoning-v2 / trainer_state.json
44David's picture
Upload model
924deea verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.662514156285391,
"eval_steps": 500,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.2195617377758026,
"epoch": 0.011325028312570781,
"grad_norm": 41.854122161865234,
"learning_rate": 4.5e-06,
"loss": 1.4649,
"mean_token_accuracy": 0.7436085730791092,
"num_tokens": 33232.0,
"step": 10
},
{
"entropy": 0.8825877517461777,
"epoch": 0.022650056625141562,
"grad_norm": 10.884156227111816,
"learning_rate": 9.5e-06,
"loss": 0.8606,
"mean_token_accuracy": 0.8266362160444259,
"num_tokens": 62241.0,
"step": 20
},
{
"entropy": 0.7232772573828697,
"epoch": 0.03397508493771234,
"grad_norm": 9.259407997131348,
"learning_rate": 1.45e-05,
"loss": 0.6503,
"mean_token_accuracy": 0.8444810211658478,
"num_tokens": 92125.0,
"step": 30
},
{
"entropy": 0.6392747581005096,
"epoch": 0.045300113250283124,
"grad_norm": 7.45441198348999,
"learning_rate": 1.9500000000000003e-05,
"loss": 0.5675,
"mean_token_accuracy": 0.8603695809841156,
"num_tokens": 125345.0,
"step": 40
},
{
"entropy": 0.7545703947544098,
"epoch": 0.056625141562853906,
"grad_norm": 7.38155460357666,
"learning_rate": 2.45e-05,
"loss": 0.6309,
"mean_token_accuracy": 0.8437724113464355,
"num_tokens": 155506.0,
"step": 50
},
{
"entropy": 0.7073849737644196,
"epoch": 0.06795016987542468,
"grad_norm": 8.455394744873047,
"learning_rate": 2.95e-05,
"loss": 0.5849,
"mean_token_accuracy": 0.8386865645647049,
"num_tokens": 186173.0,
"step": 60
},
{
"entropy": 0.6777370646595955,
"epoch": 0.07927519818799547,
"grad_norm": 10.703876495361328,
"learning_rate": 3.45e-05,
"loss": 0.6124,
"mean_token_accuracy": 0.823593533039093,
"num_tokens": 218694.0,
"step": 70
},
{
"entropy": 0.6481794014573097,
"epoch": 0.09060022650056625,
"grad_norm": 7.329402923583984,
"learning_rate": 3.9500000000000005e-05,
"loss": 0.6196,
"mean_token_accuracy": 0.8281102895736694,
"num_tokens": 251166.0,
"step": 80
},
{
"entropy": 0.6877432465553284,
"epoch": 0.10192525481313704,
"grad_norm": 7.263290882110596,
"learning_rate": 4.4500000000000004e-05,
"loss": 0.6643,
"mean_token_accuracy": 0.81614009141922,
"num_tokens": 282032.0,
"step": 90
},
{
"entropy": 0.7204893633723259,
"epoch": 0.11325028312570781,
"grad_norm": 7.8037590980529785,
"learning_rate": 4.9500000000000004e-05,
"loss": 0.6983,
"mean_token_accuracy": 0.8097896903753281,
"num_tokens": 313148.0,
"step": 100
},
{
"entropy": 0.760354095697403,
"epoch": 0.1245753114382786,
"grad_norm": 5.422664642333984,
"learning_rate": 4.999958380063603e-05,
"loss": 0.7468,
"mean_token_accuracy": 0.799429202079773,
"num_tokens": 343247.0,
"step": 110
},
{
"entropy": 0.7298643738031387,
"epoch": 0.13590033975084936,
"grad_norm": 6.69228458404541,
"learning_rate": 4.999814510457652e-05,
"loss": 0.7326,
"mean_token_accuracy": 0.8030451983213425,
"num_tokens": 375581.0,
"step": 120
},
{
"entropy": 0.768428310751915,
"epoch": 0.14722536806342015,
"grad_norm": 5.626662254333496,
"learning_rate": 4.99956788326828e-05,
"loss": 0.7484,
"mean_token_accuracy": 0.7996043682098388,
"num_tokens": 408107.0,
"step": 130
},
{
"entropy": 0.7457070380449295,
"epoch": 0.15855039637599094,
"grad_norm": 6.562707901000977,
"learning_rate": 4.9992185086333655e-05,
"loss": 0.7464,
"mean_token_accuracy": 0.8001674145460129,
"num_tokens": 438643.0,
"step": 140
},
{
"entropy": 0.8232061445713044,
"epoch": 0.16987542468856173,
"grad_norm": 5.214873790740967,
"learning_rate": 4.998766400914329e-05,
"loss": 0.795,
"mean_token_accuracy": 0.7896791487932205,
"num_tokens": 469183.0,
"step": 150
},
{
"entropy": 0.7592632710933686,
"epoch": 0.1812004530011325,
"grad_norm": 4.8459625244140625,
"learning_rate": 4.99821157869555e-05,
"loss": 0.7495,
"mean_token_accuracy": 0.8009798556566239,
"num_tokens": 499213.0,
"step": 160
},
{
"entropy": 0.7950803458690643,
"epoch": 0.19252548131370328,
"grad_norm": 4.592363357543945,
"learning_rate": 4.9975540647835964e-05,
"loss": 0.7862,
"mean_token_accuracy": 0.7938760071992874,
"num_tokens": 529845.0,
"step": 170
},
{
"entropy": 0.7862876445055008,
"epoch": 0.20385050962627407,
"grad_norm": 4.092002868652344,
"learning_rate": 4.99679388620629e-05,
"loss": 0.7826,
"mean_token_accuracy": 0.7916485607624054,
"num_tokens": 564300.0,
"step": 180
},
{
"entropy": 0.7688385576009751,
"epoch": 0.21517553793884484,
"grad_norm": 4.930974006652832,
"learning_rate": 4.995931074211594e-05,
"loss": 0.7503,
"mean_token_accuracy": 0.8021332859992981,
"num_tokens": 598731.0,
"step": 190
},
{
"entropy": 0.7965670645236969,
"epoch": 0.22650056625141562,
"grad_norm": 4.959285736083984,
"learning_rate": 4.994965664266331e-05,
"loss": 0.7702,
"mean_token_accuracy": 0.7963653445243836,
"num_tokens": 629368.0,
"step": 200
},
{
"entropy": 0.7948065817356109,
"epoch": 0.23782559456398641,
"grad_norm": 4.833596229553223,
"learning_rate": 4.993897696054722e-05,
"loss": 0.7944,
"mean_token_accuracy": 0.789614400267601,
"num_tokens": 660377.0,
"step": 210
},
{
"entropy": 0.7758382678031921,
"epoch": 0.2491506228765572,
"grad_norm": 4.224271774291992,
"learning_rate": 4.9927272134767566e-05,
"loss": 0.7741,
"mean_token_accuracy": 0.795179745554924,
"num_tokens": 690586.0,
"step": 220
},
{
"entropy": 0.7900628983974457,
"epoch": 0.26047565118912797,
"grad_norm": 3.947178602218628,
"learning_rate": 4.991454264646391e-05,
"loss": 0.766,
"mean_token_accuracy": 0.794954827427864,
"num_tokens": 725572.0,
"step": 230
},
{
"entropy": 0.8080787748098374,
"epoch": 0.2718006795016987,
"grad_norm": 3.5941126346588135,
"learning_rate": 4.9900789018895636e-05,
"loss": 0.7964,
"mean_token_accuracy": 0.7924022555351258,
"num_tokens": 756763.0,
"step": 240
},
{
"entropy": 0.8200718402862549,
"epoch": 0.28312570781426954,
"grad_norm": 3.9727747440338135,
"learning_rate": 4.9886011817420526e-05,
"loss": 0.7799,
"mean_token_accuracy": 0.7895867377519608,
"num_tokens": 787596.0,
"step": 250
},
{
"entropy": 0.8339032739400863,
"epoch": 0.2944507361268403,
"grad_norm": 4.92995548248291,
"learning_rate": 4.9870211649471436e-05,
"loss": 0.8152,
"mean_token_accuracy": 0.7852619111537933,
"num_tokens": 817469.0,
"step": 260
},
{
"entropy": 1.0795388698577881,
"epoch": 0.3057757644394111,
"grad_norm": 9.842768669128418,
"learning_rate": 4.9853389164531396e-05,
"loss": 1.0836,
"mean_token_accuracy": 0.744838410615921,
"num_tokens": 850001.0,
"step": 270
},
{
"entropy": 1.0132719367742538,
"epoch": 0.3171007927519819,
"grad_norm": 5.305619239807129,
"learning_rate": 4.983554505410687e-05,
"loss": 1.01,
"mean_token_accuracy": 0.7551202327013016,
"num_tokens": 880532.0,
"step": 280
},
{
"entropy": 0.8525327175855637,
"epoch": 0.32842582106455265,
"grad_norm": 3.3658530712127686,
"learning_rate": 4.981668005169934e-05,
"loss": 0.8566,
"mean_token_accuracy": 0.7789583891630173,
"num_tokens": 915441.0,
"step": 290
},
{
"entropy": 0.841290682554245,
"epoch": 0.33975084937712347,
"grad_norm": 4.01895809173584,
"learning_rate": 4.979679493277518e-05,
"loss": 0.795,
"mean_token_accuracy": 0.7931433916091919,
"num_tokens": 949771.0,
"step": 300
},
{
"entropy": 0.7752810955047608,
"epoch": 0.3510758776896942,
"grad_norm": 3.522529125213623,
"learning_rate": 4.977589051473373e-05,
"loss": 0.7492,
"mean_token_accuracy": 0.8046811252832413,
"num_tokens": 982523.0,
"step": 310
},
{
"entropy": 0.8144816100597382,
"epoch": 0.362400906002265,
"grad_norm": 3.3812482357025146,
"learning_rate": 4.975396765687375e-05,
"loss": 0.7877,
"mean_token_accuracy": 0.7939431577920913,
"num_tokens": 1016280.0,
"step": 320
},
{
"entropy": 0.8689843416213989,
"epoch": 0.3737259343148358,
"grad_norm": 4.172185897827148,
"learning_rate": 4.9731027260358056e-05,
"loss": 0.8494,
"mean_token_accuracy": 0.7813519924879074,
"num_tokens": 1048322.0,
"step": 330
},
{
"entropy": 0.7746345907449722,
"epoch": 0.38505096262740657,
"grad_norm": 3.5320262908935547,
"learning_rate": 4.9707070268176494e-05,
"loss": 0.7661,
"mean_token_accuracy": 0.7966699630022049,
"num_tokens": 1082053.0,
"step": 340
},
{
"entropy": 0.8384666532278061,
"epoch": 0.39637599093997733,
"grad_norm": 3.9420437812805176,
"learning_rate": 4.9682097665107185e-05,
"loss": 0.8145,
"mean_token_accuracy": 0.7866115182638168,
"num_tokens": 1113077.0,
"step": 350
},
{
"entropy": 0.8227119207382202,
"epoch": 0.40770101925254815,
"grad_norm": 3.9070684909820557,
"learning_rate": 4.965611047767602e-05,
"loss": 0.8011,
"mean_token_accuracy": 0.7920049339532852,
"num_tokens": 1141483.0,
"step": 360
},
{
"entropy": 0.8064505487680436,
"epoch": 0.4190260475651189,
"grad_norm": 3.753575563430786,
"learning_rate": 4.962910977411451e-05,
"loss": 0.798,
"mean_token_accuracy": 0.7958554178476334,
"num_tokens": 1173773.0,
"step": 370
},
{
"entropy": 0.80653215944767,
"epoch": 0.43035107587768967,
"grad_norm": 4.219732284545898,
"learning_rate": 4.96010966643158e-05,
"loss": 0.7906,
"mean_token_accuracy": 0.7969932436943055,
"num_tokens": 1204159.0,
"step": 380
},
{
"entropy": 0.8062887847423553,
"epoch": 0.4416761041902605,
"grad_norm": 3.3941314220428467,
"learning_rate": 4.957207229978913e-05,
"loss": 0.7938,
"mean_token_accuracy": 0.7924901843070984,
"num_tokens": 1236691.0,
"step": 390
},
{
"entropy": 0.7844055652618408,
"epoch": 0.45300113250283125,
"grad_norm": 3.865077018737793,
"learning_rate": 4.9542037873612444e-05,
"loss": 0.7839,
"mean_token_accuracy": 0.7974178075790406,
"num_tokens": 1272447.0,
"step": 400
},
{
"entropy": 0.8854492843151093,
"epoch": 0.464326160815402,
"grad_norm": 3.856250047683716,
"learning_rate": 4.9510994620383354e-05,
"loss": 0.8433,
"mean_token_accuracy": 0.7830950766801834,
"num_tokens": 1303921.0,
"step": 410
},
{
"entropy": 0.805621936917305,
"epoch": 0.47565118912797283,
"grad_norm": 3.9170212745666504,
"learning_rate": 4.9478943816168424e-05,
"loss": 0.8045,
"mean_token_accuracy": 0.7906429260969162,
"num_tokens": 1335991.0,
"step": 420
},
{
"entropy": 0.7835706263780594,
"epoch": 0.4869762174405436,
"grad_norm": 4.020387172698975,
"learning_rate": 4.944588677845068e-05,
"loss": 0.7576,
"mean_token_accuracy": 0.8036289840936661,
"num_tokens": 1369249.0,
"step": 430
},
{
"entropy": 0.7757609993219375,
"epoch": 0.4983012457531144,
"grad_norm": 3.977161407470703,
"learning_rate": 4.941182486607546e-05,
"loss": 0.7724,
"mean_token_accuracy": 0.801279491186142,
"num_tokens": 1399744.0,
"step": 440
},
{
"entropy": 0.8368120342493057,
"epoch": 0.5096262740656852,
"grad_norm": 3.2676987648010254,
"learning_rate": 4.937675947919457e-05,
"loss": 0.8203,
"mean_token_accuracy": 0.7870727032423019,
"num_tokens": 1431385.0,
"step": 450
},
{
"entropy": 0.8545293360948563,
"epoch": 0.5209513023782559,
"grad_norm": 4.302636623382568,
"learning_rate": 4.9340692059208726e-05,
"loss": 0.8189,
"mean_token_accuracy": 0.7853404134511948,
"num_tokens": 1460138.0,
"step": 460
},
{
"entropy": 0.8571904718875885,
"epoch": 0.5322763306908267,
"grad_norm": 3.3016743659973145,
"learning_rate": 4.93036240887083e-05,
"loss": 0.8428,
"mean_token_accuracy": 0.7849946826696396,
"num_tokens": 1492180.0,
"step": 470
},
{
"entropy": 0.7961454778909683,
"epoch": 0.5436013590033975,
"grad_norm": 3.5592470169067383,
"learning_rate": 4.926555709141238e-05,
"loss": 0.7875,
"mean_token_accuracy": 0.7980852603912354,
"num_tokens": 1524445.0,
"step": 480
},
{
"entropy": 0.8381227642297745,
"epoch": 0.5549263873159683,
"grad_norm": 3.524243116378784,
"learning_rate": 4.9226492632106115e-05,
"loss": 0.8016,
"mean_token_accuracy": 0.789078775048256,
"num_tokens": 1556464.0,
"step": 490
},
{
"entropy": 0.8132199555635452,
"epoch": 0.5662514156285391,
"grad_norm": 3.4961235523223877,
"learning_rate": 4.918643231657642e-05,
"loss": 0.8021,
"mean_token_accuracy": 0.7943854779005051,
"num_tokens": 1586084.0,
"step": 500
},
{
"epoch": 0.5662514156285391,
"eval_entropy": 0.7580640996570018,
"eval_loss": 0.7161204218864441,
"eval_mean_token_accuracy": 0.8122646586218877,
"eval_num_tokens": 1586084.0,
"eval_runtime": 23.0676,
"eval_samples_per_second": 46.342,
"eval_steps_per_second": 5.809,
"step": 500
},
{
"entropy": 0.7857061624526978,
"epoch": 0.5775764439411099,
"grad_norm": 3.8294427394866943,
"learning_rate": 4.914537779154596e-05,
"loss": 0.7603,
"mean_token_accuracy": 0.79789317548275,
"num_tokens": 1619063.0,
"step": 510
},
{
"entropy": 0.7763534516096116,
"epoch": 0.5889014722536806,
"grad_norm": 3.318204402923584,
"learning_rate": 4.910333074460548e-05,
"loss": 0.7836,
"mean_token_accuracy": 0.8022702991962433,
"num_tokens": 1649064.0,
"step": 520
},
{
"entropy": 0.7938951075077056,
"epoch": 0.6002265005662514,
"grad_norm": 3.7849605083465576,
"learning_rate": 4.906029290414438e-05,
"loss": 0.7402,
"mean_token_accuracy": 0.803567036986351,
"num_tokens": 1679545.0,
"step": 530
},
{
"entropy": 0.9141301155090332,
"epoch": 0.6115515288788222,
"grad_norm": 4.45604944229126,
"learning_rate": 4.9016266039279703e-05,
"loss": 0.8793,
"mean_token_accuracy": 0.7735334932804108,
"num_tokens": 1708450.0,
"step": 540
},
{
"entropy": 0.8632586270570755,
"epoch": 0.622876557191393,
"grad_norm": 3.9162728786468506,
"learning_rate": 4.897125195978344e-05,
"loss": 0.8436,
"mean_token_accuracy": 0.784154525399208,
"num_tokens": 1739828.0,
"step": 550
},
{
"entropy": 0.7791282266378403,
"epoch": 0.6342015855039638,
"grad_norm": 3.5997514724731445,
"learning_rate": 4.8925252516008066e-05,
"loss": 0.7647,
"mean_token_accuracy": 0.7994277358055115,
"num_tokens": 1771291.0,
"step": 560
},
{
"entropy": 0.80836041867733,
"epoch": 0.6455266138165345,
"grad_norm": 16.00943946838379,
"learning_rate": 4.887826959881058e-05,
"loss": 0.7846,
"mean_token_accuracy": 0.7990806043148041,
"num_tokens": 1803223.0,
"step": 570
},
{
"entropy": 0.7901756346225739,
"epoch": 0.6568516421291053,
"grad_norm": 3.633241653442383,
"learning_rate": 4.883030513947466e-05,
"loss": 0.7681,
"mean_token_accuracy": 0.8016709625720978,
"num_tokens": 1835151.0,
"step": 580
},
{
"entropy": 0.8144267559051513,
"epoch": 0.6681766704416761,
"grad_norm": 3.5593814849853516,
"learning_rate": 4.878136110963139e-05,
"loss": 0.7784,
"mean_token_accuracy": 0.7987326920032501,
"num_tokens": 1869253.0,
"step": 590
},
{
"entropy": 0.7699689626693725,
"epoch": 0.6795016987542469,
"grad_norm": 3.547506093978882,
"learning_rate": 4.873143952117812e-05,
"loss": 0.7678,
"mean_token_accuracy": 0.7996489554643631,
"num_tokens": 1901972.0,
"step": 600
},
{
"entropy": 0.8226570546627044,
"epoch": 0.6908267270668177,
"grad_norm": 3.7377145290374756,
"learning_rate": 4.868054242619582e-05,
"loss": 0.7953,
"mean_token_accuracy": 0.7935166180133819,
"num_tokens": 1934251.0,
"step": 610
},
{
"entropy": 0.8604654908180237,
"epoch": 0.7021517553793885,
"grad_norm": 4.307958126068115,
"learning_rate": 4.862867191686473e-05,
"loss": 0.83,
"mean_token_accuracy": 0.7884718626737595,
"num_tokens": 1963277.0,
"step": 620
},
{
"entropy": 0.8304067760705948,
"epoch": 0.7134767836919592,
"grad_norm": 3.559305429458618,
"learning_rate": 4.857583012537831e-05,
"loss": 0.8245,
"mean_token_accuracy": 0.7876487731933594,
"num_tokens": 1994596.0,
"step": 630
},
{
"entropy": 0.7983928799629212,
"epoch": 0.72480181200453,
"grad_norm": 3.0747625827789307,
"learning_rate": 4.852201922385564e-05,
"loss": 0.7524,
"mean_token_accuracy": 0.8012862592935562,
"num_tokens": 2024407.0,
"step": 640
},
{
"entropy": 0.891300618648529,
"epoch": 0.7361268403171007,
"grad_norm": 4.086672306060791,
"learning_rate": 4.846724142425213e-05,
"loss": 0.8883,
"mean_token_accuracy": 0.7746320635080337,
"num_tokens": 2052217.0,
"step": 650
},
{
"entropy": 0.8403537899255753,
"epoch": 0.7474518686296716,
"grad_norm": 3.159459352493286,
"learning_rate": 4.841149897826856e-05,
"loss": 0.8149,
"mean_token_accuracy": 0.7901445269584656,
"num_tokens": 2083702.0,
"step": 660
},
{
"entropy": 0.761424508690834,
"epoch": 0.7587768969422424,
"grad_norm": 3.523196220397949,
"learning_rate": 4.8354794177258575e-05,
"loss": 0.7393,
"mean_token_accuracy": 0.8060346513986587,
"num_tokens": 2117175.0,
"step": 670
},
{
"entropy": 0.864872795343399,
"epoch": 0.7701019252548131,
"grad_norm": 3.8052961826324463,
"learning_rate": 4.829712935213443e-05,
"loss": 0.8493,
"mean_token_accuracy": 0.7827917844057083,
"num_tokens": 2145885.0,
"step": 680
},
{
"entropy": 0.8192390084266663,
"epoch": 0.7814269535673839,
"grad_norm": 3.475280284881592,
"learning_rate": 4.823850687327124e-05,
"loss": 0.8014,
"mean_token_accuracy": 0.793234434723854,
"num_tokens": 2178940.0,
"step": 690
},
{
"entropy": 0.8399118930101395,
"epoch": 0.7927519818799547,
"grad_norm": 4.192668914794922,
"learning_rate": 4.817892915040948e-05,
"loss": 0.8282,
"mean_token_accuracy": 0.7871042519807816,
"num_tokens": 2208885.0,
"step": 700
},
{
"entropy": 0.7586694777011871,
"epoch": 0.8040770101925255,
"grad_norm": 3.1814303398132324,
"learning_rate": 4.811839863255602e-05,
"loss": 0.73,
"mean_token_accuracy": 0.8088628321886062,
"num_tokens": 2240222.0,
"step": 710
},
{
"entropy": 0.8313590168952942,
"epoch": 0.8154020385050963,
"grad_norm": 3.2918899059295654,
"learning_rate": 4.805691780788334e-05,
"loss": 0.7996,
"mean_token_accuracy": 0.795934408903122,
"num_tokens": 2270756.0,
"step": 720
},
{
"entropy": 0.8529336482286454,
"epoch": 0.8267270668176671,
"grad_norm": 3.0882325172424316,
"learning_rate": 4.799448920362734e-05,
"loss": 0.8301,
"mean_token_accuracy": 0.7841264367103576,
"num_tokens": 2300237.0,
"step": 730
},
{
"entropy": 0.8395129233598709,
"epoch": 0.8380520951302378,
"grad_norm": 2.9842989444732666,
"learning_rate": 4.793111538598345e-05,
"loss": 0.7959,
"mean_token_accuracy": 0.7929731100797653,
"num_tokens": 2330775.0,
"step": 740
},
{
"entropy": 0.8623131781816482,
"epoch": 0.8493771234428086,
"grad_norm": 4.138679504394531,
"learning_rate": 4.786679896000107e-05,
"loss": 0.8596,
"mean_token_accuracy": 0.7795057654380798,
"num_tokens": 2360008.0,
"step": 750
},
{
"entropy": 0.8062419712543487,
"epoch": 0.8607021517553793,
"grad_norm": 3.422121286392212,
"learning_rate": 4.7801542569476576e-05,
"loss": 0.7658,
"mean_token_accuracy": 0.8022167205810546,
"num_tokens": 2390046.0,
"step": 760
},
{
"entropy": 0.8502861768007278,
"epoch": 0.8720271800679502,
"grad_norm": 3.099006175994873,
"learning_rate": 4.773534889684458e-05,
"loss": 0.8283,
"mean_token_accuracy": 0.7852615088224411,
"num_tokens": 2421387.0,
"step": 770
},
{
"entropy": 0.782235860824585,
"epoch": 0.883352208380521,
"grad_norm": 3.4044156074523926,
"learning_rate": 4.7668220663067705e-05,
"loss": 0.7658,
"mean_token_accuracy": 0.8062999725341797,
"num_tokens": 2450799.0,
"step": 780
},
{
"entropy": 0.8354818016290665,
"epoch": 0.8946772366930917,
"grad_norm": 4.402913570404053,
"learning_rate": 4.7600160627524714e-05,
"loss": 0.7934,
"mean_token_accuracy": 0.796307036280632,
"num_tokens": 2483454.0,
"step": 790
},
{
"entropy": 0.8112906068563461,
"epoch": 0.9060022650056625,
"grad_norm": 4.405174255371094,
"learning_rate": 4.753117158789711e-05,
"loss": 0.7988,
"mean_token_accuracy": 0.7941813617944717,
"num_tokens": 2514692.0,
"step": 800
},
{
"entropy": 0.792055967450142,
"epoch": 0.9173272933182333,
"grad_norm": 3.911775827407837,
"learning_rate": 4.746125638005409e-05,
"loss": 0.7967,
"mean_token_accuracy": 0.7978548645973206,
"num_tokens": 2546588.0,
"step": 810
},
{
"entropy": 0.8198580473661423,
"epoch": 0.928652321630804,
"grad_norm": 2.8875856399536133,
"learning_rate": 4.7390417877936e-05,
"loss": 0.8022,
"mean_token_accuracy": 0.7922891557216645,
"num_tokens": 2579566.0,
"step": 820
},
{
"entropy": 0.8724555522203445,
"epoch": 0.9399773499433749,
"grad_norm": 4.037559986114502,
"learning_rate": 4.7318658993436226e-05,
"loss": 0.8366,
"mean_token_accuracy": 0.7826345145702363,
"num_tokens": 2610767.0,
"step": 830
},
{
"entropy": 0.7823762893676758,
"epoch": 0.9513023782559457,
"grad_norm": 3.327577590942383,
"learning_rate": 4.724598267628143e-05,
"loss": 0.7769,
"mean_token_accuracy": 0.7997746199369431,
"num_tokens": 2642792.0,
"step": 840
},
{
"entropy": 0.7697542518377304,
"epoch": 0.9626274065685164,
"grad_norm": 3.0890369415283203,
"learning_rate": 4.717239191391038e-05,
"loss": 0.741,
"mean_token_accuracy": 0.8050399273633957,
"num_tokens": 2676964.0,
"step": 850
},
{
"entropy": 0.763307249546051,
"epoch": 0.9739524348810872,
"grad_norm": 4.2226643562316895,
"learning_rate": 4.709788973135106e-05,
"loss": 0.7547,
"mean_token_accuracy": 0.8071206361055374,
"num_tokens": 2708988.0,
"step": 860
},
{
"entropy": 0.7996066987514496,
"epoch": 0.9852774631936579,
"grad_norm": 3.804412603378296,
"learning_rate": 4.70224791910964e-05,
"loss": 0.7901,
"mean_token_accuracy": 0.7994960457086563,
"num_tokens": 2738235.0,
"step": 870
},
{
"entropy": 0.815873098373413,
"epoch": 0.9966024915062288,
"grad_norm": 3.5629003047943115,
"learning_rate": 4.694616339297835e-05,
"loss": 0.785,
"mean_token_accuracy": 0.7961396902799607,
"num_tokens": 2769182.0,
"step": 880
},
{
"entropy": 0.7122308641672135,
"epoch": 1.0079275198187996,
"grad_norm": 3.0285568237304688,
"learning_rate": 4.686894547404045e-05,
"loss": 0.6199,
"mean_token_accuracy": 0.8262323826551438,
"num_tokens": 2799895.0,
"step": 890
},
{
"entropy": 0.5145712837576866,
"epoch": 1.0192525481313703,
"grad_norm": 3.19512939453125,
"learning_rate": 4.679082860840891e-05,
"loss": 0.5098,
"mean_token_accuracy": 0.851035150885582,
"num_tokens": 2830787.0,
"step": 900
},
{
"entropy": 0.5525638118386269,
"epoch": 1.030577576443941,
"grad_norm": 2.943143367767334,
"learning_rate": 4.6711816007162124e-05,
"loss": 0.53,
"mean_token_accuracy": 0.851621863245964,
"num_tokens": 2861630.0,
"step": 910
},
{
"entropy": 0.5268796980381012,
"epoch": 1.0419026047565119,
"grad_norm": 3.5483663082122803,
"learning_rate": 4.6631910918198654e-05,
"loss": 0.4995,
"mean_token_accuracy": 0.8567488580942154,
"num_tokens": 2895795.0,
"step": 920
},
{
"entropy": 0.5167795568704605,
"epoch": 1.0532276330690826,
"grad_norm": 3.683345317840576,
"learning_rate": 4.655111662610373e-05,
"loss": 0.4901,
"mean_token_accuracy": 0.8578720778226853,
"num_tokens": 2928474.0,
"step": 930
},
{
"entropy": 0.5409791812300682,
"epoch": 1.0645526613816534,
"grad_norm": 2.885199546813965,
"learning_rate": 4.646943645201426e-05,
"loss": 0.5262,
"mean_token_accuracy": 0.8517296850681305,
"num_tokens": 2959541.0,
"step": 940
},
{
"entropy": 0.57862998098135,
"epoch": 1.0758776896942241,
"grad_norm": 4.052005767822266,
"learning_rate": 4.638687375348227e-05,
"loss": 0.5375,
"mean_token_accuracy": 0.8474970668554306,
"num_tokens": 2989964.0,
"step": 950
},
{
"entropy": 0.5536176413297653,
"epoch": 1.087202718006795,
"grad_norm": 3.0179316997528076,
"learning_rate": 4.6303431924336934e-05,
"loss": 0.5525,
"mean_token_accuracy": 0.8437703341245651,
"num_tokens": 3021104.0,
"step": 960
},
{
"entropy": 0.5437994614243508,
"epoch": 1.098527746319366,
"grad_norm": 3.4563395977020264,
"learning_rate": 4.621911439454504e-05,
"loss": 0.5062,
"mean_token_accuracy": 0.8552743196487427,
"num_tokens": 3054000.0,
"step": 970
},
{
"entropy": 0.5517205700278283,
"epoch": 1.1098527746319367,
"grad_norm": 3.2511813640594482,
"learning_rate": 4.613392463006995e-05,
"loss": 0.5222,
"mean_token_accuracy": 0.8511393010616303,
"num_tokens": 3082595.0,
"step": 980
},
{
"entropy": 0.537626887857914,
"epoch": 1.1211778029445074,
"grad_norm": 2.921431303024292,
"learning_rate": 4.6047866132729253e-05,
"loss": 0.5062,
"mean_token_accuracy": 0.852451679110527,
"num_tokens": 3113095.0,
"step": 990
},
{
"entropy": 0.5493237912654877,
"epoch": 1.1325028312570782,
"grad_norm": 3.2696175575256348,
"learning_rate": 4.596094244005069e-05,
"loss": 0.5319,
"mean_token_accuracy": 0.852485916018486,
"num_tokens": 3143581.0,
"step": 1000
},
{
"epoch": 1.1325028312570782,
"eval_entropy": 0.575189925174215,
"eval_loss": 0.6139051914215088,
"eval_mean_token_accuracy": 0.8391392471185372,
"eval_num_tokens": 3143581.0,
"eval_runtime": 22.7589,
"eval_samples_per_second": 46.971,
"eval_steps_per_second": 5.888,
"step": 1000
},
{
"entropy": 0.5281848132610321,
"epoch": 1.143827859569649,
"grad_norm": 3.3432345390319824,
"learning_rate": 4.5873157125126806e-05,
"loss": 0.5143,
"mean_token_accuracy": 0.8539972066879272,
"num_tokens": 3176671.0,
"step": 1010
},
{
"entropy": 0.5357129707932472,
"epoch": 1.1551528878822197,
"grad_norm": 2.681213617324829,
"learning_rate": 4.578451379646808e-05,
"loss": 0.5176,
"mean_token_accuracy": 0.8533397912979126,
"num_tokens": 3207113.0,
"step": 1020
},
{
"entropy": 0.5573878586292267,
"epoch": 1.1664779161947905,
"grad_norm": 2.8207201957702637,
"learning_rate": 4.569501609785455e-05,
"loss": 0.5267,
"mean_token_accuracy": 0.8482785791158676,
"num_tokens": 3242168.0,
"step": 1030
},
{
"entropy": 0.5452752098441124,
"epoch": 1.1778029445073612,
"grad_norm": 3.3376107215881348,
"learning_rate": 4.560466770818608e-05,
"loss": 0.5172,
"mean_token_accuracy": 0.8519764631986618,
"num_tokens": 3273260.0,
"step": 1040
},
{
"entropy": 0.5476395100355148,
"epoch": 1.189127972819932,
"grad_norm": 3.6320974826812744,
"learning_rate": 4.5513472341331076e-05,
"loss": 0.5439,
"mean_token_accuracy": 0.8460662096738816,
"num_tokens": 3302219.0,
"step": 1050
},
{
"entropy": 0.5614733591675758,
"epoch": 1.2004530011325028,
"grad_norm": 2.790870428085327,
"learning_rate": 4.542143374597391e-05,
"loss": 0.5218,
"mean_token_accuracy": 0.8489633470773696,
"num_tokens": 3336388.0,
"step": 1060
},
{
"entropy": 0.5463294044137001,
"epoch": 1.2117780294450737,
"grad_norm": 3.163102388381958,
"learning_rate": 4.5328555705460716e-05,
"loss": 0.5324,
"mean_token_accuracy": 0.847563111782074,
"num_tokens": 3368580.0,
"step": 1070
},
{
"entropy": 0.5477871373295784,
"epoch": 1.2231030577576445,
"grad_norm": 2.990779161453247,
"learning_rate": 4.5234842037643985e-05,
"loss": 0.5314,
"mean_token_accuracy": 0.8470851451158523,
"num_tokens": 3399370.0,
"step": 1080
},
{
"entropy": 0.6027134999632835,
"epoch": 1.2344280860702153,
"grad_norm": 3.5813815593719482,
"learning_rate": 4.5140296594725544e-05,
"loss": 0.5768,
"mean_token_accuracy": 0.8372454822063446,
"num_tokens": 3433838.0,
"step": 1090
},
{
"entropy": 0.5187774360179901,
"epoch": 1.245753114382786,
"grad_norm": 3.008913993835449,
"learning_rate": 4.504492326309824e-05,
"loss": 0.5077,
"mean_token_accuracy": 0.8522711008787155,
"num_tokens": 3465362.0,
"step": 1100
},
{
"entropy": 0.5649975180625916,
"epoch": 1.2570781426953568,
"grad_norm": 3.3389503955841064,
"learning_rate": 4.494872596318618e-05,
"loss": 0.5394,
"mean_token_accuracy": 0.8493940323591233,
"num_tokens": 3497415.0,
"step": 1110
},
{
"entropy": 0.6041379794478416,
"epoch": 1.2684031710079275,
"grad_norm": 4.550158977508545,
"learning_rate": 4.48517086492836e-05,
"loss": 0.5791,
"mean_token_accuracy": 0.8398642212152481,
"num_tokens": 3528071.0,
"step": 1120
},
{
"entropy": 0.5308591544628143,
"epoch": 1.2797281993204983,
"grad_norm": 3.043844223022461,
"learning_rate": 4.4753875309392266e-05,
"loss": 0.5014,
"mean_token_accuracy": 0.8557395815849305,
"num_tokens": 3562511.0,
"step": 1130
},
{
"entropy": 0.5395269095897675,
"epoch": 1.291053227633069,
"grad_norm": 3.781155586242676,
"learning_rate": 4.46552299650576e-05,
"loss": 0.5364,
"mean_token_accuracy": 0.8473162204027176,
"num_tokens": 3592113.0,
"step": 1140
},
{
"entropy": 0.5485832586884498,
"epoch": 1.3023782559456398,
"grad_norm": 3.388087511062622,
"learning_rate": 4.455577667120333e-05,
"loss": 0.5232,
"mean_token_accuracy": 0.8495394617319107,
"num_tokens": 3623419.0,
"step": 1150
},
{
"entropy": 0.5457410931587219,
"epoch": 1.3137032842582106,
"grad_norm": 2.7439608573913574,
"learning_rate": 4.445551951596486e-05,
"loss": 0.5239,
"mean_token_accuracy": 0.8516563266515732,
"num_tokens": 3655712.0,
"step": 1160
},
{
"entropy": 0.5817072510719299,
"epoch": 1.3250283125707814,
"grad_norm": 3.6495678424835205,
"learning_rate": 4.4354462620521144e-05,
"loss": 0.552,
"mean_token_accuracy": 0.8431410670280457,
"num_tokens": 3687624.0,
"step": 1170
},
{
"entropy": 0.5500650435686112,
"epoch": 1.3363533408833521,
"grad_norm": 2.702406406402588,
"learning_rate": 4.425261013892534e-05,
"loss": 0.5376,
"mean_token_accuracy": 0.8446941554546357,
"num_tokens": 3718974.0,
"step": 1180
},
{
"entropy": 0.5515587076544761,
"epoch": 1.3476783691959229,
"grad_norm": 3.3534793853759766,
"learning_rate": 4.414996625793404e-05,
"loss": 0.5358,
"mean_token_accuracy": 0.8478793174028396,
"num_tokens": 3749961.0,
"step": 1190
},
{
"entropy": 0.6033241957426071,
"epoch": 1.3590033975084936,
"grad_norm": 4.399415016174316,
"learning_rate": 4.404653519683517e-05,
"loss": 0.568,
"mean_token_accuracy": 0.838269567489624,
"num_tokens": 3779674.0,
"step": 1200
},
{
"entropy": 0.5632871612906456,
"epoch": 1.3703284258210646,
"grad_norm": 3.128950834274292,
"learning_rate": 4.394232120727453e-05,
"loss": 0.5478,
"mean_token_accuracy": 0.8447149246931076,
"num_tokens": 3811144.0,
"step": 1210
},
{
"entropy": 0.5675986737012864,
"epoch": 1.3816534541336354,
"grad_norm": 3.912665843963623,
"learning_rate": 4.3837328573081057e-05,
"loss": 0.551,
"mean_token_accuracy": 0.8420376777648926,
"num_tokens": 3841697.0,
"step": 1220
},
{
"entropy": 0.5417166292667389,
"epoch": 1.3929784824462061,
"grad_norm": 3.0909736156463623,
"learning_rate": 4.373156161009071e-05,
"loss": 0.5136,
"mean_token_accuracy": 0.8509470343589782,
"num_tokens": 3875026.0,
"step": 1230
},
{
"entropy": 0.6006052121520042,
"epoch": 1.404303510758777,
"grad_norm": 3.4114279747009277,
"learning_rate": 4.3625024665969086e-05,
"loss": 0.5766,
"mean_token_accuracy": 0.8366666853427887,
"num_tokens": 3905166.0,
"step": 1240
},
{
"entropy": 0.5631272241473197,
"epoch": 1.4156285390713477,
"grad_norm": 3.365401029586792,
"learning_rate": 4.351772212003268e-05,
"loss": 0.5295,
"mean_token_accuracy": 0.8479926735162735,
"num_tokens": 3936113.0,
"step": 1250
},
{
"entropy": 0.560055273771286,
"epoch": 1.4269535673839184,
"grad_norm": 3.087405204772949,
"learning_rate": 4.3409658383068865e-05,
"loss": 0.5347,
"mean_token_accuracy": 0.8461541920900345,
"num_tokens": 3966883.0,
"step": 1260
},
{
"entropy": 0.5575806692242622,
"epoch": 1.4382785956964892,
"grad_norm": 3.507297992706299,
"learning_rate": 4.3300837897154636e-05,
"loss": 0.5417,
"mean_token_accuracy": 0.8473523437976838,
"num_tokens": 3997355.0,
"step": 1270
},
{
"entropy": 0.6279172241687775,
"epoch": 1.44960362400906,
"grad_norm": 4.649575710296631,
"learning_rate": 4.3191265135473926e-05,
"loss": 0.5937,
"mean_token_accuracy": 0.8347241580486298,
"num_tokens": 4026260.0,
"step": 1280
},
{
"entropy": 0.5787097245454789,
"epoch": 1.460928652321631,
"grad_norm": 3.5137453079223633,
"learning_rate": 4.3080944602133804e-05,
"loss": 0.5253,
"mean_token_accuracy": 0.8458025574684143,
"num_tokens": 4056398.0,
"step": 1290
},
{
"entropy": 0.5460534125566483,
"epoch": 1.4722536806342017,
"grad_norm": 3.8779032230377197,
"learning_rate": 4.296988083197932e-05,
"loss": 0.553,
"mean_token_accuracy": 0.8447301775217056,
"num_tokens": 4087702.0,
"step": 1300
},
{
"entropy": 0.6065227225422859,
"epoch": 1.4835787089467725,
"grad_norm": 3.0295608043670654,
"learning_rate": 4.285807839040702e-05,
"loss": 0.5682,
"mean_token_accuracy": 0.8381103157997132,
"num_tokens": 4119831.0,
"step": 1310
},
{
"entropy": 0.5128992781043052,
"epoch": 1.4949037372593432,
"grad_norm": 2.5592479705810547,
"learning_rate": 4.274554187317742e-05,
"loss": 0.4974,
"mean_token_accuracy": 0.8573563575744629,
"num_tokens": 4153546.0,
"step": 1320
},
{
"entropy": 0.5610502302646637,
"epoch": 1.506228765571914,
"grad_norm": 2.9809346199035645,
"learning_rate": 4.263227590622594e-05,
"loss": 0.5337,
"mean_token_accuracy": 0.8465662121772766,
"num_tokens": 4184159.0,
"step": 1330
},
{
"entropy": 0.5405657783150672,
"epoch": 1.5175537938844847,
"grad_norm": 3.1333119869232178,
"learning_rate": 4.251828514547285e-05,
"loss": 0.5222,
"mean_token_accuracy": 0.851176279783249,
"num_tokens": 4216675.0,
"step": 1340
},
{
"entropy": 0.566189019382,
"epoch": 1.5288788221970555,
"grad_norm": 3.00138783454895,
"learning_rate": 4.2403574276631864e-05,
"loss": 0.5444,
"mean_token_accuracy": 0.8445883154869079,
"num_tokens": 4248386.0,
"step": 1350
},
{
"entropy": 0.527710796892643,
"epoch": 1.5402038505096263,
"grad_norm": 3.269314765930176,
"learning_rate": 4.2288148015017505e-05,
"loss": 0.5053,
"mean_token_accuracy": 0.8536709517240524,
"num_tokens": 4282402.0,
"step": 1360
},
{
"entropy": 0.5625803858041764,
"epoch": 1.551528878822197,
"grad_norm": 3.079254388809204,
"learning_rate": 4.2172011105351294e-05,
"loss": 0.5384,
"mean_token_accuracy": 0.8477725654840469,
"num_tokens": 4313169.0,
"step": 1370
},
{
"entropy": 0.5694461062550544,
"epoch": 1.5628539071347678,
"grad_norm": 3.1012649536132812,
"learning_rate": 4.205516832156671e-05,
"loss": 0.5568,
"mean_token_accuracy": 0.8427411139011383,
"num_tokens": 4344614.0,
"step": 1380
},
{
"entropy": 0.5670416623353958,
"epoch": 1.5741789354473386,
"grad_norm": 3.6018662452697754,
"learning_rate": 4.193762446661294e-05,
"loss": 0.5314,
"mean_token_accuracy": 0.8488269418478012,
"num_tokens": 4375036.0,
"step": 1390
},
{
"entropy": 0.5953057199716568,
"epoch": 1.5855039637599093,
"grad_norm": 3.029942274093628,
"learning_rate": 4.18193843722575e-05,
"loss": 0.5799,
"mean_token_accuracy": 0.837462404370308,
"num_tokens": 4404701.0,
"step": 1400
},
{
"entropy": 0.5928519412875175,
"epoch": 1.59682899207248,
"grad_norm": 3.283315896987915,
"learning_rate": 4.170045289888753e-05,
"loss": 0.5624,
"mean_token_accuracy": 0.8420351535081864,
"num_tokens": 4435592.0,
"step": 1410
},
{
"entropy": 0.5582237809896469,
"epoch": 1.6081540203850508,
"grad_norm": 3.4651055335998535,
"learning_rate": 4.15808349353101e-05,
"loss": 0.5491,
"mean_token_accuracy": 0.8469580680131912,
"num_tokens": 4466796.0,
"step": 1420
},
{
"entropy": 0.5469602465629577,
"epoch": 1.6194790486976216,
"grad_norm": 3.3069710731506348,
"learning_rate": 4.1460535398551156e-05,
"loss": 0.5196,
"mean_token_accuracy": 0.8506799221038819,
"num_tokens": 4500213.0,
"step": 1430
},
{
"entropy": 0.5610076829791069,
"epoch": 1.6308040770101924,
"grad_norm": 2.9331612586975098,
"learning_rate": 4.1339559233653484e-05,
"loss": 0.5464,
"mean_token_accuracy": 0.8456663846969604,
"num_tokens": 4533383.0,
"step": 1440
},
{
"entropy": 0.5389099940657616,
"epoch": 1.6421291053227633,
"grad_norm": 3.7289347648620605,
"learning_rate": 4.12179114134734e-05,
"loss": 0.5173,
"mean_token_accuracy": 0.855185616016388,
"num_tokens": 4566354.0,
"step": 1450
},
{
"entropy": 0.5230394601821899,
"epoch": 1.6534541336353341,
"grad_norm": 3.2918195724487305,
"learning_rate": 4.1095596938476316e-05,
"loss": 0.5053,
"mean_token_accuracy": 0.8556439846754074,
"num_tokens": 4598001.0,
"step": 1460
},
{
"entropy": 0.5553015500307084,
"epoch": 1.6647791619479049,
"grad_norm": 3.6326143741607666,
"learning_rate": 4.097262083653123e-05,
"loss": 0.5369,
"mean_token_accuracy": 0.8448810696601867,
"num_tokens": 4630452.0,
"step": 1470
},
{
"entropy": 0.6212641701102257,
"epoch": 1.6761041902604756,
"grad_norm": 3.491703748703003,
"learning_rate": 4.084898816270405e-05,
"loss": 0.5945,
"mean_token_accuracy": 0.8339376747608185,
"num_tokens": 4660517.0,
"step": 1480
},
{
"entropy": 0.5631924226880074,
"epoch": 1.6874292185730464,
"grad_norm": 2.8656742572784424,
"learning_rate": 4.072470399904972e-05,
"loss": 0.5362,
"mean_token_accuracy": 0.8484214007854461,
"num_tokens": 4693572.0,
"step": 1490
},
{
"entropy": 0.5927946478128433,
"epoch": 1.6987542468856174,
"grad_norm": 3.262650489807129,
"learning_rate": 4.059977345440345e-05,
"loss": 0.5631,
"mean_token_accuracy": 0.8415592044591904,
"num_tokens": 4725726.0,
"step": 1500
},
{
"epoch": 1.6987542468856174,
"eval_entropy": 0.5341141203890986,
"eval_loss": 0.5169265270233154,
"eval_mean_token_accuracy": 0.8661515663808851,
"eval_num_tokens": 4725726.0,
"eval_runtime": 22.9682,
"eval_samples_per_second": 46.543,
"eval_steps_per_second": 5.834,
"step": 1500
},
{
"entropy": 0.5506062388420105,
"epoch": 1.7100792751981881,
"grad_norm": 3.5445897579193115,
"learning_rate": 4.0474201664170605e-05,
"loss": 0.5315,
"mean_token_accuracy": 0.8496404081583023,
"num_tokens": 4755710.0,
"step": 1510
},
{
"entropy": 0.5794235303997993,
"epoch": 1.721404303510759,
"grad_norm": 3.481981039047241,
"learning_rate": 4.034799379011565e-05,
"loss": 0.5681,
"mean_token_accuracy": 0.8396411448717117,
"num_tokens": 4788007.0,
"step": 1520
},
{
"entropy": 0.5977949738502503,
"epoch": 1.7327293318233297,
"grad_norm": 3.0650320053100586,
"learning_rate": 4.0221155020149956e-05,
"loss": 0.5602,
"mean_token_accuracy": 0.838371816277504,
"num_tokens": 4823110.0,
"step": 1530
},
{
"entropy": 0.5333872765302659,
"epoch": 1.7440543601359004,
"grad_norm": 3.4268224239349365,
"learning_rate": 4.009369056811857e-05,
"loss": 0.5284,
"mean_token_accuracy": 0.8493491917848587,
"num_tokens": 4854122.0,
"step": 1540
},
{
"entropy": 0.5541011542081833,
"epoch": 1.7553793884484712,
"grad_norm": 2.7358953952789307,
"learning_rate": 3.9965605673585885e-05,
"loss": 0.5322,
"mean_token_accuracy": 0.8492937862873078,
"num_tokens": 4885040.0,
"step": 1550
},
{
"entropy": 0.5873281508684158,
"epoch": 1.766704416761042,
"grad_norm": 3.24727463722229,
"learning_rate": 3.983690560162021e-05,
"loss": 0.5406,
"mean_token_accuracy": 0.8487812489271164,
"num_tokens": 4913565.0,
"step": 1560
},
{
"entropy": 0.5366435587406159,
"epoch": 1.7780294450736127,
"grad_norm": 2.3538382053375244,
"learning_rate": 3.970759564257744e-05,
"loss": 0.5122,
"mean_token_accuracy": 0.852169793844223,
"num_tokens": 4945547.0,
"step": 1570
},
{
"entropy": 0.5801732629537583,
"epoch": 1.7893544733861835,
"grad_norm": 3.008432388305664,
"learning_rate": 3.9577681111883525e-05,
"loss": 0.5498,
"mean_token_accuracy": 0.8420356780290603,
"num_tokens": 4975227.0,
"step": 1580
},
{
"entropy": 0.5545943319797516,
"epoch": 1.8006795016987542,
"grad_norm": 3.050703525543213,
"learning_rate": 3.9447167349815975e-05,
"loss": 0.5202,
"mean_token_accuracy": 0.8499834507703781,
"num_tokens": 5006740.0,
"step": 1590
},
{
"entropy": 0.614229929447174,
"epoch": 1.812004530011325,
"grad_norm": 3.1485025882720947,
"learning_rate": 3.931605972128434e-05,
"loss": 0.5971,
"mean_token_accuracy": 0.8335285931825638,
"num_tokens": 5038134.0,
"step": 1600
},
{
"entropy": 0.5501451089978218,
"epoch": 1.8233295583238958,
"grad_norm": 3.1998586654663086,
"learning_rate": 3.918436361560975e-05,
"loss": 0.5239,
"mean_token_accuracy": 0.8538520008325576,
"num_tokens": 5069253.0,
"step": 1610
},
{
"entropy": 0.5649801835417747,
"epoch": 1.8346545866364665,
"grad_norm": 3.609856128692627,
"learning_rate": 3.905208444630327e-05,
"loss": 0.544,
"mean_token_accuracy": 0.8445982128381729,
"num_tokens": 5099796.0,
"step": 1620
},
{
"entropy": 0.6116504520177841,
"epoch": 1.8459796149490373,
"grad_norm": 2.742194175720215,
"learning_rate": 3.8919227650843446e-05,
"loss": 0.5889,
"mean_token_accuracy": 0.8342977851629257,
"num_tokens": 5132503.0,
"step": 1630
},
{
"entropy": 0.5255660384893417,
"epoch": 1.857304643261608,
"grad_norm": 2.977027654647827,
"learning_rate": 3.878579869045279e-05,
"loss": 0.4961,
"mean_token_accuracy": 0.8565999537706375,
"num_tokens": 5162678.0,
"step": 1640
},
{
"entropy": 0.5009402737021447,
"epoch": 1.8686296715741788,
"grad_norm": 2.497317314147949,
"learning_rate": 3.865180304987325e-05,
"loss": 0.4848,
"mean_token_accuracy": 0.8591432988643646,
"num_tokens": 5196737.0,
"step": 1650
},
{
"entropy": 0.5639937236905098,
"epoch": 1.8799546998867496,
"grad_norm": 3.1093459129333496,
"learning_rate": 3.851724623714078e-05,
"loss": 0.5568,
"mean_token_accuracy": 0.845026895403862,
"num_tokens": 5227400.0,
"step": 1660
},
{
"entropy": 0.5589576527476311,
"epoch": 1.8912797281993206,
"grad_norm": 3.0768072605133057,
"learning_rate": 3.838213378335893e-05,
"loss": 0.5291,
"mean_token_accuracy": 0.8501736044883728,
"num_tokens": 5259004.0,
"step": 1670
},
{
"entropy": 0.5750891670584679,
"epoch": 1.9026047565118913,
"grad_norm": 2.6023709774017334,
"learning_rate": 3.824647124247149e-05,
"loss": 0.5517,
"mean_token_accuracy": 0.8456086486577987,
"num_tokens": 5290222.0,
"step": 1680
},
{
"entropy": 0.5511097982525826,
"epoch": 1.913929784824462,
"grad_norm": 2.9224021434783936,
"learning_rate": 3.811026419103415e-05,
"loss": 0.5325,
"mean_token_accuracy": 0.8483414232730866,
"num_tokens": 5322612.0,
"step": 1690
},
{
"entropy": 0.5870863348245621,
"epoch": 1.9252548131370328,
"grad_norm": 3.0950560569763184,
"learning_rate": 3.79735182279853e-05,
"loss": 0.5522,
"mean_token_accuracy": 0.8423091471195221,
"num_tokens": 5352812.0,
"step": 1700
},
{
"entropy": 0.5567023307085037,
"epoch": 1.9365798414496036,
"grad_norm": 2.833249092102051,
"learning_rate": 3.78362389744159e-05,
"loss": 0.5457,
"mean_token_accuracy": 0.8451368689537049,
"num_tokens": 5383796.0,
"step": 1710
},
{
"entropy": 0.55447788387537,
"epoch": 1.9479048697621744,
"grad_norm": 3.5825366973876953,
"learning_rate": 3.769843207333837e-05,
"loss": 0.5214,
"mean_token_accuracy": 0.8480161488056183,
"num_tokens": 5416796.0,
"step": 1720
},
{
"entropy": 0.5370402306318283,
"epoch": 1.9592298980747453,
"grad_norm": 2.9841582775115967,
"learning_rate": 3.7560103189454654e-05,
"loss": 0.5232,
"mean_token_accuracy": 0.8511360079050064,
"num_tokens": 5447812.0,
"step": 1730
},
{
"entropy": 0.5704680263996125,
"epoch": 1.970554926387316,
"grad_norm": 3.3857421875,
"learning_rate": 3.742125800892339e-05,
"loss": 0.5528,
"mean_token_accuracy": 0.8471181511878967,
"num_tokens": 5477860.0,
"step": 1740
},
{
"entropy": 0.5869046375155449,
"epoch": 1.9818799546998869,
"grad_norm": 3.149844169616699,
"learning_rate": 3.7281902239126114e-05,
"loss": 0.5758,
"mean_token_accuracy": 0.8392861634492874,
"num_tokens": 5508399.0,
"step": 1750
},
{
"entropy": 0.5820975095033646,
"epoch": 1.9932049830124576,
"grad_norm": 3.422454833984375,
"learning_rate": 3.714204160843271e-05,
"loss": 0.5391,
"mean_token_accuracy": 0.8477762788534164,
"num_tokens": 5538408.0,
"step": 1760
},
{
"entropy": 0.5504972025752067,
"epoch": 2.0045300113250284,
"grad_norm": 2.6110424995422363,
"learning_rate": 3.700168186596591e-05,
"loss": 0.4606,
"mean_token_accuracy": 0.864372169971466,
"num_tokens": 5568168.0,
"step": 1770
},
{
"entropy": 0.33086139261722564,
"epoch": 2.015855039637599,
"grad_norm": 3.0384321212768555,
"learning_rate": 3.686082878136497e-05,
"loss": 0.2968,
"mean_token_accuracy": 0.9110626548528671,
"num_tokens": 5600301.0,
"step": 1780
},
{
"entropy": 0.3262224726378918,
"epoch": 2.02718006795017,
"grad_norm": 3.177510976791382,
"learning_rate": 3.671948814454852e-05,
"loss": 0.3034,
"mean_token_accuracy": 0.9086451709270478,
"num_tokens": 5631080.0,
"step": 1790
},
{
"entropy": 0.33181734979152677,
"epoch": 2.0385050962627407,
"grad_norm": 2.8883979320526123,
"learning_rate": 3.6577665765476534e-05,
"loss": 0.2977,
"mean_token_accuracy": 0.9095875382423401,
"num_tokens": 5663677.0,
"step": 1800
},
{
"entropy": 0.34994775205850603,
"epoch": 2.0498301245753114,
"grad_norm": 2.915252208709717,
"learning_rate": 3.643536747391155e-05,
"loss": 0.3191,
"mean_token_accuracy": 0.9045411765575408,
"num_tokens": 5692514.0,
"step": 1810
},
{
"entropy": 0.33512948006391524,
"epoch": 2.061155152887882,
"grad_norm": 3.1287646293640137,
"learning_rate": 3.629259911917898e-05,
"loss": 0.3079,
"mean_token_accuracy": 0.9076024353504181,
"num_tokens": 5723520.0,
"step": 1820
},
{
"entropy": 0.3023697704076767,
"epoch": 2.072480181200453,
"grad_norm": 2.952409029006958,
"learning_rate": 3.614936656992671e-05,
"loss": 0.2774,
"mean_token_accuracy": 0.9159202218055725,
"num_tokens": 5756911.0,
"step": 1830
},
{
"entropy": 0.3192523382604122,
"epoch": 2.0838052095130237,
"grad_norm": 2.7479248046875,
"learning_rate": 3.600567571388384e-05,
"loss": 0.2952,
"mean_token_accuracy": 0.9110053300857544,
"num_tokens": 5789223.0,
"step": 1840
},
{
"entropy": 0.3362482935190201,
"epoch": 2.0951302378255945,
"grad_norm": 2.9288883209228516,
"learning_rate": 3.5861532457618647e-05,
"loss": 0.3001,
"mean_token_accuracy": 0.910579189658165,
"num_tokens": 5820267.0,
"step": 1850
},
{
"entropy": 0.3218920275568962,
"epoch": 2.1064552661381652,
"grad_norm": 2.9838762283325195,
"learning_rate": 3.571694272629583e-05,
"loss": 0.3028,
"mean_token_accuracy": 0.9099289119243622,
"num_tokens": 5851637.0,
"step": 1860
},
{
"entropy": 0.33703851103782656,
"epoch": 2.117780294450736,
"grad_norm": 2.6930618286132812,
"learning_rate": 3.557191246343294e-05,
"loss": 0.3005,
"mean_token_accuracy": 0.9076809674501419,
"num_tokens": 5884035.0,
"step": 1870
},
{
"entropy": 0.33338933885097505,
"epoch": 2.1291053227633068,
"grad_norm": 2.9287595748901367,
"learning_rate": 3.5426447630656015e-05,
"loss": 0.2996,
"mean_token_accuracy": 0.9076695948839187,
"num_tokens": 5915289.0,
"step": 1880
},
{
"entropy": 0.3408766373991966,
"epoch": 2.1404303510758775,
"grad_norm": 3.0822606086730957,
"learning_rate": 3.528055420745461e-05,
"loss": 0.302,
"mean_token_accuracy": 0.9080609738826751,
"num_tokens": 5945986.0,
"step": 1890
},
{
"entropy": 0.33647180199623106,
"epoch": 2.1517553793884483,
"grad_norm": 2.4581234455108643,
"learning_rate": 3.51342381909359e-05,
"loss": 0.3117,
"mean_token_accuracy": 0.9036801576614379,
"num_tokens": 5979688.0,
"step": 1900
},
{
"entropy": 0.3434984043240547,
"epoch": 2.163080407701019,
"grad_norm": 3.238070487976074,
"learning_rate": 3.498750559557827e-05,
"loss": 0.3049,
"mean_token_accuracy": 0.908153846859932,
"num_tokens": 6010581.0,
"step": 1910
},
{
"entropy": 0.3294851824641228,
"epoch": 2.17440543601359,
"grad_norm": 3.0652968883514404,
"learning_rate": 3.484036245298398e-05,
"loss": 0.3024,
"mean_token_accuracy": 0.9057962894439697,
"num_tokens": 6042553.0,
"step": 1920
},
{
"entropy": 0.3511190369725227,
"epoch": 2.185730464326161,
"grad_norm": 2.9219870567321777,
"learning_rate": 3.469281481163131e-05,
"loss": 0.3258,
"mean_token_accuracy": 0.9027796924114228,
"num_tokens": 6073935.0,
"step": 1930
},
{
"entropy": 0.32454402297735213,
"epoch": 2.197055492638732,
"grad_norm": 2.8523194789886475,
"learning_rate": 3.45448687366259e-05,
"loss": 0.2839,
"mean_token_accuracy": 0.9147208243608475,
"num_tokens": 6106596.0,
"step": 1940
},
{
"entropy": 0.3268588811159134,
"epoch": 2.2083805209513026,
"grad_norm": 3.46616530418396,
"learning_rate": 3.4396530309451416e-05,
"loss": 0.3023,
"mean_token_accuracy": 0.9075274288654327,
"num_tokens": 6137408.0,
"step": 1950
},
{
"entropy": 0.34170345067977903,
"epoch": 2.2197055492638733,
"grad_norm": 2.899691104888916,
"learning_rate": 3.4247805627719605e-05,
"loss": 0.3087,
"mean_token_accuracy": 0.9050837367773056,
"num_tokens": 6169270.0,
"step": 1960
},
{
"entropy": 0.33029789179563523,
"epoch": 2.231030577576444,
"grad_norm": 3.0167901515960693,
"learning_rate": 3.4098700804919614e-05,
"loss": 0.2938,
"mean_token_accuracy": 0.9097273588180542,
"num_tokens": 6200234.0,
"step": 1970
},
{
"entropy": 0.3392042815685272,
"epoch": 2.242355605889015,
"grad_norm": 3.2007789611816406,
"learning_rate": 3.394922197016671e-05,
"loss": 0.308,
"mean_token_accuracy": 0.9080634534358978,
"num_tokens": 6229177.0,
"step": 1980
},
{
"entropy": 0.3530780151486397,
"epoch": 2.2536806342015856,
"grad_norm": 2.546180009841919,
"learning_rate": 3.379937526795031e-05,
"loss": 0.3062,
"mean_token_accuracy": 0.9073823064565658,
"num_tokens": 6257509.0,
"step": 1990
},
{
"entropy": 0.31698922738432883,
"epoch": 2.2650056625141564,
"grad_norm": 2.573667049407959,
"learning_rate": 3.3649166857881444e-05,
"loss": 0.2812,
"mean_token_accuracy": 0.9117646932601928,
"num_tokens": 6292493.0,
"step": 2000
},
{
"epoch": 2.2650056625141564,
"eval_entropy": 0.3538621710752373,
"eval_loss": 0.461500346660614,
"eval_mean_token_accuracy": 0.8896430148117578,
"eval_num_tokens": 6292493.0,
"eval_runtime": 22.9271,
"eval_samples_per_second": 46.626,
"eval_steps_per_second": 5.845,
"step": 2000
},
{
"entropy": 0.3301647327840328,
"epoch": 2.276330690826727,
"grad_norm": 3.6626837253570557,
"learning_rate": 3.349860291443952e-05,
"loss": 0.3122,
"mean_token_accuracy": 0.9060505300760269,
"num_tokens": 6326995.0,
"step": 2010
},
{
"entropy": 0.3390480265021324,
"epoch": 2.287655719139298,
"grad_norm": 3.048062801361084,
"learning_rate": 3.3347689626718535e-05,
"loss": 0.301,
"mean_token_accuracy": 0.9094172894954682,
"num_tokens": 6358142.0,
"step": 2020
},
{
"entropy": 0.33652088344097136,
"epoch": 2.2989807474518686,
"grad_norm": 2.5861592292785645,
"learning_rate": 3.319643319817266e-05,
"loss": 0.3007,
"mean_token_accuracy": 0.9065529972314834,
"num_tokens": 6391665.0,
"step": 2030
},
{
"entropy": 0.34212576597929,
"epoch": 2.3103057757644394,
"grad_norm": 3.1473755836486816,
"learning_rate": 3.304483984636124e-05,
"loss": 0.3088,
"mean_token_accuracy": 0.9068757712841033,
"num_tokens": 6420978.0,
"step": 2040
},
{
"entropy": 0.3290658682584763,
"epoch": 2.32163080407701,
"grad_norm": 2.8407304286956787,
"learning_rate": 3.289291580269322e-05,
"loss": 0.2956,
"mean_token_accuracy": 0.9091124355792999,
"num_tokens": 6452000.0,
"step": 2050
},
{
"entropy": 0.32331828474998475,
"epoch": 2.332955832389581,
"grad_norm": 2.915709972381592,
"learning_rate": 3.2740667312170984e-05,
"loss": 0.3011,
"mean_token_accuracy": 0.9096018075942993,
"num_tokens": 6482800.0,
"step": 2060
},
{
"entropy": 0.35195091664791106,
"epoch": 2.3442808607021517,
"grad_norm": 2.9953255653381348,
"learning_rate": 3.258810063313367e-05,
"loss": 0.3126,
"mean_token_accuracy": 0.9040073782205582,
"num_tokens": 6516807.0,
"step": 2070
},
{
"entropy": 0.34647743552923205,
"epoch": 2.3556058890147225,
"grad_norm": 2.6879382133483887,
"learning_rate": 3.243522203699988e-05,
"loss": 0.3188,
"mean_token_accuracy": 0.9036970168352128,
"num_tokens": 6548271.0,
"step": 2080
},
{
"entropy": 0.32736130654811857,
"epoch": 2.366930917327293,
"grad_norm": 3.3962409496307373,
"learning_rate": 3.228203780800993e-05,
"loss": 0.2937,
"mean_token_accuracy": 0.91224045753479,
"num_tokens": 6577404.0,
"step": 2090
},
{
"entropy": 0.34083098769187925,
"epoch": 2.378255945639864,
"grad_norm": 2.9646759033203125,
"learning_rate": 3.212855424296748e-05,
"loss": 0.3093,
"mean_token_accuracy": 0.9033702582120895,
"num_tokens": 6610996.0,
"step": 2100
},
{
"entropy": 0.33483326584100725,
"epoch": 2.3895809739524347,
"grad_norm": 3.603712558746338,
"learning_rate": 3.1974777650980735e-05,
"loss": 0.3052,
"mean_token_accuracy": 0.9083625108003617,
"num_tokens": 6641408.0,
"step": 2110
},
{
"entropy": 0.36678697168827057,
"epoch": 2.4009060022650055,
"grad_norm": 5.152080535888672,
"learning_rate": 3.182071435320309e-05,
"loss": 0.3386,
"mean_token_accuracy": 0.8961910039186478,
"num_tokens": 6671255.0,
"step": 2120
},
{
"entropy": 0.3381674662232399,
"epoch": 2.4122310305775763,
"grad_norm": 2.51043438911438,
"learning_rate": 3.1666370682573305e-05,
"loss": 0.3018,
"mean_token_accuracy": 0.9092491447925568,
"num_tokens": 6701042.0,
"step": 2130
},
{
"entropy": 0.340181964635849,
"epoch": 2.4235560588901475,
"grad_norm": 2.988914966583252,
"learning_rate": 3.151175298355515e-05,
"loss": 0.3186,
"mean_token_accuracy": 0.9035066097974778,
"num_tokens": 6731513.0,
"step": 2140
},
{
"entropy": 0.3381286084651947,
"epoch": 2.434881087202718,
"grad_norm": 3.1499576568603516,
"learning_rate": 3.1356867611876626e-05,
"loss": 0.3063,
"mean_token_accuracy": 0.906991371512413,
"num_tokens": 6763329.0,
"step": 2150
},
{
"entropy": 0.34378741681575775,
"epoch": 2.446206115515289,
"grad_norm": 3.016859292984009,
"learning_rate": 3.120172093426873e-05,
"loss": 0.3184,
"mean_token_accuracy": 0.9061174720525742,
"num_tokens": 6794777.0,
"step": 2160
},
{
"entropy": 0.336453452706337,
"epoch": 2.4575311438278598,
"grad_norm": 3.0973525047302246,
"learning_rate": 3.10463193282037e-05,
"loss": 0.304,
"mean_token_accuracy": 0.9055917888879776,
"num_tokens": 6826289.0,
"step": 2170
},
{
"entropy": 0.35441608279943465,
"epoch": 2.4688561721404305,
"grad_norm": 3.2018167972564697,
"learning_rate": 3.08906691816329e-05,
"loss": 0.3213,
"mean_token_accuracy": 0.9018235206604004,
"num_tokens": 6856735.0,
"step": 2180
},
{
"entropy": 0.3426896780729294,
"epoch": 2.4801812004530013,
"grad_norm": 3.111391067504883,
"learning_rate": 3.073477689272422e-05,
"loss": 0.3085,
"mean_token_accuracy": 0.9053992748260498,
"num_tokens": 6887751.0,
"step": 2190
},
{
"entropy": 0.3537640705704689,
"epoch": 2.491506228765572,
"grad_norm": 3.6062848567962646,
"learning_rate": 3.057864886959907e-05,
"loss": 0.3161,
"mean_token_accuracy": 0.9045758932828903,
"num_tokens": 6916652.0,
"step": 2200
},
{
"entropy": 0.3475446105003357,
"epoch": 2.502831257078143,
"grad_norm": 2.8011586666107178,
"learning_rate": 3.0422291530068953e-05,
"loss": 0.3139,
"mean_token_accuracy": 0.9058610677719117,
"num_tokens": 6947964.0,
"step": 2210
},
{
"entropy": 0.33603944927453994,
"epoch": 2.5141562853907136,
"grad_norm": 2.7381720542907715,
"learning_rate": 3.02657113013717e-05,
"loss": 0.311,
"mean_token_accuracy": 0.9063718646764756,
"num_tokens": 6979191.0,
"step": 2220
},
{
"entropy": 0.33636826276779175,
"epoch": 2.5254813137032843,
"grad_norm": 2.717543363571167,
"learning_rate": 3.0108914619907236e-05,
"loss": 0.2989,
"mean_token_accuracy": 0.9079161524772644,
"num_tokens": 7011313.0,
"step": 2230
},
{
"entropy": 0.32530431896448136,
"epoch": 2.536806342015855,
"grad_norm": 3.0896191596984863,
"learning_rate": 2.9951907930972994e-05,
"loss": 0.3001,
"mean_token_accuracy": 0.9083738714456558,
"num_tokens": 7042592.0,
"step": 2240
},
{
"entropy": 0.3297998860478401,
"epoch": 2.548131370328426,
"grad_norm": 2.8318963050842285,
"learning_rate": 2.9794697688499e-05,
"loss": 0.2948,
"mean_token_accuracy": 0.9093282163143158,
"num_tokens": 7074945.0,
"step": 2250
},
{
"entropy": 0.34250654131174085,
"epoch": 2.5594563986409966,
"grad_norm": 3.3961706161499023,
"learning_rate": 2.9637290354782587e-05,
"loss": 0.3157,
"mean_token_accuracy": 0.904097443819046,
"num_tokens": 7107633.0,
"step": 2260
},
{
"entropy": 0.3470340445637703,
"epoch": 2.5707814269535674,
"grad_norm": 3.1472079753875732,
"learning_rate": 2.9479692400222715e-05,
"loss": 0.3189,
"mean_token_accuracy": 0.9026219546794891,
"num_tokens": 7140772.0,
"step": 2270
},
{
"entropy": 0.33536320328712466,
"epoch": 2.582106455266138,
"grad_norm": 3.536257266998291,
"learning_rate": 2.932191030305401e-05,
"loss": 0.3074,
"mean_token_accuracy": 0.9077878296375275,
"num_tokens": 7174565.0,
"step": 2280
},
{
"entropy": 0.34011308997869494,
"epoch": 2.593431483578709,
"grad_norm": 2.7522521018981934,
"learning_rate": 2.916395054908052e-05,
"loss": 0.3063,
"mean_token_accuracy": 0.9075266391038894,
"num_tokens": 7205947.0,
"step": 2290
},
{
"entropy": 0.3416082814335823,
"epoch": 2.6047565118912797,
"grad_norm": 3.5005111694335938,
"learning_rate": 2.9005819631409043e-05,
"loss": 0.3097,
"mean_token_accuracy": 0.9054687231779098,
"num_tokens": 7239660.0,
"step": 2300
},
{
"entropy": 0.330567429959774,
"epoch": 2.6160815402038504,
"grad_norm": 2.752882480621338,
"learning_rate": 2.8847524050182233e-05,
"loss": 0.2994,
"mean_token_accuracy": 0.9095361143350601,
"num_tokens": 7271351.0,
"step": 2310
},
{
"entropy": 0.3262020036578178,
"epoch": 2.627406568516421,
"grad_norm": 2.702531337738037,
"learning_rate": 2.8689070312311443e-05,
"loss": 0.307,
"mean_token_accuracy": 0.9070776045322418,
"num_tokens": 7306146.0,
"step": 2320
},
{
"entropy": 0.33680719435214995,
"epoch": 2.638731596828992,
"grad_norm": 2.79473614692688,
"learning_rate": 2.853046493120921e-05,
"loss": 0.3065,
"mean_token_accuracy": 0.9064619988203049,
"num_tokens": 7338021.0,
"step": 2330
},
{
"entropy": 0.3301373228430748,
"epoch": 2.6500566251415627,
"grad_norm": 2.9453022480010986,
"learning_rate": 2.837171442652155e-05,
"loss": 0.2878,
"mean_token_accuracy": 0.9134915202856064,
"num_tokens": 7367317.0,
"step": 2340
},
{
"entropy": 0.34509565234184264,
"epoch": 2.661381653454134,
"grad_norm": 2.921013832092285,
"learning_rate": 2.821282532385991e-05,
"loss": 0.3178,
"mean_token_accuracy": 0.9036821633577347,
"num_tokens": 7399316.0,
"step": 2350
},
{
"entropy": 0.33235773593187334,
"epoch": 2.6727066817667042,
"grad_norm": 3.50290846824646,
"learning_rate": 2.8053804154532992e-05,
"loss": 0.3025,
"mean_token_accuracy": 0.9082023203372955,
"num_tokens": 7429163.0,
"step": 2360
},
{
"entropy": 0.3544440522789955,
"epoch": 2.6840317100792754,
"grad_norm": 3.421252489089966,
"learning_rate": 2.78946574552782e-05,
"loss": 0.3255,
"mean_token_accuracy": 0.9013722956180572,
"num_tokens": 7459357.0,
"step": 2370
},
{
"entropy": 0.3445164501667023,
"epoch": 2.6953567383918458,
"grad_norm": 2.7874948978424072,
"learning_rate": 2.7735391767993024e-05,
"loss": 0.3056,
"mean_token_accuracy": 0.9070187360048294,
"num_tokens": 7491443.0,
"step": 2380
},
{
"entropy": 0.3235954880714417,
"epoch": 2.706681766704417,
"grad_norm": 2.596618413925171,
"learning_rate": 2.7576013639466064e-05,
"loss": 0.3016,
"mean_token_accuracy": 0.9080851078033447,
"num_tokens": 7525034.0,
"step": 2390
},
{
"entropy": 0.33873881548643114,
"epoch": 2.7180067950169873,
"grad_norm": 2.730522394180298,
"learning_rate": 2.741652962110794e-05,
"loss": 0.3098,
"mean_token_accuracy": 0.9070809602737426,
"num_tokens": 7556123.0,
"step": 2400
},
{
"entropy": 0.3413001626729965,
"epoch": 2.7293318233295585,
"grad_norm": 2.565798759460449,
"learning_rate": 2.7256946268681997e-05,
"loss": 0.3029,
"mean_token_accuracy": 0.9063649326562881,
"num_tokens": 7587382.0,
"step": 2410
},
{
"entropy": 0.32414872050285337,
"epoch": 2.7406568516421292,
"grad_norm": 2.687298536300659,
"learning_rate": 2.7097270142034798e-05,
"loss": 0.3039,
"mean_token_accuracy": 0.90952388048172,
"num_tokens": 7620486.0,
"step": 2420
},
{
"entropy": 0.33378091007471083,
"epoch": 2.7519818799547,
"grad_norm": 2.9899933338165283,
"learning_rate": 2.6937507804826505e-05,
"loss": 0.3099,
"mean_token_accuracy": 0.9065448194742203,
"num_tokens": 7647800.0,
"step": 2430
},
{
"entropy": 0.32358061224222184,
"epoch": 2.7633069082672708,
"grad_norm": 3.958669424057007,
"learning_rate": 2.6777665824261056e-05,
"loss": 0.2932,
"mean_token_accuracy": 0.9114756107330322,
"num_tokens": 7681853.0,
"step": 2440
},
{
"entropy": 0.35349722802639005,
"epoch": 2.7746319365798415,
"grad_norm": 2.954362154006958,
"learning_rate": 2.661775077081621e-05,
"loss": 0.3166,
"mean_token_accuracy": 0.9041797786951065,
"num_tokens": 7711414.0,
"step": 2450
},
{
"entropy": 0.3320110633969307,
"epoch": 2.7859569648924123,
"grad_norm": 3.5596938133239746,
"learning_rate": 2.645776921797347e-05,
"loss": 0.3035,
"mean_token_accuracy": 0.9083330810070038,
"num_tokens": 7745429.0,
"step": 2460
},
{
"entropy": 0.331499008834362,
"epoch": 2.797281993204983,
"grad_norm": 2.9514622688293457,
"learning_rate": 2.6297727741947876e-05,
"loss": 0.2946,
"mean_token_accuracy": 0.9112407445907593,
"num_tokens": 7776539.0,
"step": 2470
},
{
"entropy": 0.34212442487478256,
"epoch": 2.808607021517554,
"grad_norm": 4.607098579406738,
"learning_rate": 2.613763292141766e-05,
"loss": 0.3161,
"mean_token_accuracy": 0.9037576377391815,
"num_tokens": 7807498.0,
"step": 2480
},
{
"entropy": 0.326346854865551,
"epoch": 2.8199320498301246,
"grad_norm": 2.47886323928833,
"learning_rate": 2.5977491337253844e-05,
"loss": 0.2944,
"mean_token_accuracy": 0.911178269982338,
"num_tokens": 7838306.0,
"step": 2490
},
{
"entropy": 0.33608004450798035,
"epoch": 2.8312570781426953,
"grad_norm": 3.0412232875823975,
"learning_rate": 2.5817309572249736e-05,
"loss": 0.3037,
"mean_token_accuracy": 0.9075497031211853,
"num_tokens": 7872016.0,
"step": 2500
},
{
"epoch": 2.8312570781426953,
"eval_entropy": 0.327717344151504,
"eval_loss": 0.38485008478164673,
"eval_mean_token_accuracy": 0.9145353396437061,
"eval_num_tokens": 7872016.0,
"eval_runtime": 22.6936,
"eval_samples_per_second": 47.106,
"eval_steps_per_second": 5.905,
"step": 2500
},
{
"entropy": 0.3390058234333992,
"epoch": 2.842582106455266,
"grad_norm": 2.7344987392425537,
"learning_rate": 2.565709421085028e-05,
"loss": 0.3032,
"mean_token_accuracy": 0.9075493305921555,
"num_tokens": 7904470.0,
"step": 2510
},
{
"entropy": 0.3146300859749317,
"epoch": 2.853907134767837,
"grad_norm": 3.110588788986206,
"learning_rate": 2.549685183888149e-05,
"loss": 0.2803,
"mean_token_accuracy": 0.9147897362709045,
"num_tokens": 7935280.0,
"step": 2520
},
{
"entropy": 0.3277345307171345,
"epoch": 2.8652321630804076,
"grad_norm": 3.927215099334717,
"learning_rate": 2.5336589043279634e-05,
"loss": 0.315,
"mean_token_accuracy": 0.9082329630851745,
"num_tokens": 7968973.0,
"step": 2530
},
{
"entropy": 0.31817393004894257,
"epoch": 2.8765571913929784,
"grad_norm": 3.023855447769165,
"learning_rate": 2.5176312411820545e-05,
"loss": 0.2923,
"mean_token_accuracy": 0.9107245028018951,
"num_tokens": 8001676.0,
"step": 2540
},
{
"entropy": 0.33554808497428895,
"epoch": 2.887882219705549,
"grad_norm": 3.3250324726104736,
"learning_rate": 2.5016028532848768e-05,
"loss": 0.3107,
"mean_token_accuracy": 0.9068995088338851,
"num_tokens": 8034188.0,
"step": 2550
},
{
"entropy": 0.33894127756357195,
"epoch": 2.89920724801812,
"grad_norm": 2.6111295223236084,
"learning_rate": 2.4855743995006785e-05,
"loss": 0.3057,
"mean_token_accuracy": 0.9076811999082566,
"num_tokens": 8064782.0,
"step": 2560
},
{
"entropy": 0.34306282699108126,
"epoch": 2.9105322763306907,
"grad_norm": 2.726879596710205,
"learning_rate": 2.4695465386964154e-05,
"loss": 0.3149,
"mean_token_accuracy": 0.9052368462085724,
"num_tokens": 8095683.0,
"step": 2570
},
{
"entropy": 0.32450498044490816,
"epoch": 2.921857304643262,
"grad_norm": 2.7551498413085938,
"learning_rate": 2.4535199297146692e-05,
"loss": 0.2934,
"mean_token_accuracy": 0.9113465547561646,
"num_tokens": 8128181.0,
"step": 2580
},
{
"entropy": 0.31809509843587874,
"epoch": 2.933182332955832,
"grad_norm": 3.248033046722412,
"learning_rate": 2.4374952313465618e-05,
"loss": 0.2929,
"mean_token_accuracy": 0.9110888838768005,
"num_tokens": 8161222.0,
"step": 2590
},
{
"entropy": 0.3428458094596863,
"epoch": 2.9445073612684034,
"grad_norm": 3.0945913791656494,
"learning_rate": 2.4214731023046793e-05,
"loss": 0.3094,
"mean_token_accuracy": 0.9040931612253189,
"num_tokens": 8191149.0,
"step": 2600
},
{
"entropy": 0.3347173988819122,
"epoch": 2.9558323895809737,
"grad_norm": 3.0713417530059814,
"learning_rate": 2.4054542011959925e-05,
"loss": 0.3085,
"mean_token_accuracy": 0.9067859917879104,
"num_tokens": 8223169.0,
"step": 2610
},
{
"entropy": 0.33130600601434707,
"epoch": 2.967157417893545,
"grad_norm": 2.971564531326294,
"learning_rate": 2.3894391864947845e-05,
"loss": 0.3003,
"mean_token_accuracy": 0.9123528331518174,
"num_tokens": 8251958.0,
"step": 2620
},
{
"entropy": 0.3486244469881058,
"epoch": 2.9784824462061152,
"grad_norm": 3.0552313327789307,
"learning_rate": 2.3734287165155814e-05,
"loss": 0.3088,
"mean_token_accuracy": 0.9068525344133377,
"num_tokens": 8280414.0,
"step": 2630
},
{
"entropy": 0.3378919124603271,
"epoch": 2.9898074745186864,
"grad_norm": 3.0992133617401123,
"learning_rate": 2.357423449386097e-05,
"loss": 0.308,
"mean_token_accuracy": 0.908593600988388,
"num_tokens": 8309410.0,
"step": 2640
},
{
"entropy": 0.32522587329149244,
"epoch": 3.001132502831257,
"grad_norm": 1.8476426601409912,
"learning_rate": 2.341424043020174e-05,
"loss": 0.2861,
"mean_token_accuracy": 0.9121149241924286,
"num_tokens": 8339500.0,
"step": 2650
},
{
"entropy": 0.2401589497923851,
"epoch": 3.012457531143828,
"grad_norm": 2.254667282104492,
"learning_rate": 2.3254311550907432e-05,
"loss": 0.158,
"mean_token_accuracy": 0.9545799106359482,
"num_tokens": 8368545.0,
"step": 2660
},
{
"entropy": 0.1963147647678852,
"epoch": 3.0237825594563987,
"grad_norm": 3.1326498985290527,
"learning_rate": 2.309445443002788e-05,
"loss": 0.1527,
"mean_token_accuracy": 0.9554174333810806,
"num_tokens": 8398109.0,
"step": 2670
},
{
"entropy": 0.18867359235882758,
"epoch": 3.0351075877689695,
"grad_norm": 2.721315860748291,
"learning_rate": 2.293467563866319e-05,
"loss": 0.1467,
"mean_token_accuracy": 0.9568633317947388,
"num_tokens": 8427150.0,
"step": 2680
},
{
"entropy": 0.17226183749735355,
"epoch": 3.0464326160815403,
"grad_norm": 2.6865651607513428,
"learning_rate": 2.2774981744693667e-05,
"loss": 0.1333,
"mean_token_accuracy": 0.9603831499814988,
"num_tokens": 8459804.0,
"step": 2690
},
{
"entropy": 0.17720401659607887,
"epoch": 3.057757644394111,
"grad_norm": 2.5850725173950195,
"learning_rate": 2.2615379312509827e-05,
"loss": 0.1482,
"mean_token_accuracy": 0.9568986982107163,
"num_tokens": 8496137.0,
"step": 2700
},
{
"entropy": 0.19257365614175798,
"epoch": 3.069082672706682,
"grad_norm": 2.543994188308716,
"learning_rate": 2.245587490274253e-05,
"loss": 0.1511,
"mean_token_accuracy": 0.9563712388277054,
"num_tokens": 8527404.0,
"step": 2710
},
{
"entropy": 0.1776495523750782,
"epoch": 3.0804077010192525,
"grad_norm": 2.6783289909362793,
"learning_rate": 2.229647507199332e-05,
"loss": 0.1394,
"mean_token_accuracy": 0.9598089069128036,
"num_tokens": 8561000.0,
"step": 2720
},
{
"entropy": 0.17914639115333558,
"epoch": 3.0917327293318233,
"grad_norm": 2.805384635925293,
"learning_rate": 2.2137186372564918e-05,
"loss": 0.1462,
"mean_token_accuracy": 0.956651571393013,
"num_tokens": 8593390.0,
"step": 2730
},
{
"entropy": 0.19363198950886726,
"epoch": 3.103057757644394,
"grad_norm": 3.3500022888183594,
"learning_rate": 2.1978015352191864e-05,
"loss": 0.1506,
"mean_token_accuracy": 0.9557105898857117,
"num_tokens": 8622236.0,
"step": 2740
},
{
"entropy": 0.18671710416674614,
"epoch": 3.114382785956965,
"grad_norm": 2.2808239459991455,
"learning_rate": 2.181896855377138e-05,
"loss": 0.1435,
"mean_token_accuracy": 0.9574358344078064,
"num_tokens": 8654502.0,
"step": 2750
},
{
"entropy": 0.1893085241317749,
"epoch": 3.1257078142695356,
"grad_norm": 2.6378250122070312,
"learning_rate": 2.1660052515094405e-05,
"loss": 0.1526,
"mean_token_accuracy": 0.9564388334751129,
"num_tokens": 8685392.0,
"step": 2760
},
{
"entropy": 0.18678954988718033,
"epoch": 3.1370328425821064,
"grad_norm": 2.5639476776123047,
"learning_rate": 2.1501273768576852e-05,
"loss": 0.1448,
"mean_token_accuracy": 0.9556661039590836,
"num_tokens": 8718903.0,
"step": 2770
},
{
"entropy": 0.1747233547270298,
"epoch": 3.148357870894677,
"grad_norm": 2.5726349353790283,
"learning_rate": 2.1342638840991097e-05,
"loss": 0.1401,
"mean_token_accuracy": 0.9590637445449829,
"num_tokens": 8751352.0,
"step": 2780
},
{
"entropy": 0.18045611083507537,
"epoch": 3.159682899207248,
"grad_norm": 2.1656455993652344,
"learning_rate": 2.1184154253197684e-05,
"loss": 0.1461,
"mean_token_accuracy": 0.9568846583366394,
"num_tokens": 8784384.0,
"step": 2790
},
{
"entropy": 0.1883378468453884,
"epoch": 3.1710079275198186,
"grad_norm": 2.388462781906128,
"learning_rate": 2.1025826519877282e-05,
"loss": 0.144,
"mean_token_accuracy": 0.9586382806301117,
"num_tokens": 8813243.0,
"step": 2800
},
{
"entropy": 0.18703141212463378,
"epoch": 3.1823329558323894,
"grad_norm": 2.7605926990509033,
"learning_rate": 2.086766214926288e-05,
"loss": 0.1506,
"mean_token_accuracy": 0.956416517496109,
"num_tokens": 8842657.0,
"step": 2810
},
{
"entropy": 0.18549679666757585,
"epoch": 3.19365798414496,
"grad_norm": 2.453981399536133,
"learning_rate": 2.0709667642872256e-05,
"loss": 0.1456,
"mean_token_accuracy": 0.9563432067632676,
"num_tokens": 8874598.0,
"step": 2820
},
{
"entropy": 0.18915204033255578,
"epoch": 3.2049830124575314,
"grad_norm": 2.598982810974121,
"learning_rate": 2.055184949524075e-05,
"loss": 0.1502,
"mean_token_accuracy": 0.9555239349603653,
"num_tokens": 8905326.0,
"step": 2830
},
{
"entropy": 0.18085768818855286,
"epoch": 3.216308040770102,
"grad_norm": 2.4260501861572266,
"learning_rate": 2.039421419365427e-05,
"loss": 0.1458,
"mean_token_accuracy": 0.9582718342542649,
"num_tokens": 8938848.0,
"step": 2840
},
{
"entropy": 0.183878605812788,
"epoch": 3.227633069082673,
"grad_norm": 2.386054039001465,
"learning_rate": 2.0236768217882683e-05,
"loss": 0.153,
"mean_token_accuracy": 0.956705066561699,
"num_tokens": 8969858.0,
"step": 2850
},
{
"entropy": 0.18992681950330734,
"epoch": 3.2389580973952437,
"grad_norm": 3.0777018070220947,
"learning_rate": 2.0079518039913343e-05,
"loss": 0.1498,
"mean_token_accuracy": 0.9585150212049485,
"num_tokens": 8999393.0,
"step": 2860
},
{
"entropy": 0.18626221343874932,
"epoch": 3.2502831257078144,
"grad_norm": 2.023559808731079,
"learning_rate": 1.992247012368517e-05,
"loss": 0.1492,
"mean_token_accuracy": 0.9551093459129334,
"num_tokens": 9031528.0,
"step": 2870
},
{
"entropy": 0.18020688593387604,
"epoch": 3.261608154020385,
"grad_norm": 2.096430540084839,
"learning_rate": 1.9765630924822886e-05,
"loss": 0.1421,
"mean_token_accuracy": 0.9582108706235886,
"num_tokens": 9063549.0,
"step": 2880
},
{
"entropy": 0.18335785791277887,
"epoch": 3.272933182332956,
"grad_norm": 2.4209272861480713,
"learning_rate": 1.9609006890371667e-05,
"loss": 0.1493,
"mean_token_accuracy": 0.9564682513475418,
"num_tokens": 9095204.0,
"step": 2890
},
{
"entropy": 0.1816806599497795,
"epoch": 3.2842582106455267,
"grad_norm": 2.4786605834960938,
"learning_rate": 1.9452604458532113e-05,
"loss": 0.1396,
"mean_token_accuracy": 0.9589683145284653,
"num_tokens": 9128871.0,
"step": 2900
},
{
"entropy": 0.18295636475086213,
"epoch": 3.2955832389580975,
"grad_norm": 2.6071975231170654,
"learning_rate": 1.92964300583956e-05,
"loss": 0.1412,
"mean_token_accuracy": 0.959463146328926,
"num_tokens": 9159877.0,
"step": 2910
},
{
"entropy": 0.1954453021287918,
"epoch": 3.3069082672706682,
"grad_norm": 2.99585223197937,
"learning_rate": 1.914049010968003e-05,
"loss": 0.155,
"mean_token_accuracy": 0.9559677958488464,
"num_tokens": 9191639.0,
"step": 2920
},
{
"entropy": 0.1915322221815586,
"epoch": 3.318233295583239,
"grad_norm": 2.6830849647521973,
"learning_rate": 1.898479102246592e-05,
"loss": 0.1487,
"mean_token_accuracy": 0.9566442102193833,
"num_tokens": 9222953.0,
"step": 2930
},
{
"entropy": 0.18220141232013704,
"epoch": 3.3295583238958097,
"grad_norm": 2.3461086750030518,
"learning_rate": 1.8829339196932927e-05,
"loss": 0.1421,
"mean_token_accuracy": 0.9585667610168457,
"num_tokens": 9255460.0,
"step": 2940
},
{
"entropy": 0.1813945546746254,
"epoch": 3.3408833522083805,
"grad_norm": 2.318178415298462,
"learning_rate": 1.867414102309671e-05,
"loss": 0.148,
"mean_token_accuracy": 0.9563045471906662,
"num_tokens": 9287765.0,
"step": 2950
},
{
"entropy": 0.18687178418040276,
"epoch": 3.3522083805209513,
"grad_norm": 2.7838022708892822,
"learning_rate": 1.8519202880546337e-05,
"loss": 0.1443,
"mean_token_accuracy": 0.9579327285289765,
"num_tokens": 9320472.0,
"step": 2960
},
{
"entropy": 0.18875156193971634,
"epoch": 3.363533408833522,
"grad_norm": 2.69187068939209,
"learning_rate": 1.8364531138182002e-05,
"loss": 0.1471,
"mean_token_accuracy": 0.9574691027402877,
"num_tokens": 9351036.0,
"step": 2970
},
{
"entropy": 0.179339037835598,
"epoch": 3.374858437146093,
"grad_norm": 2.6196610927581787,
"learning_rate": 1.821013215395322e-05,
"loss": 0.1389,
"mean_token_accuracy": 0.9601082682609559,
"num_tokens": 9382386.0,
"step": 2980
},
{
"entropy": 0.18361469805240632,
"epoch": 3.3861834654586636,
"grad_norm": 2.4154880046844482,
"learning_rate": 1.805601227459751e-05,
"loss": 0.146,
"mean_token_accuracy": 0.957899197936058,
"num_tokens": 9414364.0,
"step": 2990
},
{
"entropy": 0.18562110662460327,
"epoch": 3.3975084937712343,
"grad_norm": 1.9397906064987183,
"learning_rate": 1.7902177835379437e-05,
"loss": 0.1424,
"mean_token_accuracy": 0.9570330619812012,
"num_tokens": 9446533.0,
"step": 3000
},
{
"epoch": 3.3975084937712343,
"eval_entropy": 0.2167282881799029,
"eval_loss": 0.3606521189212799,
"eval_mean_token_accuracy": 0.9296634993446407,
"eval_num_tokens": 9446533.0,
"eval_runtime": 22.8577,
"eval_samples_per_second": 46.768,
"eval_steps_per_second": 5.862,
"step": 3000
},
{
"entropy": 0.19131319522857665,
"epoch": 3.408833522083805,
"grad_norm": 2.902683973312378,
"learning_rate": 1.7748635159830286e-05,
"loss": 0.1543,
"mean_token_accuracy": 0.9540064781904221,
"num_tokens": 9478106.0,
"step": 3010
},
{
"entropy": 0.1933427281677723,
"epoch": 3.420158550396376,
"grad_norm": 3.3666422367095947,
"learning_rate": 1.759539055948806e-05,
"loss": 0.1498,
"mean_token_accuracy": 0.9571747660636902,
"num_tokens": 9511243.0,
"step": 3020
},
{
"entropy": 0.1907684937119484,
"epoch": 3.4314835787089466,
"grad_norm": 2.353508710861206,
"learning_rate": 1.7442450333638073e-05,
"loss": 0.1544,
"mean_token_accuracy": 0.9548930257558823,
"num_tokens": 9540974.0,
"step": 3030
},
{
"entropy": 0.2047928489744663,
"epoch": 3.4428086070215174,
"grad_norm": 2.5164899826049805,
"learning_rate": 1.728982076905396e-05,
"loss": 0.1651,
"mean_token_accuracy": 0.9520281046628952,
"num_tokens": 9571441.0,
"step": 3040
},
{
"entropy": 0.18354908376932144,
"epoch": 3.454133635334088,
"grad_norm": 2.7729098796844482,
"learning_rate": 1.7137508139739327e-05,
"loss": 0.1409,
"mean_token_accuracy": 0.9593415051698685,
"num_tokens": 9601231.0,
"step": 3050
},
{
"entropy": 0.1844599574804306,
"epoch": 3.4654586636466593,
"grad_norm": 2.361435890197754,
"learning_rate": 1.6985518706669794e-05,
"loss": 0.147,
"mean_token_accuracy": 0.9563383430242538,
"num_tokens": 9632314.0,
"step": 3060
},
{
"entropy": 0.18727928251028061,
"epoch": 3.4767836919592296,
"grad_norm": 3.096991539001465,
"learning_rate": 1.6833858717535657e-05,
"loss": 0.1489,
"mean_token_accuracy": 0.9559879064559936,
"num_tokens": 9664358.0,
"step": 3070
},
{
"entropy": 0.18697959557175636,
"epoch": 3.488108720271801,
"grad_norm": 2.7007927894592285,
"learning_rate": 1.6682534406485055e-05,
"loss": 0.1435,
"mean_token_accuracy": 0.9573837488889694,
"num_tokens": 9694084.0,
"step": 3080
},
{
"entropy": 0.17963240146636963,
"epoch": 3.4994337485843716,
"grad_norm": 2.9009249210357666,
"learning_rate": 1.6531551993867717e-05,
"loss": 0.1488,
"mean_token_accuracy": 0.957426118850708,
"num_tokens": 9726062.0,
"step": 3090
},
{
"entropy": 0.18775674030184747,
"epoch": 3.5107587768969424,
"grad_norm": 2.6308300495147705,
"learning_rate": 1.638091768597927e-05,
"loss": 0.1468,
"mean_token_accuracy": 0.9567855209112167,
"num_tokens": 9757366.0,
"step": 3100
},
{
"entropy": 0.18580029234290124,
"epoch": 3.522083805209513,
"grad_norm": 2.537184715270996,
"learning_rate": 1.623063767480611e-05,
"loss": 0.1451,
"mean_token_accuracy": 0.9578706949949265,
"num_tokens": 9789224.0,
"step": 3110
},
{
"entropy": 0.184904894977808,
"epoch": 3.533408833522084,
"grad_norm": 2.5414226055145264,
"learning_rate": 1.6080718137770908e-05,
"loss": 0.1492,
"mean_token_accuracy": 0.9576172918081284,
"num_tokens": 9820220.0,
"step": 3120
},
{
"entropy": 0.1862082712352276,
"epoch": 3.5447338618346547,
"grad_norm": 2.7060234546661377,
"learning_rate": 1.5931165237478618e-05,
"loss": 0.1465,
"mean_token_accuracy": 0.9575734734535217,
"num_tokens": 9852382.0,
"step": 3130
},
{
"entropy": 0.19045321121811867,
"epoch": 3.5560588901472254,
"grad_norm": 2.51277756690979,
"learning_rate": 1.5781985121463215e-05,
"loss": 0.1529,
"mean_token_accuracy": 0.9558474659919739,
"num_tokens": 9882680.0,
"step": 3140
},
{
"entropy": 0.1901462681591511,
"epoch": 3.567383918459796,
"grad_norm": 3.2841107845306396,
"learning_rate": 1.5633183921934975e-05,
"loss": 0.1494,
"mean_token_accuracy": 0.9553068786859512,
"num_tokens": 9912907.0,
"step": 3150
},
{
"entropy": 0.20788209587335588,
"epoch": 3.578708946772367,
"grad_norm": 2.5944294929504395,
"learning_rate": 1.5484767755528396e-05,
"loss": 0.1695,
"mean_token_accuracy": 0.9538979262113572,
"num_tokens": 9943382.0,
"step": 3160
},
{
"entropy": 0.19017338678240775,
"epoch": 3.5900339750849377,
"grad_norm": 2.511800765991211,
"learning_rate": 1.5336742723050773e-05,
"loss": 0.1508,
"mean_token_accuracy": 0.9550507336854934,
"num_tokens": 9974102.0,
"step": 3170
},
{
"entropy": 0.18529783561825752,
"epoch": 3.6013590033975085,
"grad_norm": 2.57311749458313,
"learning_rate": 1.5189114909231417e-05,
"loss": 0.1468,
"mean_token_accuracy": 0.9569410651922226,
"num_tokens": 10006688.0,
"step": 3180
},
{
"entropy": 0.17686643823981285,
"epoch": 3.6126840317100792,
"grad_norm": 2.5054280757904053,
"learning_rate": 1.5041890382471529e-05,
"loss": 0.1447,
"mean_token_accuracy": 0.9577518045902252,
"num_tokens": 10041266.0,
"step": 3190
},
{
"entropy": 0.17823027074337006,
"epoch": 3.62400906002265,
"grad_norm": 2.448424816131592,
"learning_rate": 1.4895075194594772e-05,
"loss": 0.1432,
"mean_token_accuracy": 0.9584116190671921,
"num_tokens": 10073211.0,
"step": 3200
},
{
"entropy": 0.18234781473875045,
"epoch": 3.6353340883352208,
"grad_norm": 2.9412801265716553,
"learning_rate": 1.4748675380598485e-05,
"loss": 0.1437,
"mean_token_accuracy": 0.958428630232811,
"num_tokens": 10103170.0,
"step": 3210
},
{
"entropy": 0.18727488964796066,
"epoch": 3.6466591166477915,
"grad_norm": 2.0503034591674805,
"learning_rate": 1.4602696958405604e-05,
"loss": 0.1461,
"mean_token_accuracy": 0.9569843918085098,
"num_tokens": 10133784.0,
"step": 3220
},
{
"entropy": 0.18094087094068528,
"epoch": 3.6579841449603623,
"grad_norm": 2.4768571853637695,
"learning_rate": 1.4457145928617318e-05,
"loss": 0.1436,
"mean_token_accuracy": 0.956751012802124,
"num_tokens": 10165215.0,
"step": 3230
},
{
"entropy": 0.1724256232380867,
"epoch": 3.669309173272933,
"grad_norm": 2.701741933822632,
"learning_rate": 1.4312028274266368e-05,
"loss": 0.1358,
"mean_token_accuracy": 0.9607989430427551,
"num_tokens": 10198970.0,
"step": 3240
},
{
"entropy": 0.1781797580420971,
"epoch": 3.680634201585504,
"grad_norm": 2.8996741771698,
"learning_rate": 1.4167349960571139e-05,
"loss": 0.1473,
"mean_token_accuracy": 0.9563544005155563,
"num_tokens": 10231477.0,
"step": 3250
},
{
"entropy": 0.18036175966262818,
"epoch": 3.6919592298980746,
"grad_norm": 2.777268409729004,
"learning_rate": 1.4023116934690444e-05,
"loss": 0.1398,
"mean_token_accuracy": 0.9592943549156189,
"num_tokens": 10264052.0,
"step": 3260
},
{
"entropy": 0.17828406393527985,
"epoch": 3.7032842582106458,
"grad_norm": 2.637403964996338,
"learning_rate": 1.3879335125479054e-05,
"loss": 0.136,
"mean_token_accuracy": 0.9595444172620773,
"num_tokens": 10295601.0,
"step": 3270
},
{
"entropy": 0.18349251225590707,
"epoch": 3.714609286523216,
"grad_norm": 2.5384583473205566,
"learning_rate": 1.3736010443243987e-05,
"loss": 0.1452,
"mean_token_accuracy": 0.9566281735897064,
"num_tokens": 10328327.0,
"step": 3280
},
{
"entropy": 0.1727997176349163,
"epoch": 3.7259343148357873,
"grad_norm": 2.7597897052764893,
"learning_rate": 1.3593148779501575e-05,
"loss": 0.1383,
"mean_token_accuracy": 0.9598427444696427,
"num_tokens": 10363214.0,
"step": 3290
},
{
"entropy": 0.17277977019548416,
"epoch": 3.7372593431483576,
"grad_norm": 2.6197760105133057,
"learning_rate": 1.345075600673526e-05,
"loss": 0.1351,
"mean_token_accuracy": 0.9603388220071792,
"num_tokens": 10394380.0,
"step": 3300
},
{
"entropy": 0.18125973120331765,
"epoch": 3.748584371460929,
"grad_norm": 2.668290853500366,
"learning_rate": 1.330883797815421e-05,
"loss": 0.1351,
"mean_token_accuracy": 0.9600670874118805,
"num_tokens": 10426589.0,
"step": 3310
},
{
"entropy": 0.17329150810837746,
"epoch": 3.7599093997734996,
"grad_norm": 2.1576309204101562,
"learning_rate": 1.3167400527452722e-05,
"loss": 0.1354,
"mean_token_accuracy": 0.9584385722875595,
"num_tokens": 10459861.0,
"step": 3320
},
{
"entropy": 0.18031825870275497,
"epoch": 3.7712344280860703,
"grad_norm": 2.3556900024414062,
"learning_rate": 1.3026449468570434e-05,
"loss": 0.1431,
"mean_token_accuracy": 0.9587339907884598,
"num_tokens": 10490225.0,
"step": 3330
},
{
"entropy": 0.17993075400590897,
"epoch": 3.782559456398641,
"grad_norm": 2.7811639308929443,
"learning_rate": 1.28859905954533e-05,
"loss": 0.1413,
"mean_token_accuracy": 0.9577191978693008,
"num_tokens": 10520675.0,
"step": 3340
},
{
"entropy": 0.17930143624544143,
"epoch": 3.793884484711212,
"grad_norm": 2.998231887817383,
"learning_rate": 1.2746029681815468e-05,
"loss": 0.1378,
"mean_token_accuracy": 0.9599625527858734,
"num_tokens": 10551792.0,
"step": 3350
},
{
"entropy": 0.17830280363559722,
"epoch": 3.8052095130237826,
"grad_norm": 2.379882574081421,
"learning_rate": 1.2606572480901887e-05,
"loss": 0.1405,
"mean_token_accuracy": 0.9589870750904084,
"num_tokens": 10582017.0,
"step": 3360
},
{
"entropy": 0.18163416758179665,
"epoch": 3.8165345413363534,
"grad_norm": 2.8061420917510986,
"learning_rate": 1.2467624725251881e-05,
"loss": 0.1398,
"mean_token_accuracy": 0.9601597607135772,
"num_tokens": 10612802.0,
"step": 3370
},
{
"entropy": 0.1735350415110588,
"epoch": 3.827859569648924,
"grad_norm": 2.3078725337982178,
"learning_rate": 1.2329192126463466e-05,
"loss": 0.1426,
"mean_token_accuracy": 0.95964674949646,
"num_tokens": 10644168.0,
"step": 3380
},
{
"entropy": 0.18496472463011743,
"epoch": 3.839184597961495,
"grad_norm": 2.1106555461883545,
"learning_rate": 1.2191280374958558e-05,
"loss": 0.1397,
"mean_token_accuracy": 0.9582046836614608,
"num_tokens": 10675513.0,
"step": 3390
},
{
"entropy": 0.18191742673516273,
"epoch": 3.8505096262740657,
"grad_norm": 2.747044324874878,
"learning_rate": 1.2053895139749136e-05,
"loss": 0.1507,
"mean_token_accuracy": 0.9582120895385742,
"num_tokens": 10705985.0,
"step": 3400
},
{
"entropy": 0.17193833813071252,
"epoch": 3.8618346545866364,
"grad_norm": 2.384255886077881,
"learning_rate": 1.1917042068204082e-05,
"loss": 0.1298,
"mean_token_accuracy": 0.9630542248487473,
"num_tokens": 10737566.0,
"step": 3410
},
{
"entropy": 0.17727019563317298,
"epoch": 3.873159682899207,
"grad_norm": 2.8619980812072754,
"learning_rate": 1.1780726785817161e-05,
"loss": 0.1448,
"mean_token_accuracy": 0.9592967689037323,
"num_tokens": 10767754.0,
"step": 3420
},
{
"entropy": 0.1765161231160164,
"epoch": 3.884484711211778,
"grad_norm": 1.9828704595565796,
"learning_rate": 1.1644954895975726e-05,
"loss": 0.1391,
"mean_token_accuracy": 0.9608483374118805,
"num_tokens": 10799308.0,
"step": 3430
},
{
"entropy": 0.17770369723439217,
"epoch": 3.8958097395243487,
"grad_norm": 2.346670150756836,
"learning_rate": 1.1509731979730393e-05,
"loss": 0.1373,
"mean_token_accuracy": 0.9591909050941467,
"num_tokens": 10831057.0,
"step": 3440
},
{
"entropy": 0.18057307675480844,
"epoch": 3.9071347678369195,
"grad_norm": 2.6184489727020264,
"learning_rate": 1.1375063595565597e-05,
"loss": 0.1389,
"mean_token_accuracy": 0.9603639721870423,
"num_tokens": 10862132.0,
"step": 3450
},
{
"entropy": 0.17516846358776092,
"epoch": 3.9184597961494902,
"grad_norm": 2.5211005210876465,
"learning_rate": 1.1240955279171161e-05,
"loss": 0.1355,
"mean_token_accuracy": 0.9610599786043167,
"num_tokens": 10892085.0,
"step": 3460
},
{
"entropy": 0.17097302675247192,
"epoch": 3.929784824462061,
"grad_norm": 2.9019627571105957,
"learning_rate": 1.1107412543214707e-05,
"loss": 0.1368,
"mean_token_accuracy": 0.9616718500852585,
"num_tokens": 10924497.0,
"step": 3470
},
{
"entropy": 0.17731274515390397,
"epoch": 3.9411098527746318,
"grad_norm": 2.090160369873047,
"learning_rate": 1.0974440877115059e-05,
"loss": 0.1429,
"mean_token_accuracy": 0.9593401104211807,
"num_tokens": 10954535.0,
"step": 3480
},
{
"entropy": 0.174043457955122,
"epoch": 3.9524348810872025,
"grad_norm": 2.19345760345459,
"learning_rate": 1.0842045746816604e-05,
"loss": 0.1334,
"mean_token_accuracy": 0.9610940903425217,
"num_tokens": 10984773.0,
"step": 3490
},
{
"entropy": 0.1764768786728382,
"epoch": 3.9637599093997737,
"grad_norm": 2.4305057525634766,
"learning_rate": 1.0710232594564573e-05,
"loss": 0.1395,
"mean_token_accuracy": 0.9584390819072723,
"num_tokens": 11016942.0,
"step": 3500
},
{
"epoch": 3.9637599093997737,
"eval_entropy": 0.19162503318555318,
"eval_loss": 0.3246709704399109,
"eval_mean_token_accuracy": 0.9435568692079231,
"eval_num_tokens": 11016942.0,
"eval_runtime": 22.7597,
"eval_samples_per_second": 46.969,
"eval_steps_per_second": 5.888,
"step": 3500
},
{
"entropy": 0.1761431761085987,
"epoch": 3.975084937712344,
"grad_norm": 2.183934450149536,
"learning_rate": 1.057900683868138e-05,
"loss": 0.1371,
"mean_token_accuracy": 0.9605765461921691,
"num_tokens": 11047956.0,
"step": 3510
},
{
"entropy": 0.18299319073557854,
"epoch": 3.9864099660249153,
"grad_norm": 2.257868766784668,
"learning_rate": 1.0448373873343892e-05,
"loss": 0.1442,
"mean_token_accuracy": 0.9576489955186844,
"num_tokens": 11077078.0,
"step": 3520
},
{
"entropy": 0.1684267781674862,
"epoch": 3.9977349943374856,
"grad_norm": 2.2900640964508057,
"learning_rate": 1.0318339068361657e-05,
"loss": 0.1332,
"mean_token_accuracy": 0.961523225903511,
"num_tokens": 11110785.0,
"step": 3530
},
{
"entropy": 0.13921927139163018,
"epoch": 4.009060022650057,
"grad_norm": 1.109343409538269,
"learning_rate": 1.018890776895618e-05,
"loss": 0.0855,
"mean_token_accuracy": 0.9792260736227035,
"num_tokens": 11140853.0,
"step": 3540
},
{
"entropy": 0.12029264867305756,
"epoch": 4.020385050962627,
"grad_norm": 2.0965583324432373,
"learning_rate": 1.0060085295541242e-05,
"loss": 0.0692,
"mean_token_accuracy": 0.982733103632927,
"num_tokens": 11171432.0,
"step": 3550
},
{
"entropy": 0.1098148312419653,
"epoch": 4.031710079275198,
"grad_norm": 1.9380742311477661,
"learning_rate": 9.931876943504159e-06,
"loss": 0.0706,
"mean_token_accuracy": 0.9828455358743667,
"num_tokens": 11202059.0,
"step": 3560
},
{
"entropy": 0.10720174796879292,
"epoch": 4.043035107587769,
"grad_norm": 1.4595756530761719,
"learning_rate": 9.804287982988128e-06,
"loss": 0.0699,
"mean_token_accuracy": 0.9834076315164566,
"num_tokens": 11233844.0,
"step": 3570
},
{
"entropy": 0.0977464810013771,
"epoch": 4.05436013590034,
"grad_norm": 1.4825645685195923,
"learning_rate": 9.677323658675594e-06,
"loss": 0.0644,
"mean_token_accuracy": 0.9842707842588425,
"num_tokens": 11268276.0,
"step": 3580
},
{
"entropy": 0.10380081199109555,
"epoch": 4.06568516421291,
"grad_norm": 1.125681757926941,
"learning_rate": 9.550989189572623e-06,
"loss": 0.0664,
"mean_token_accuracy": 0.983523941040039,
"num_tokens": 11300315.0,
"step": 3590
},
{
"entropy": 0.11478501930832863,
"epoch": 4.077010192525481,
"grad_norm": 2.136237144470215,
"learning_rate": 9.425289768794433e-06,
"loss": 0.0741,
"mean_token_accuracy": 0.9814123988151551,
"num_tokens": 11328482.0,
"step": 3600
},
{
"entropy": 0.10670767351984978,
"epoch": 4.088335220838052,
"grad_norm": 1.1652253866195679,
"learning_rate": 9.300230563351877e-06,
"loss": 0.0666,
"mean_token_accuracy": 0.9832827031612397,
"num_tokens": 11359860.0,
"step": 3610
},
{
"entropy": 0.10554791763424873,
"epoch": 4.099660249150623,
"grad_norm": 1.7422910928726196,
"learning_rate": 9.17581671393908e-06,
"loss": 0.0663,
"mean_token_accuracy": 0.9834140241146088,
"num_tokens": 11391326.0,
"step": 3620
},
{
"entropy": 0.09950958117842675,
"epoch": 4.110985277463194,
"grad_norm": 1.6092437505722046,
"learning_rate": 9.052053334722075e-06,
"loss": 0.0665,
"mean_token_accuracy": 0.983272585272789,
"num_tokens": 11424746.0,
"step": 3630
},
{
"entropy": 0.10113044157624244,
"epoch": 4.122310305775764,
"grad_norm": 1.7780877351760864,
"learning_rate": 8.928945513128648e-06,
"loss": 0.0679,
"mean_token_accuracy": 0.9828346908092499,
"num_tokens": 11456151.0,
"step": 3640
},
{
"entropy": 0.10017617046833038,
"epoch": 4.133635334088336,
"grad_norm": 1.337113857269287,
"learning_rate": 8.806498309639161e-06,
"loss": 0.0653,
"mean_token_accuracy": 0.9838932573795318,
"num_tokens": 11489218.0,
"step": 3650
},
{
"entropy": 0.09929984286427498,
"epoch": 4.144960362400906,
"grad_norm": 1.4660295248031616,
"learning_rate": 8.684716757578559e-06,
"loss": 0.0625,
"mean_token_accuracy": 0.9850300490856171,
"num_tokens": 11522506.0,
"step": 3660
},
{
"entropy": 0.10483775176107883,
"epoch": 4.156285390713477,
"grad_norm": 1.3711936473846436,
"learning_rate": 8.563605862909466e-06,
"loss": 0.0674,
"mean_token_accuracy": 0.983280673623085,
"num_tokens": 11554172.0,
"step": 3670
},
{
"entropy": 0.1068756926804781,
"epoch": 4.1676104190260475,
"grad_norm": 1.3237987756729126,
"learning_rate": 8.443170604026388e-06,
"loss": 0.069,
"mean_token_accuracy": 0.9830775171518326,
"num_tokens": 11586956.0,
"step": 3680
},
{
"entropy": 0.10014188215136528,
"epoch": 4.178935447338619,
"grad_norm": 1.6192259788513184,
"learning_rate": 8.323415931551114e-06,
"loss": 0.0634,
"mean_token_accuracy": 0.9846415996551514,
"num_tokens": 11619759.0,
"step": 3690
},
{
"entropy": 0.10651820972561836,
"epoch": 4.190260475651189,
"grad_norm": 1.3025074005126953,
"learning_rate": 8.204346768129182e-06,
"loss": 0.0694,
"mean_token_accuracy": 0.9827026188373565,
"num_tokens": 11652472.0,
"step": 3700
},
{
"entropy": 0.10732679478824139,
"epoch": 4.20158550396376,
"grad_norm": 2.4277567863464355,
"learning_rate": 8.085968008227545e-06,
"loss": 0.0701,
"mean_token_accuracy": 0.9824820995330811,
"num_tokens": 11685051.0,
"step": 3710
},
{
"entropy": 0.10409895218908786,
"epoch": 4.2129105322763305,
"grad_norm": 1.18539559841156,
"learning_rate": 7.96828451793335e-06,
"loss": 0.0642,
"mean_token_accuracy": 0.9833295792341232,
"num_tokens": 11717668.0,
"step": 3720
},
{
"entropy": 0.1043463621288538,
"epoch": 4.224235560588902,
"grad_norm": 1.7125917673110962,
"learning_rate": 7.851301134753958e-06,
"loss": 0.0693,
"mean_token_accuracy": 0.9828597515821457,
"num_tokens": 11748692.0,
"step": 3730
},
{
"entropy": 0.10716413073241711,
"epoch": 4.235560588901472,
"grad_norm": 1.7997198104858398,
"learning_rate": 7.735022667418054e-06,
"loss": 0.0666,
"mean_token_accuracy": 0.9837399870157242,
"num_tokens": 11779854.0,
"step": 3740
},
{
"entropy": 0.10388663597404957,
"epoch": 4.246885617214043,
"grad_norm": 1.4692825078964233,
"learning_rate": 7.619453895678002e-06,
"loss": 0.0704,
"mean_token_accuracy": 0.982433345913887,
"num_tokens": 11811159.0,
"step": 3750
},
{
"entropy": 0.09520137794315815,
"epoch": 4.2582106455266135,
"grad_norm": 1.4710146188735962,
"learning_rate": 7.504599570113355e-06,
"loss": 0.0639,
"mean_token_accuracy": 0.9837541997432708,
"num_tokens": 11845982.0,
"step": 3760
},
{
"entropy": 0.1049416147172451,
"epoch": 4.269535673839185,
"grad_norm": 1.5879381895065308,
"learning_rate": 7.3904644119355856e-06,
"loss": 0.0672,
"mean_token_accuracy": 0.9831987112760544,
"num_tokens": 11877055.0,
"step": 3770
},
{
"entropy": 0.11051239259541035,
"epoch": 4.280860702151755,
"grad_norm": 1.4687776565551758,
"learning_rate": 7.277053112794008e-06,
"loss": 0.0698,
"mean_token_accuracy": 0.9820594847202301,
"num_tokens": 11908759.0,
"step": 3780
},
{
"entropy": 0.09960917495191098,
"epoch": 4.292185730464326,
"grad_norm": 1.4004406929016113,
"learning_rate": 7.164370334582929e-06,
"loss": 0.0638,
"mean_token_accuracy": 0.984798014163971,
"num_tokens": 11941140.0,
"step": 3790
},
{
"entropy": 0.11478472761809826,
"epoch": 4.303510758776897,
"grad_norm": 1.6815623044967651,
"learning_rate": 7.052420709250018e-06,
"loss": 0.0728,
"mean_token_accuracy": 0.9819641202688217,
"num_tokens": 11969922.0,
"step": 3800
},
{
"entropy": 0.1214354656636715,
"epoch": 4.314835787089468,
"grad_norm": 1.6286342144012451,
"learning_rate": 6.941208838605884e-06,
"loss": 0.0783,
"mean_token_accuracy": 0.9797982454299927,
"num_tokens": 11997173.0,
"step": 3810
},
{
"entropy": 0.09785076789557934,
"epoch": 4.326160815402038,
"grad_norm": 1.5718897581100464,
"learning_rate": 6.830739294134944e-06,
"loss": 0.0642,
"mean_token_accuracy": 0.9843629568815231,
"num_tokens": 12030969.0,
"step": 3820
},
{
"entropy": 0.09950311519205571,
"epoch": 4.337485843714609,
"grad_norm": 1.7603079080581665,
"learning_rate": 6.721016616807499e-06,
"loss": 0.0655,
"mean_token_accuracy": 0.9839122086763382,
"num_tokens": 12062700.0,
"step": 3830
},
{
"entropy": 0.10633566826581956,
"epoch": 4.34881087202718,
"grad_norm": 1.8313571214675903,
"learning_rate": 6.6120453168930565e-06,
"loss": 0.0667,
"mean_token_accuracy": 0.9825680136680603,
"num_tokens": 12094124.0,
"step": 3840
},
{
"entropy": 0.10049457848072052,
"epoch": 4.360135900339751,
"grad_norm": 1.9124850034713745,
"learning_rate": 6.50382987377495e-06,
"loss": 0.064,
"mean_token_accuracy": 0.9843475848436356,
"num_tokens": 12126923.0,
"step": 3850
},
{
"entropy": 0.09917602241039276,
"epoch": 4.371460928652322,
"grad_norm": 2.690685510635376,
"learning_rate": 6.396374735766181e-06,
"loss": 0.0654,
"mean_token_accuracy": 0.984014829993248,
"num_tokens": 12162914.0,
"step": 3860
},
{
"entropy": 0.10630933344364166,
"epoch": 4.382785956964892,
"grad_norm": 1.5558531284332275,
"learning_rate": 6.289684319926609e-06,
"loss": 0.0688,
"mean_token_accuracy": 0.9832538574934006,
"num_tokens": 12193386.0,
"step": 3870
},
{
"entropy": 0.10831660889089108,
"epoch": 4.394110985277464,
"grad_norm": 1.1746289730072021,
"learning_rate": 6.183763011881349e-06,
"loss": 0.0678,
"mean_token_accuracy": 0.9832908064126968,
"num_tokens": 12224466.0,
"step": 3880
},
{
"entropy": 0.09632682017982006,
"epoch": 4.405436013590034,
"grad_norm": 1.4813034534454346,
"learning_rate": 6.078615165640514e-06,
"loss": 0.0625,
"mean_token_accuracy": 0.9853335320949554,
"num_tokens": 12258416.0,
"step": 3890
},
{
"entropy": 0.10236595608294011,
"epoch": 4.416761041902605,
"grad_norm": 1.622856616973877,
"learning_rate": 5.9742451034202225e-06,
"loss": 0.0669,
"mean_token_accuracy": 0.9836599975824356,
"num_tokens": 12291050.0,
"step": 3900
},
{
"entropy": 0.09877747371792793,
"epoch": 4.428086070215175,
"grad_norm": 1.5723583698272705,
"learning_rate": 5.870657115464945e-06,
"loss": 0.0649,
"mean_token_accuracy": 0.9841223329305648,
"num_tokens": 12323596.0,
"step": 3910
},
{
"entropy": 0.1077471561729908,
"epoch": 4.439411098527747,
"grad_norm": 1.604130744934082,
"learning_rate": 5.767855459871138e-06,
"loss": 0.0692,
"mean_token_accuracy": 0.9839894771575928,
"num_tokens": 12353006.0,
"step": 3920
},
{
"entropy": 0.11380970776081085,
"epoch": 4.450736126840317,
"grad_norm": 1.3762873411178589,
"learning_rate": 5.66584436241222e-06,
"loss": 0.0722,
"mean_token_accuracy": 0.9826073527336121,
"num_tokens": 12381072.0,
"step": 3930
},
{
"entropy": 0.1162498328834772,
"epoch": 4.462061155152888,
"grad_norm": 1.0174576044082642,
"learning_rate": 5.564628016364862e-06,
"loss": 0.0812,
"mean_token_accuracy": 0.9817124873399734,
"num_tokens": 12411917.0,
"step": 3940
},
{
"entropy": 0.11888325586915016,
"epoch": 4.4733861834654585,
"grad_norm": 1.0936172008514404,
"learning_rate": 5.464210582336595e-06,
"loss": 0.0778,
"mean_token_accuracy": 0.9785851925611496,
"num_tokens": 12444726.0,
"step": 3950
},
{
"entropy": 0.1004635065793991,
"epoch": 4.48471121177803,
"grad_norm": 1.5305367708206177,
"learning_rate": 5.364596188094839e-06,
"loss": 0.0646,
"mean_token_accuracy": 0.9839898198843002,
"num_tokens": 12478621.0,
"step": 3960
},
{
"entropy": 0.11077578589320183,
"epoch": 4.4960362400906,
"grad_norm": 1.8867307901382446,
"learning_rate": 5.265788928397172e-06,
"loss": 0.0737,
"mean_token_accuracy": 0.981790030002594,
"num_tokens": 12506545.0,
"step": 3970
},
{
"entropy": 0.10919649675488471,
"epoch": 4.507361268403171,
"grad_norm": 1.829324722290039,
"learning_rate": 5.16779286482304e-06,
"loss": 0.0693,
"mean_token_accuracy": 0.9827600866556168,
"num_tokens": 12536367.0,
"step": 3980
},
{
"entropy": 0.10490477122366429,
"epoch": 4.5186862967157415,
"grad_norm": 1.3586397171020508,
"learning_rate": 5.070612025606799e-06,
"loss": 0.0689,
"mean_token_accuracy": 0.9822495877742767,
"num_tokens": 12567694.0,
"step": 3990
},
{
"entropy": 0.10469603314995765,
"epoch": 4.530011325028313,
"grad_norm": 2.1284432411193848,
"learning_rate": 4.974250405472109e-06,
"loss": 0.0655,
"mean_token_accuracy": 0.9836343795061111,
"num_tokens": 12599121.0,
"step": 4000
},
{
"epoch": 4.530011325028313,
"eval_entropy": 0.14545185185635268,
"eval_loss": 0.33652952313423157,
"eval_mean_token_accuracy": 0.947298993815237,
"eval_num_tokens": 12599121.0,
"eval_runtime": 22.7646,
"eval_samples_per_second": 46.959,
"eval_steps_per_second": 5.886,
"step": 4000
},
{
"entropy": 0.09632386714220047,
"epoch": 4.541336353340883,
"grad_norm": 1.4618823528289795,
"learning_rate": 4.87871196546775e-06,
"loss": 0.0634,
"mean_token_accuracy": 0.9848664104938507,
"num_tokens": 12633049.0,
"step": 4010
},
{
"entropy": 0.09856323637068272,
"epoch": 4.552661381653454,
"grad_norm": 1.6553484201431274,
"learning_rate": 4.784000632804797e-06,
"loss": 0.0653,
"mean_token_accuracy": 0.9834588289260864,
"num_tokens": 12667038.0,
"step": 4020
},
{
"entropy": 0.1045151598751545,
"epoch": 4.563986409966025,
"grad_norm": 1.4048621654510498,
"learning_rate": 4.690120300695175e-06,
"loss": 0.0669,
"mean_token_accuracy": 0.9840848416090011,
"num_tokens": 12698493.0,
"step": 4030
},
{
"entropy": 0.09922835528850556,
"epoch": 4.575311438278596,
"grad_norm": 1.3776944875717163,
"learning_rate": 4.597074828191633e-06,
"loss": 0.0649,
"mean_token_accuracy": 0.9843086212873459,
"num_tokens": 12731001.0,
"step": 4040
},
{
"entropy": 0.10786725878715515,
"epoch": 4.586636466591166,
"grad_norm": 1.3582957983016968,
"learning_rate": 4.504868040029112e-06,
"loss": 0.0704,
"mean_token_accuracy": 0.9824901282787323,
"num_tokens": 12761747.0,
"step": 4050
},
{
"entropy": 0.10388052202761174,
"epoch": 4.597961494903737,
"grad_norm": 1.312872052192688,
"learning_rate": 4.4135037264675244e-06,
"loss": 0.0665,
"mean_token_accuracy": 0.9838407725095749,
"num_tokens": 12791779.0,
"step": 4060
},
{
"entropy": 0.0987605456262827,
"epoch": 4.6092865232163085,
"grad_norm": 1.818518042564392,
"learning_rate": 4.322985643135952e-06,
"loss": 0.0655,
"mean_token_accuracy": 0.9834914088249207,
"num_tokens": 12823785.0,
"step": 4070
},
{
"entropy": 0.09871327206492424,
"epoch": 4.620611551528879,
"grad_norm": 1.7639421224594116,
"learning_rate": 4.233317510878271e-06,
"loss": 0.0631,
"mean_token_accuracy": 0.9850008130073548,
"num_tokens": 12856087.0,
"step": 4080
},
{
"entropy": 0.11288162134587765,
"epoch": 4.631936579841449,
"grad_norm": 1.7146990299224854,
"learning_rate": 4.1445030156001765e-06,
"loss": 0.0714,
"mean_token_accuracy": 0.9824142724275589,
"num_tokens": 12885222.0,
"step": 4090
},
{
"entropy": 0.11302966885268688,
"epoch": 4.64326160815402,
"grad_norm": 1.8406291007995605,
"learning_rate": 4.0565458081177174e-06,
"loss": 0.0713,
"mean_token_accuracy": 0.9816466063261032,
"num_tokens": 12915084.0,
"step": 4100
},
{
"entropy": 0.10371216647326946,
"epoch": 4.6545866364665915,
"grad_norm": 1.4874554872512817,
"learning_rate": 3.969449504007189e-06,
"loss": 0.0655,
"mean_token_accuracy": 0.9841427594423294,
"num_tokens": 12946310.0,
"step": 4110
},
{
"entropy": 0.09823175929486752,
"epoch": 4.665911664779162,
"grad_norm": 1.4839348793029785,
"learning_rate": 3.883217683456522e-06,
"loss": 0.0624,
"mean_token_accuracy": 0.984762093424797,
"num_tokens": 12979829.0,
"step": 4120
},
{
"entropy": 0.09992509298026561,
"epoch": 4.677236693091733,
"grad_norm": 1.2307822704315186,
"learning_rate": 3.7978538911181054e-06,
"loss": 0.065,
"mean_token_accuracy": 0.9842061281204224,
"num_tokens": 13011881.0,
"step": 4130
},
{
"entropy": 0.10551130101084709,
"epoch": 4.688561721404303,
"grad_norm": 1.4577821493148804,
"learning_rate": 3.7133616359630974e-06,
"loss": 0.0658,
"mean_token_accuracy": 0.9835035592317581,
"num_tokens": 13042506.0,
"step": 4140
},
{
"entropy": 0.10092180483043194,
"epoch": 4.699886749716875,
"grad_norm": 1.5127148628234863,
"learning_rate": 3.6297443911371744e-06,
"loss": 0.0687,
"mean_token_accuracy": 0.9826838493347168,
"num_tokens": 13074275.0,
"step": 4150
},
{
"entropy": 0.1087297696620226,
"epoch": 4.711211778029445,
"grad_norm": 1.520949125289917,
"learning_rate": 3.547005593817765e-06,
"loss": 0.0691,
"mean_token_accuracy": 0.9830662041902543,
"num_tokens": 13104642.0,
"step": 4160
},
{
"entropy": 0.10933543741703033,
"epoch": 4.722536806342016,
"grad_norm": 1.3643742799758911,
"learning_rate": 3.4651486450727633e-06,
"loss": 0.0676,
"mean_token_accuracy": 0.9833125978708267,
"num_tokens": 13134253.0,
"step": 4170
},
{
"entropy": 0.11058039665222168,
"epoch": 4.733861834654586,
"grad_norm": 1.106510043144226,
"learning_rate": 3.384176909720718e-06,
"loss": 0.0668,
"mean_token_accuracy": 0.9826488733291626,
"num_tokens": 13165807.0,
"step": 4180
},
{
"entropy": 0.10953018330037594,
"epoch": 4.745186862967158,
"grad_norm": 1.3522114753723145,
"learning_rate": 3.304093716192527e-06,
"loss": 0.0675,
"mean_token_accuracy": 0.9823530614376068,
"num_tokens": 13194564.0,
"step": 4190
},
{
"entropy": 0.1017619101330638,
"epoch": 4.756511891279728,
"grad_norm": 1.3031588792800903,
"learning_rate": 3.22490235639461e-06,
"loss": 0.0655,
"mean_token_accuracy": 0.9847893178462982,
"num_tokens": 13227659.0,
"step": 4200
},
{
"entropy": 0.11424098871648311,
"epoch": 4.767836919592299,
"grad_norm": 1.2343190908432007,
"learning_rate": 3.146606085573603e-06,
"loss": 0.0712,
"mean_token_accuracy": 0.9818738162517547,
"num_tokens": 13255783.0,
"step": 4210
},
{
"entropy": 0.1053063541650772,
"epoch": 4.7791619479048695,
"grad_norm": 1.8224282264709473,
"learning_rate": 3.0692081221825235e-06,
"loss": 0.0655,
"mean_token_accuracy": 0.9835486084222793,
"num_tokens": 13286564.0,
"step": 4220
},
{
"entropy": 0.10083051659166813,
"epoch": 4.790486976217441,
"grad_norm": 1.631349802017212,
"learning_rate": 2.992711647748503e-06,
"loss": 0.0626,
"mean_token_accuracy": 0.9853236883878708,
"num_tokens": 13319319.0,
"step": 4230
},
{
"entropy": 0.10127135142683982,
"epoch": 4.801812004530011,
"grad_norm": 1.8996741771697998,
"learning_rate": 2.917119806741991e-06,
"loss": 0.0642,
"mean_token_accuracy": 0.9843530178070068,
"num_tokens": 13351309.0,
"step": 4240
},
{
"entropy": 0.10405859649181366,
"epoch": 4.813137032842582,
"grad_norm": 1.651363730430603,
"learning_rate": 2.842435706447499e-06,
"loss": 0.0676,
"mean_token_accuracy": 0.9830576866865158,
"num_tokens": 13381712.0,
"step": 4250
},
{
"entropy": 0.10092639103531838,
"epoch": 4.8244620611551525,
"grad_norm": 1.4375232458114624,
"learning_rate": 2.7686624168358766e-06,
"loss": 0.0674,
"mean_token_accuracy": 0.9841265827417374,
"num_tokens": 13413182.0,
"step": 4260
},
{
"entropy": 0.10942405574023724,
"epoch": 4.835787089467724,
"grad_norm": 1.705937385559082,
"learning_rate": 2.6958029704380995e-06,
"loss": 0.0691,
"mean_token_accuracy": 0.9832047790288925,
"num_tokens": 13443178.0,
"step": 4270
},
{
"entropy": 0.10540841370821,
"epoch": 4.847112117780295,
"grad_norm": 1.5828317403793335,
"learning_rate": 2.6238603622206427e-06,
"loss": 0.0681,
"mean_token_accuracy": 0.9836997509002685,
"num_tokens": 13472760.0,
"step": 4280
},
{
"entropy": 0.10759205557405949,
"epoch": 4.858437146092865,
"grad_norm": 2.081759452819824,
"learning_rate": 2.5528375494623446e-06,
"loss": 0.0691,
"mean_token_accuracy": 0.9837682038545609,
"num_tokens": 13502402.0,
"step": 4290
},
{
"entropy": 0.10228043347597122,
"epoch": 4.869762174405436,
"grad_norm": 1.5169647932052612,
"learning_rate": 2.482737451632877e-06,
"loss": 0.0656,
"mean_token_accuracy": 0.9840810686349869,
"num_tokens": 13533069.0,
"step": 4300
},
{
"entropy": 0.10236416049301625,
"epoch": 4.881087202718007,
"grad_norm": 1.245417833328247,
"learning_rate": 2.4135629502726825e-06,
"loss": 0.0656,
"mean_token_accuracy": 0.9837470918893814,
"num_tokens": 13564257.0,
"step": 4310
},
{
"entropy": 0.10065983273088933,
"epoch": 4.892412231030578,
"grad_norm": 1.7363886833190918,
"learning_rate": 2.34531688887458e-06,
"loss": 0.0646,
"mean_token_accuracy": 0.9847758024930954,
"num_tokens": 13594944.0,
"step": 4320
},
{
"entropy": 0.09625008963048458,
"epoch": 4.903737259343148,
"grad_norm": 1.4529474973678589,
"learning_rate": 2.2780020727668523e-06,
"loss": 0.063,
"mean_token_accuracy": 0.9846002101898194,
"num_tokens": 13626469.0,
"step": 4330
},
{
"entropy": 0.10502854771912098,
"epoch": 4.9150622876557195,
"grad_norm": 1.4248895645141602,
"learning_rate": 2.211621268997932e-06,
"loss": 0.0665,
"mean_token_accuracy": 0.9832880735397339,
"num_tokens": 13655878.0,
"step": 4340
},
{
"entropy": 0.09599109441041946,
"epoch": 4.92638731596829,
"grad_norm": 1.3314168453216553,
"learning_rate": 2.1461772062226744e-06,
"loss": 0.0618,
"mean_token_accuracy": 0.9860332638025284,
"num_tokens": 13689056.0,
"step": 4350
},
{
"entropy": 0.10332580804824829,
"epoch": 4.937712344280861,
"grad_norm": 2.152930498123169,
"learning_rate": 2.0816725745901598e-06,
"loss": 0.0649,
"mean_token_accuracy": 0.983710652589798,
"num_tokens": 13720715.0,
"step": 4360
},
{
"entropy": 0.10360825695097446,
"epoch": 4.949037372593431,
"grad_norm": 2.080280065536499,
"learning_rate": 2.01811002563316e-06,
"loss": 0.0649,
"mean_token_accuracy": 0.9848357111215591,
"num_tokens": 13751367.0,
"step": 4370
},
{
"entropy": 0.10016910135746002,
"epoch": 4.9603624009060026,
"grad_norm": 1.3621736764907837,
"learning_rate": 1.955492172159096e-06,
"loss": 0.0641,
"mean_token_accuracy": 0.9845564514398575,
"num_tokens": 13782327.0,
"step": 4380
},
{
"entropy": 0.09491119757294655,
"epoch": 4.971687429218573,
"grad_norm": 1.176149845123291,
"learning_rate": 1.8938215881426773e-06,
"loss": 0.0618,
"mean_token_accuracy": 0.9852659702301025,
"num_tokens": 13814598.0,
"step": 4390
},
{
"entropy": 0.09827233254909515,
"epoch": 4.983012457531144,
"grad_norm": 1.7675400972366333,
"learning_rate": 1.8331008086200557e-06,
"loss": 0.0599,
"mean_token_accuracy": 0.9853399336338043,
"num_tokens": 13847789.0,
"step": 4400
},
{
"entropy": 0.10404907390475274,
"epoch": 4.994337485843714,
"grad_norm": 1.3572454452514648,
"learning_rate": 1.7733323295846483e-06,
"loss": 0.0694,
"mean_token_accuracy": 0.9828858345746994,
"num_tokens": 13878498.0,
"step": 4410
},
{
"entropy": 0.08685027938336135,
"epoch": 5.005662514156286,
"grad_norm": 0.5327439904212952,
"learning_rate": 1.714518607884541e-06,
"loss": 0.0552,
"mean_token_accuracy": 0.9876868307590485,
"num_tokens": 13909014.0,
"step": 4420
},
{
"entropy": 0.08384513482451439,
"epoch": 5.016987542468856,
"grad_norm": 0.6326966285705566,
"learning_rate": 1.6566620611214722e-06,
"loss": 0.0448,
"mean_token_accuracy": 0.9899709314107895,
"num_tokens": 13940990.0,
"step": 4430
},
{
"entropy": 0.08707914762198925,
"epoch": 5.028312570781427,
"grad_norm": 0.6900177001953125,
"learning_rate": 1.5997650675514703e-06,
"loss": 0.0475,
"mean_token_accuracy": 0.9897072285413742,
"num_tokens": 13971095.0,
"step": 4440
},
{
"entropy": 0.0787117276340723,
"epoch": 5.0396375990939974,
"grad_norm": 0.6966872811317444,
"learning_rate": 1.5438299659870897e-06,
"loss": 0.0415,
"mean_token_accuracy": 0.9909714490175248,
"num_tokens": 14006355.0,
"step": 4450
},
{
"entropy": 0.0797785248607397,
"epoch": 5.050962627406569,
"grad_norm": 0.8368032574653625,
"learning_rate": 1.4888590557012722e-06,
"loss": 0.0425,
"mean_token_accuracy": 0.9908244729042053,
"num_tokens": 14039617.0,
"step": 4460
},
{
"entropy": 0.07591100987046957,
"epoch": 5.062287655719139,
"grad_norm": 0.6300851702690125,
"learning_rate": 1.4348545963328326e-06,
"loss": 0.0425,
"mean_token_accuracy": 0.9909938126802444,
"num_tokens": 14073268.0,
"step": 4470
},
{
"entropy": 0.08444061297923326,
"epoch": 5.07361268403171,
"grad_norm": 0.9781507253646851,
"learning_rate": 1.3818188077935807e-06,
"loss": 0.0479,
"mean_token_accuracy": 0.988967090845108,
"num_tokens": 14104311.0,
"step": 4480
},
{
"entropy": 0.08250515647232533,
"epoch": 5.0849377123442805,
"grad_norm": 0.8466204404830933,
"learning_rate": 1.3297538701770501e-06,
"loss": 0.0448,
"mean_token_accuracy": 0.9896146982908249,
"num_tokens": 14134612.0,
"step": 4490
},
{
"entropy": 0.07659826464951039,
"epoch": 5.096262740656852,
"grad_norm": 0.804344117641449,
"learning_rate": 1.2786619236689002e-06,
"loss": 0.0441,
"mean_token_accuracy": 0.9899546831846238,
"num_tokens": 14167795.0,
"step": 4500
},
{
"epoch": 5.096262740656852,
"eval_entropy": 0.1306759690893675,
"eval_loss": 0.3403843939304352,
"eval_mean_token_accuracy": 0.9495465515264824,
"eval_num_tokens": 14167795.0,
"eval_runtime": 22.7882,
"eval_samples_per_second": 46.91,
"eval_steps_per_second": 5.88,
"step": 4500
},
{
"entropy": 0.07452163267880678,
"epoch": 5.107587768969422,
"grad_norm": 0.6283188462257385,
"learning_rate": 1.2285450684589444e-06,
"loss": 0.0425,
"mean_token_accuracy": 0.9906815826892853,
"num_tokens": 14201908.0,
"step": 4510
},
{
"entropy": 0.07540215756744147,
"epoch": 5.118912797281993,
"grad_norm": 0.8341459035873413,
"learning_rate": 1.1794053646548038e-06,
"loss": 0.0424,
"mean_token_accuracy": 0.9904711753129959,
"num_tokens": 14235583.0,
"step": 4520
},
{
"entropy": 0.07949476931244134,
"epoch": 5.130237825594564,
"grad_norm": 0.7760495543479919,
"learning_rate": 1.1312448321972313e-06,
"loss": 0.0459,
"mean_token_accuracy": 0.9900894790887833,
"num_tokens": 14266482.0,
"step": 4530
},
{
"entropy": 0.07272812370210886,
"epoch": 5.141562853907135,
"grad_norm": 0.8201942443847656,
"learning_rate": 1.0840654507770915e-06,
"loss": 0.0422,
"mean_token_accuracy": 0.9906651735305786,
"num_tokens": 14300188.0,
"step": 4540
},
{
"entropy": 0.08043113667517901,
"epoch": 5.152887882219706,
"grad_norm": 1.054805040359497,
"learning_rate": 1.0378691597539598e-06,
"loss": 0.045,
"mean_token_accuracy": 0.9899501115083694,
"num_tokens": 14330803.0,
"step": 4550
},
{
"entropy": 0.08116988949477673,
"epoch": 5.164212910532276,
"grad_norm": 0.7149173021316528,
"learning_rate": 9.926578580764234e-07,
"loss": 0.0465,
"mean_token_accuracy": 0.9893332690000534,
"num_tokens": 14362375.0,
"step": 4560
},
{
"entropy": 0.07761353813111782,
"epoch": 5.1755379388448475,
"grad_norm": 0.6609951257705688,
"learning_rate": 9.484334042040161e-07,
"loss": 0.0443,
"mean_token_accuracy": 0.9897470384836197,
"num_tokens": 14394138.0,
"step": 4570
},
{
"entropy": 0.08467995468527079,
"epoch": 5.186862967157418,
"grad_norm": 0.9895986318588257,
"learning_rate": 9.051976160308173e-07,
"loss": 0.0492,
"mean_token_accuracy": 0.9889312475919724,
"num_tokens": 14422968.0,
"step": 4580
},
{
"entropy": 0.07399331219494343,
"epoch": 5.198187995469989,
"grad_norm": 0.5765529870986938,
"learning_rate": 8.629522708107352e-07,
"loss": 0.0427,
"mean_token_accuracy": 0.9902592331171036,
"num_tokens": 14456688.0,
"step": 4590
},
{
"entropy": 0.07157274391502141,
"epoch": 5.209513023782559,
"grad_norm": 0.6822951436042786,
"learning_rate": 8.21699105084453e-07,
"loss": 0.0399,
"mean_token_accuracy": 0.9916774541139602,
"num_tokens": 14491708.0,
"step": 4600
},
{
"entropy": 0.08627440240234137,
"epoch": 5.2208380520951305,
"grad_norm": 1.024498701095581,
"learning_rate": 7.814398146080315e-07,
"loss": 0.0508,
"mean_token_accuracy": 0.9888632267713546,
"num_tokens": 14519565.0,
"step": 4610
},
{
"entropy": 0.0863100489601493,
"epoch": 5.232163080407701,
"grad_norm": 1.332257866859436,
"learning_rate": 7.421760542832223e-07,
"loss": 0.0497,
"mean_token_accuracy": 0.9890684664249421,
"num_tokens": 14548908.0,
"step": 4620
},
{
"entropy": 0.07913930304348468,
"epoch": 5.243488108720272,
"grad_norm": 0.6243993043899536,
"learning_rate": 7.039094380894201e-07,
"loss": 0.0459,
"mean_token_accuracy": 0.9895853340625763,
"num_tokens": 14580672.0,
"step": 4630
},
{
"entropy": 0.0805983567610383,
"epoch": 5.254813137032842,
"grad_norm": 1.0783846378326416,
"learning_rate": 6.66641539017343e-07,
"loss": 0.0466,
"mean_token_accuracy": 0.9895779520273209,
"num_tokens": 14610845.0,
"step": 4640
},
{
"entropy": 0.07961287405341863,
"epoch": 5.266138165345414,
"grad_norm": 0.718805730342865,
"learning_rate": 6.303738890043486e-07,
"loss": 0.0461,
"mean_token_accuracy": 0.9900570958852768,
"num_tokens": 14641210.0,
"step": 4650
},
{
"entropy": 0.07430734038352967,
"epoch": 5.277463193657984,
"grad_norm": 0.6966316103935242,
"learning_rate": 5.951079788714813e-07,
"loss": 0.0429,
"mean_token_accuracy": 0.9904509067535401,
"num_tokens": 14674054.0,
"step": 4660
},
{
"entropy": 0.07951049022376537,
"epoch": 5.288788221970555,
"grad_norm": 0.772232174873352,
"learning_rate": 5.608452582621771e-07,
"loss": 0.0461,
"mean_token_accuracy": 0.9892933934926986,
"num_tokens": 14705152.0,
"step": 4670
},
{
"entropy": 0.08211956918239594,
"epoch": 5.300113250283125,
"grad_norm": 0.787604033946991,
"learning_rate": 5.275871355826828e-07,
"loss": 0.0469,
"mean_token_accuracy": 0.9894729733467102,
"num_tokens": 14734245.0,
"step": 4680
},
{
"entropy": 0.07479157857596874,
"epoch": 5.311438278595697,
"grad_norm": 0.6638301014900208,
"learning_rate": 4.953349779441619e-07,
"loss": 0.0438,
"mean_token_accuracy": 0.9901529848575592,
"num_tokens": 14766102.0,
"step": 4690
},
{
"entropy": 0.0770484933629632,
"epoch": 5.322763306908267,
"grad_norm": 0.6229594945907593,
"learning_rate": 4.6409011110648824e-07,
"loss": 0.0446,
"mean_token_accuracy": 0.9904583126306534,
"num_tokens": 14797209.0,
"step": 4700
},
{
"entropy": 0.07668529693037271,
"epoch": 5.334088335220838,
"grad_norm": 0.8131431937217712,
"learning_rate": 4.338538194237657e-07,
"loss": 0.0441,
"mean_token_accuracy": 0.9898177713155747,
"num_tokens": 14829020.0,
"step": 4710
},
{
"entropy": 0.0804054843261838,
"epoch": 5.3454133635334085,
"grad_norm": 1.0860034227371216,
"learning_rate": 4.046273457915084e-07,
"loss": 0.0465,
"mean_token_accuracy": 0.989600470662117,
"num_tokens": 14859012.0,
"step": 4720
},
{
"entropy": 0.07926053572446108,
"epoch": 5.35673839184598,
"grad_norm": 0.7582584023475647,
"learning_rate": 3.7641189159557946e-07,
"loss": 0.0465,
"mean_token_accuracy": 0.9896339029073715,
"num_tokens": 14888725.0,
"step": 4730
},
{
"entropy": 0.07964552585035563,
"epoch": 5.36806342015855,
"grad_norm": 0.9673193097114563,
"learning_rate": 3.4920861666279116e-07,
"loss": 0.0448,
"mean_token_accuracy": 0.9897122889757156,
"num_tokens": 14920550.0,
"step": 4740
},
{
"entropy": 0.0836305595934391,
"epoch": 5.379388448471121,
"grad_norm": 0.9790941476821899,
"learning_rate": 3.2301863921322395e-07,
"loss": 0.0485,
"mean_token_accuracy": 0.9896321624517441,
"num_tokens": 14950387.0,
"step": 4750
},
{
"entropy": 0.07453758120536805,
"epoch": 5.3907134767836915,
"grad_norm": 0.7324961423873901,
"learning_rate": 2.97843035814277e-07,
"loss": 0.0427,
"mean_token_accuracy": 0.9902149707078933,
"num_tokens": 14984224.0,
"step": 4760
},
{
"entropy": 0.07159966733306647,
"epoch": 5.402038505096263,
"grad_norm": 0.5410106182098389,
"learning_rate": 2.7368284133639234e-07,
"loss": 0.0424,
"mean_token_accuracy": 0.9906870782375335,
"num_tokens": 15017637.0,
"step": 4770
},
{
"entropy": 0.07854281235486268,
"epoch": 5.413363533408834,
"grad_norm": 0.5609409809112549,
"learning_rate": 2.505390489105419e-07,
"loss": 0.0439,
"mean_token_accuracy": 0.9902636379003524,
"num_tokens": 15049480.0,
"step": 4780
},
{
"entropy": 0.08379605785012245,
"epoch": 5.424688561721404,
"grad_norm": 0.8252092003822327,
"learning_rate": 2.2841260988738232e-07,
"loss": 0.0483,
"mean_token_accuracy": 0.9893587976694107,
"num_tokens": 15078517.0,
"step": 4790
},
{
"entropy": 0.08447619508951902,
"epoch": 5.436013590033975,
"grad_norm": 1.0486787557601929,
"learning_rate": 2.0730443379815835e-07,
"loss": 0.0486,
"mean_token_accuracy": 0.9892659068107605,
"num_tokens": 15108019.0,
"step": 4800
},
{
"entropy": 0.07685894444584847,
"epoch": 5.447338618346546,
"grad_norm": 0.5011602640151978,
"learning_rate": 1.8721538831731334e-07,
"loss": 0.0434,
"mean_token_accuracy": 0.9901338279247284,
"num_tokens": 15140919.0,
"step": 4810
},
{
"entropy": 0.08181709069758654,
"epoch": 5.458663646659117,
"grad_norm": 0.6206848621368408,
"learning_rate": 1.6814629922682624e-07,
"loss": 0.0473,
"mean_token_accuracy": 0.9891577929258346,
"num_tokens": 15170772.0,
"step": 4820
},
{
"entropy": 0.08480710126459598,
"epoch": 5.469988674971687,
"grad_norm": 0.8081274032592773,
"learning_rate": 1.5009795038225806e-07,
"loss": 0.0488,
"mean_token_accuracy": 0.989034104347229,
"num_tokens": 15199368.0,
"step": 4830
},
{
"entropy": 0.08092885501682759,
"epoch": 5.4813137032842585,
"grad_norm": 1.0287882089614868,
"learning_rate": 1.3307108368054155e-07,
"loss": 0.0472,
"mean_token_accuracy": 0.9893998026847839,
"num_tokens": 15228987.0,
"step": 4840
},
{
"entropy": 0.0767004294320941,
"epoch": 5.492638731596829,
"grad_norm": 0.712293267250061,
"learning_rate": 1.1706639902947791e-07,
"loss": 0.0451,
"mean_token_accuracy": 0.9901446044445038,
"num_tokens": 15260156.0,
"step": 4850
},
{
"entropy": 0.08494163490831852,
"epoch": 5.5039637599094,
"grad_norm": 1.0310022830963135,
"learning_rate": 1.0208455431896247e-07,
"loss": 0.0471,
"mean_token_accuracy": 0.9887308567762375,
"num_tokens": 15289396.0,
"step": 4860
},
{
"entropy": 0.07497126292437314,
"epoch": 5.51528878822197,
"grad_norm": 0.7754374146461487,
"learning_rate": 8.812616539395358e-08,
"loss": 0.0428,
"mean_token_accuracy": 0.9900702744722366,
"num_tokens": 15322332.0,
"step": 4870
},
{
"entropy": 0.07183474581688643,
"epoch": 5.5266138165345415,
"grad_norm": 0.8449347615242004,
"learning_rate": 7.519180602915121e-08,
"loss": 0.0424,
"mean_token_accuracy": 0.9906940549612046,
"num_tokens": 15356004.0,
"step": 4880
},
{
"entropy": 0.0772839292883873,
"epoch": 5.537938844847112,
"grad_norm": 0.7060112357139587,
"learning_rate": 6.328200790540472e-08,
"loss": 0.0435,
"mean_token_accuracy": 0.9895416975021363,
"num_tokens": 15388730.0,
"step": 4890
},
{
"entropy": 0.08096686322242022,
"epoch": 5.549263873159683,
"grad_norm": 0.6333571672439575,
"learning_rate": 5.239726058787198e-08,
"loss": 0.0453,
"mean_token_accuracy": 0.9899547636508942,
"num_tokens": 15420424.0,
"step": 4900
},
{
"entropy": 0.0785508582368493,
"epoch": 5.560588901472253,
"grad_norm": 0.7323933839797974,
"learning_rate": 4.253801150587711e-08,
"loss": 0.0464,
"mean_token_accuracy": 0.9895732134580613,
"num_tokens": 15451624.0,
"step": 4910
},
{
"entropy": 0.07821750827133656,
"epoch": 5.571913929784825,
"grad_norm": 0.94260174036026,
"learning_rate": 3.370466593453914e-08,
"loss": 0.0444,
"mean_token_accuracy": 0.989855831861496,
"num_tokens": 15482992.0,
"step": 4920
},
{
"entropy": 0.09697492402046919,
"epoch": 5.583238958097395,
"grad_norm": 0.9922671318054199,
"learning_rate": 2.589758697809086e-08,
"loss": 0.0612,
"mean_token_accuracy": 0.9866875171661377,
"num_tokens": 15513562.0,
"step": 4930
},
{
"entropy": 0.0751221977174282,
"epoch": 5.594563986409966,
"grad_norm": 0.5793846845626831,
"learning_rate": 1.9117095554974097e-08,
"loss": 0.044,
"mean_token_accuracy": 0.9899530529975891,
"num_tokens": 15545720.0,
"step": 4940
},
{
"entropy": 0.07994864359498025,
"epoch": 5.605889014722536,
"grad_norm": 0.6867619752883911,
"learning_rate": 1.3363470384636367e-08,
"loss": 0.0458,
"mean_token_accuracy": 0.9898412346839904,
"num_tokens": 15577126.0,
"step": 4950
},
{
"entropy": 0.0812272697687149,
"epoch": 5.617214043035108,
"grad_norm": 0.8053082227706909,
"learning_rate": 8.636947976065069e-09,
"loss": 0.0473,
"mean_token_accuracy": 0.9890479534864426,
"num_tokens": 15607165.0,
"step": 4960
},
{
"entropy": 0.09132701512426138,
"epoch": 5.628539071347678,
"grad_norm": 0.8204180598258972,
"learning_rate": 4.9377226180924444e-09,
"loss": 0.0562,
"mean_token_accuracy": 0.9869016110897064,
"num_tokens": 15637782.0,
"step": 4970
},
{
"entropy": 0.08176330700516701,
"epoch": 5.639864099660249,
"grad_norm": 1.1686699390411377,
"learning_rate": 2.2659463713770036e-09,
"loss": 0.0464,
"mean_token_accuracy": 0.9895883351564407,
"num_tokens": 15668276.0,
"step": 4980
},
{
"entropy": 0.07596621736884117,
"epoch": 5.65118912797282,
"grad_norm": 0.7290496230125427,
"learning_rate": 6.217290621807204e-10,
"loss": 0.0446,
"mean_token_accuracy": 0.9898297399282455,
"num_tokens": 15700845.0,
"step": 4990
},
{
"entropy": 0.07350182663649321,
"epoch": 5.662514156285391,
"grad_norm": 0.7328366041183472,
"learning_rate": 5.138277833771632e-12,
"loss": 0.0444,
"mean_token_accuracy": 0.9903534233570099,
"num_tokens": 15733342.0,
"step": 5000
},
{
"epoch": 5.662514156285391,
"eval_entropy": 0.12777303962676384,
"eval_loss": 0.3470225930213928,
"eval_mean_token_accuracy": 0.9495740079168064,
"eval_num_tokens": 15733342.0,
"eval_runtime": 22.7824,
"eval_samples_per_second": 46.922,
"eval_steps_per_second": 5.882,
"step": 5000
}
],
"logging_steps": 10,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.869577281465498e+16,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}