Ru Arena General
| model | score | 95% CI | lower | upper | avg_tokens | std_tokens | lc_score |
|---|---|---|---|---|---|---|---|
| DeepSeek - DeepSeek-V3-Chat | 96.3 | +0.7 / -0.8 | 95.51 | 97.02 | 665.97 | 504.83 | 56.62 |
| -chatgpt-4o-latest | 94.75 | +0.8 / -0.7 | 94.03 | 95.53 | 693.15 | 634.2 | 56.4 |
| yi-lightning | 93.46 | +0.9 / -0.9 | 92.51 | 94.33 | 636.68 | 469.74 | 56.22 |
| o1-mini | 93.45 | +0.7 / -0.8 | 92.61 | 94.19 | 791.18 | 647.74 | 56.22 |
| RefalMachine-RuadaptQwen-32B-Pro_v1 | 92.18 | +1.2 / -1.2 | 90.99 | 93.43 | 563.43 | 387.83 | 56.04 |
| claude-3-opus-20240229 | 91.3 | +1.0 / -1.1 | 90.17 | 92.34 | 468.69 | 254.1 | 55.92 |
| gpt-4-1106-preview | 90.89 | +1.1 / -1.1 | 89.78 | 91.97 | 541.66 | 346.59 | 55.86 |
| T-Tech-T-pro-it-1.0 | 90.87 | +1.1 / -1.2 | 89.69 | 92.02 | 502 | 380.68 | 55.85 |
| o1-preview | 90.8 | +0.9 / -1.0 | 89.84 | 91.73 | 664.89 | 601.34 | 55.84 |
| RefalMachine-RuadaptQwen2.5-32B-instruct-v1 | 90.47 | +1.1 / -1.0 | 89.48 | 91.55 | 527.86 | 366.54 | 55.8 |
| SberDevices-GigaChatMaxWithoutFilter | 89.96 | +1.2 / -1.4 | 88.52 | 91.14 | 523.95 | 421.87 | 55.73 |
| DeepSeek, Inc.-DeepSeek-V2-Chat-0628 | 89.67 | +1.3 / -1.1 | 88.61 | 90.96 | 514.79 | 340.79 | 55.68 |
| gemini-1.5-pro-002 | 89.08 | +1.3 / -1.0 | 88.12 | 90.34 | 639.51 | 493.3 | 55.6 |
| gemini-1.5-pro-exp-0801 | 88.89 | +1.0 / -1.0 | 87.93 | 89.85 | 547.91 | 411.58 | 55.57 |
| RefalMachine-RuadaptQwen2.5-14B-Instruct-v1 | 88.63 | +1.4 / -1.0 | 87.59 | 90.04 | 572.22 | 397.38 | 55.54 |
| RefalMachine-RuadaptQwen2.5-7B-Lite-v1 | 88.6 | +1.0 / -1.2 | 87.4 | 89.57 | 580.95 | 402.27 | 55.53 |
| Qwen-Qwen2.5-72B-Instruct | 88.25 | +1.0 / -1.5 | 86.77 | 89.25 | 557.41 | 437.32 | 55.48 |
| claude-3-5-sonnet-20240620 | 88.17 | +1.4 / -1.1 | 87.04 | 89.55 | 387.42 | 248.97 | 55.47 |
| ZeroAgency.ru-Zero-Mistral-Small-24B-Instruct-2501 | 87.43 | +1.4 / -1.2 | 86.25 | 88.85 | 565.19 | 339.27 | 55.37 |
| Vikhrmodels-vikhr-nemo-12b-instruct-r-21-09-24 | 87.32 | +1.1 / -1.4 | 85.87 | 88.45 | 627 | 416.72 | 55.35 |
| Google-gemma-2-27b-it | 86.87 | +1.3 / -1.7 | 85.16 | 88.2 | 472.79 | 336.56 | 55.29 |
| SberDevices-GigaChatMaxDefault | 86.7 | +1.7 / -1.3 | 85.38 | 88.36 | 515.48 | 429.13 | 55.26 |
| IlyaGusev-IlyaGusevvikhr_nemo_orpo_dostoevsky_12b_slerp | 86.64 | +1.2 / -1.3 | 85.33 | 87.81 | 634.57 | 405.24 | 55.25 |
| -Phi-4 | 86.59 | +1.1 / -1.3 | 85.28 | 87.66 | 641.7 | 439.8 | 55.25 |
| IlyaGusev-IlyaGusevsaiga_nemo_12b_sft_m9_d14_simpo_m19_d31 | 85.58 | +1.4 / -1.1 | 84.51 | 86.95 | 649.12 | 450.98 | 55.1 |
| T-Tech-T-lite-it-1.0 | 84.98 | +1.3 / -1.3 | 83.68 | 86.24 | 544.92 | 454.22 | 55.02 |
| claude-3-sonnet-20240229 | 84.27 | +1.5 / -1.5 | 82.77 | 85.81 | 432.07 | 248.67 | 54.92 |
| gpt-4o-mini | 83.9 | +1.7 / -1.6 | 82.26 | 85.63 | 448.12 | 367.72 | 54.86 |
| IlyaGusev-IlyaGusevsaiga_nemo_12b_sft_m9_d14_simpo_m22_d34 | 83.85 | +1.2 / -1.4 | 82.48 | 85.06 | 552.85 | 431.47 | 54.86 |
| llama-3.1-70b-instruct | 83.26 | +1.4 / -1.1 | 82.17 | 84.71 | 537.53 | 428.6 | 54.77 |
| RefalMachine-ruadapt_qwen2.5_7B_ext_u48_instruct_v3 | 81.92 | +1.6 / -1.3 | 80.59 | 83.52 | 555.9 | 383.32 | 54.58 |
| claude-3-haiku-20240307 | 80.74 | +1.5 / -1.1 | 79.62 | 82.24 | 394.8 | 221.79 | 54.41 |
| mistral-large-2407 | 80.39 | +2.1 / -1.5 | 78.87 | 82.52 | 422.42 | 320.72 | 54.36 |
| gemini-1.5-flash-8b-exp-0827 | 80.17 | +1.6 / -1.5 | 78.66 | 81.8 | 596.56 | 527.68 | 54.33 |
| IlyaGusev-saiga-nemo_v3 | 79.89 | +1.8 / -1.3 | 78.55 | 81.71 | 550.91 | 463.81 | 54.29 |
| RefalMachine-ruadapt_qwen2.5_7B_ext_u48_instruct_v1 | 78.67 | +1.3 / -2.0 | 76.67 | 79.94 | 539.24 | 377.67 | 54.12 |
| RefalMachine-ruadapt_qwen2.5_7B_ext_u48_instruct_v2 | 78.56 | +1.2 / -1.8 | 76.72 | 79.73 | 547.74 | 380.85 | 54.1 |
| DeepSeek - DeepSeek-R1-Distill-Qwen-32B--temp-0.6--think-excluded | 77.28 | +1.6 / -2.0 | 75.32 | 78.84 | 452.09 | 402.34 | 53.92 |
| command-r-plus | 77.17 | +1.5 / -1.5 | 75.63 | 78.62 | 560.83 | 424.07 | 53.9 |
| gemma-2-9b-it | 76.5 | +1.3 / -1.2 | 75.25 | 77.8 | 459.15 | 312.59 | 53.81 |
| Qwen-Qwen2.5-7B-Instruct | 76.03 | +1.4 / -1.9 | 74.16 | 77.41 | 484.87 | 383.86 | 53.74 |
| SberDevices-GigaChat-20B-A3B-instruct-v1.5 | 74.43 | +1.6 / -1.7 | 72.76 | 76.04 | 538.06 | 447.81 | 53.51 |
| SberDevices-GigaChat-20B-A3B-instruct-v1.5 | 73.81 | +2.0 / -1.6 | 72.21 | 75.8 | 538.06 | 447.81 | 53.42 |
| gemma-2-9b-it-sppo-iter3 | 73.61 | +1.9 / -2.1 | 71.47 | 75.48 | 509.66 | 336.7 | 53.39 |
| llama-3.1-405b-instruct | 73.39 | +2.0 / -2.1 | 71.32 | 75.35 | 435.44 | 372.72 | 53.36 |
| Attention Signs-Watari-7b-v1 | 69.49 | +1.8 / -1.7 | 67.78 | 71.24 | 616.8 | 449.25 | 52.8 |
| IlyaGusev-IlyaGusevsaiga_llama3_8b_v7 | 67.69 | +1.9 / -2.1 | 65.56 | 69.59 | 503.54 | 360.83 | 52.54 |
| IlyaGusev-saiga_llama3_8b_v7_no_system | 66.14 | +2.1 / -2.0 | 64.09 | 68.26 | 492.28 | 355.62 | 52.32 |
| RefalMachine-ruadapt_qwen2.5_3B_ext_u48_instruct_v4 | 66.1 | +1.7 / -1.9 | 64.15 | 67.77 | 531.37 | 384.03 | 52.32 |
| -gemini-pro | 65.35 | +2.6 / -1.5 | 63.81 | 67.92 | 396.96 | 366.17 | 52.21 |
| t-lite-instruct-0.1 | 64.66 | +1.8 / -2.0 | 62.68 | 66.45 | 810.27 | 445.23 | 52.11 |
| -ruadapt_llama_saiga_kto_ablitirated_ru_arena_hard_rep_pen_1.1 | 63.3 | +1.9 / -2.2 | 61.12 | 65.15 | 512.17 | 366.2 | 51.91 |
| DeepSeek - DeepSeek-R1-Qwen-14b-t0.0 | 63.25 | +1.9 / -2.1 | 61.17 | 65.15 | 446.69 | 569.64 | 51.91 |
| RefalMachine-ruadapt_qwen2.5_3B_ext_u32_instruct_v3 | 62.34 | +2.1 / -2.2 | 60.14 | 64.42 | 517.16 | 330.79 | 51.78 |
| RefalMachine-RuadaptQwen2.5-3B-instruct-v6 | 62.21 | +1.9 / -1.6 | 60.65 | 64.16 | 579.01 | 420.59 | 51.76 |
| DeepSeek - DeepSeek-R1-Qwen-14B-t0.6 | 61.92 | +1.9 / -2.3 | 59.62 | 63.83 | 414.79 | 346.68 | 51.71 |
| -ruadapt_llama3_instruct_lep_saiga_ablitirated_gm_kto | 60.04 | +1.5 / -1.9 | 58.19 | 61.56 | 493.09 | 359.95 | 51.44 |
| -saiga_llama3_8b_recalc_bench_infer | 59.93 | +1.9 / -1.6 | 58.28 | 61.78 | 519.15 | 430.42 | 51.43 |
| Yandex-yandexgpt-4-pro | 59.23 | +2.1 / -1.9 | 57.31 | 61.37 | 383.8 | 306.97 | 51.33 |
| RefalMachine-ruadapt_qwen_2.5_3B_ext_u32_lep_ft_sft_kto_v2 | 58.89 | +1.8 / -1.5 | 57.34 | 60.67 | 541.56 | 376.48 | 51.28 |
| suzume-llama-3-8B-multilingual-orpo-borda-half | 57.13 | +2.1 / -1.8 | 55.32 | 59.26 | 682.81 | 378.3 | 51.03 |
| -ruadapt_qwen_2.5_3B_ext_u32_lep_ft_sft_kto | 56.85 | +1.9 / -2.3 | 54.52 | 58.77 | 536.33 | 387.74 | 50.99 |
| -ruadapt_llama_saiga_kto_ablitirated_ru_arena_hard_external_infer | 56.76 | +1.9 / -2.1 | 54.65 | 58.64 | 526.43 | 385.11 | 50.97 |
| -ruadapt_llama3_extended_gm_ft_v4d1_external_infer | 56.5 | +2.1 / -2.3 | 54.19 | 58.61 | 546.01 | 352.63 | 50.94 |
| -ruadapt_llama3_extended_gm_ft_v5d1_external_infer | 55.7 | +1.5 / -1.8 | 53.91 | 57.22 | 591.59 | 383.26 | 50.82 |
| -ruadapt_llama_instruct_lep_saiga_ablitirated_gm_d1_v6 | 55.56 | +2.2 / -1.7 | 53.82 | 57.76 | 579.29 | 501.64 | 50.8 |
| phi-3-medium-4k-instruct | 55.15 | +1.7 / -2.3 | 52.81 | 56.85 | 566.47 | 485.71 | 50.74 |
| llama-3-sonar-large-32k-online | 54.99 | +1.9 / -2.3 | 52.65 | 56.9 | 419.91 | 369.08 | 50.72 |
| Vikhrmodels-QVikhr-2.5-1.5B-Instruct-SMPO | 53.67 | +2.3 / -1.9 | 51.78 | 55.98 | 535.71 | 390.6 | 50.53 |
| Vikhrmodels-VikhrmodelsQVikhr-2.5-1.5B-Instruct-r | 52.82 | +1.7 / -1.8 | 50.98 | 54.53 | 548.43 | 423.81 | 50.41 |
| -ruadapt_llama3_extended_gm_ft_v4d1 | 52.51 | +1.8 / -1.8 | 50.73 | 54.31 | 689.29 | 732.82 | 50.36 |
| RefalMachine-RefalMachineruadapt_mistral_7b_openchat_extended_lep_ft_external_infer | 51.87 | +1.8 / -2.0 | 49.82 | 53.69 | 484.7 | 372.78 | 50.27 |
| Vikhrmodels-vikhr-2b-grndm | 50.72 | +1.9 / -1.9 | 48.82 | 52.63 | 691.59 | 630.15 | 50.1 |
| google-gemma-2-2b-it | 50.55 | +2.3 / -2.0 | 48.59 | 52.85 | 483.57 | 367.72 | 50.08 |
| mistral-nemo-instruct-2407 | 50.52 | +2.6 / -2.4 | 48.09 | 53.08 | 403.17 | 321.53 | 50.07 |
| MTSAIR-Cotype-Nano-Uncensored | 50.51 | +1.9 / -1.4 | 49.16 | 52.43 | 567.34 | 435.47 | 50.07 |
| sfr-iterative-dpo-llama-3-8b-r | 50.06 | +2.4 / -1.6 | 48.44 | 52.48 | 516.74 | 316.84 | 50.01 |
| gpt-3.5-turbo-0125 | 50 | +0.0 / -0.0 | 50 | 50 | 220.83 | 170.3 | 50 |
| glm-4-9b-chat | 49.75 | +1.6 / -2.1 | 47.65 | 51.39 | 568.81 | 448.76 | 49.96 |
| c4ai-command-r-v01 | 48.95 | +3.0 / -2.1 | 46.81 | 51.98 | 529.34 | 368.98 | 49.85 |
| -ruadapt_llama3_extended_gm_ft_v5d1 | 48.17 | +2.1 / -1.9 | 46.24 | 50.23 | 759.44 | 729.04 | 49.74 |
| -kolibri-mistral-0427-upd | 47.92 | +2.1 / -2.6 | 45.32 | 50.06 | 551.33 | 497.89 | 49.7 |
| -ruadapt_llama3_8b_instruct_extended_lep_ft-external_infer | 47.81 | +2.2 / -2.1 | 45.68 | 49.99 | 465.39 | 429.25 | 49.68 |
| MTSAIR-Cotype-Nano | 47.74 | +1.9 / -1.6 | 46.16 | 49.61 | 542.49 | 409.34 | 49.67 |
| llama-3-instruct-8b-sppo-iter3 | 47.45 | +2.3 / -2.0 | 45.44 | 49.72 | 502.27 | 304.27 | 49.63 |
| -ruadapt_saiga_v7_lep_ft_external_infer | 47.36 | +2.3 / -2.2 | 45.12 | 49.7 | 482.73 | 428.93 | 49.62 |
| Vikhrmodels-Vikhr-Qwen-2.5-1.5b-Instruct | 47.23 | +2.1 / -1.8 | 45.38 | 49.34 | 536.05 | 418.23 | 49.6 |
| MTSAIR-Cotype-Nano-1B | 47.21 | +2.0 / -2.0 | 45.18 | 49.22 | 542.49 | 409.34 | 49.6 |
| -openchat_3.5_0106_external_infer | 47.06 | +2.2 / -1.8 | 45.29 | 49.23 | 430.01 | 302.31 | 49.58 |
| mixtral-8x7b-original | 46.94 | +1.7 / -1.8 | 45.14 | 48.63 | 371.3 | 278.71 | 49.56 |
| -ruadapt_qwen_2.5_3B_ext_u32_lep_ft_sft_v1 | 46.47 | +1.9 / -1.6 | 44.89 | 48.33 | 497.51 | 471.7 | 49.49 |
| Vikhrmodels-vikhr-gemma-2b-it | 45.82 | +2.2 / -2.4 | 43.42 | 48.01 | 722.83 | 710.71 | 49.4 |
| suzume-llama-3-8b-multilingual | 45.71 | +2.3 / -2.6 | 43.13 | 48.01 | 641.18 | 858.96 | 49.38 |
| yandex_gpt_pro | 45.11 | +1.8 / -2.2 | 42.89 | 46.87 | 345.3 | 277.64 | 49.3 |
| attn-signs-zariman-reason-7b-v0 | 44.84 | +1.9 / -2.5 | 42.33 | 46.76 | 1748.19 | 1925.13 | 49.26 |
| Vikhrmodels-Vikhr-Gemma-2B-instruct-v1.0 | 44.1 | +1.9 / -1.9 | 42.16 | 46.04 | 701.48 | 681.22 | 49.15 |
| hermes-2-theta-llama-3-8b | 44.07 | +1.9 / -2.0 | 42.05 | 45.97 | 485.99 | 390.85 | 49.15 |
| gpt-3.5-turbo-1106 | 41.47 | +1.8 / -2.1 | 39.41 | 43.26 | 191.19 | 177.31 | 48.77 |
| RefalMachine-RuadaptQwen2.5-1.5B-instruct-v1 | 41.03 | +2.0 / -1.9 | 39.12 | 43.04 | 445.03 | 391.78 | 48.71 |
| llama-3-smaug-8b | 40.8 | +2.5 / -2.1 | 38.65 | 43.25 | 524.02 | 480.56 | 48.68 |
| RefalMachine-RuadaptQwen-2.5-1.5B-instruct-v2 | 40.31 | +1.6 / -1.9 | 38.4 | 41.95 | 553.42 | 464.04 | 48.61 |
| -ruadapt_llama3_8b_instruct_extended_led_ft | 40.13 | +1.6 / -2.2 | 37.9 | 41.74 | 604.91 | 796.85 | 48.58 |
| llama-3-8b-saiga-suzume-ties | 39.94 | +2.0 / -2.7 | 37.27 | 41.89 | 763.27 | 699.39 | 48.55 |
| starling-lm-7b-beta | 39.76 | +2.0 / -2.2 | 37.56 | 41.75 | 629.68 | 465.08 | 48.53 |
| vikhr-it-5.4-fp16-orpo-v2 | 39.33 | +2.0 / -1.9 | 37.38 | 41.32 | 379.23 | 558.81 | 48.46 |
| saiga_llama3_8b_v6 | 39.17 | +2.1 / -1.6 | 37.57 | 41.23 | 471.51 | 463.62 | 48.44 |
| llama-3-instruct-8b-simpo | 38.01 | +2.0 / -2.1 | 35.9 | 40.05 | 417.5 | 262.37 | 48.28 |
| MTSAIR-Cotype-Nano-4bit | 37.64 | +1.9 / -1.9 | 35.77 | 39.58 | 582.7 | 472.43 | 48.22 |
| qwen2-7b-instruct | 37.53 | +1.9 / -2.1 | 35.46 | 39.4 | 340.65 | 288.17 | 48.21 |
| paralex-llama-3-8b-sft | 37.36 | +2.5 / -2.2 | 35.11 | 39.85 | 688.57 | 632.87 | 48.18 |
| MTSAIR-Cotype-Nano-1B-v2 | 36.77 | +1.8 / -2.1 | 34.66 | 38.53 | 616.55 | 532.28 | 48.1 |
| aya-23-8b | 36.26 | +1.9 / -2.2 | 34.06 | 38.19 | 554.34 | 433.51 | 48.02 |
| meta-llama-3-8b-instruct | 35.06 | +1.7 / -2.0 | 33.11 | 36.79 | 450.85 | 317.66 | 47.85 |
| openchat-3.5-0106 | 33.79 | +2.2 / -2.1 | 31.69 | 36.04 | 492.47 | 690.73 | 47.67 |
| mistral-7b-instruct-v0.3 | 32.92 | +2.1 / -2.1 | 30.83 | 34.99 | 469.38 | 455.43 | 47.54 |
| vikhr-it-5.2-fp16-cp | 31.73 | +2.0 / -1.9 | 29.78 | 33.73 | 543.44 | 441.71 | 47.37 |
| gigachat_pro | 31.37 | +2.2 / -2.0 | 29.41 | 33.53 | 294.33 | 242.61 | 47.32 |
| hermes-2-pro-llama-3-8b | 30.78 | +2.3 / -2.2 | 28.53 | 33.09 | 463.45 | 559.96 | 47.24 |
| openchat-3.6-8b-20240522 | 30.28 | +1.4 / -2.2 | 28.09 | 31.71 | 428.7 | 400.82 | 47.17 |
| vikhr-it-5.3-fp16-32k | 27.81 | +1.7 / -2.3 | 25.46 | 29.46 | 519.71 | 516.09 | 46.81 |
| vikhr-it-5.3-fp16 | 22.73 | +2.1 / -1.8 | 20.95 | 24.83 | 523.45 | 543.91 | 46.08 |
| snorkel-mistral-pairrm-dpo | 22.41 | +1.7 / -1.6 | 20.78 | 24.11 | 773.8 | 950.3 | 46.04 |
| kolibri-vikhr-mistral-0427 | 22.41 | +1.8 / -1.5 | 20.9 | 24.22 | 489.89 | 566.29 | 46.04 |
| storm-7b | 20.62 | +1.8 / -1.4 | 19.22 | 22.41 | 419.32 | 190.85 | 45.78 |
| Vikhrmodels-Vikhr-Llama-3.2-1B-instruct | 19.04 | +1.6 / -1.3 | 17.7 | 20.63 | 958.63 | 1297.33 | 45.56 |
| neural-chat-7b-v3-3 | 19.03 | +1.6 / -1.5 | 17.52 | 20.61 | 927.21 | 1211.62 | 45.55 |
| gigachat_lite | 17.2 | +1.3 / -1.4 | 15.81 | 18.5 | 276.81 | 329.66 | 45.29 |
| Vikhrmodels-Vikhr-Qwen-2.5-0.5b-Instruct | 16.5 | +1.0 / -1.3 | 15.15 | 17.5 | 583.5 | 506.76 | 45.19 |
| Qwen-Qwen2.5-1.5B-Instruct | 16.46 | +1.6 / -1.4 | 15.11 | 18.03 | 483.67 | 674.11 | 45.19 |
| Vikhrmodels-vikhr-qwen-1.5b-it | 13.18 | +1.0 / -1.3 | 11.92 | 14.18 | 2495.38 | 741.45 | 44.72 |
| meta-llama-Llama-3.2-1B-Instruct | 4.04 | +0.6 / -0.7 | 3.3 | 4.68 | 1240.53 | 1783.08 | 43.42 |
| Qwen-Qwen2.5-0.5B-Instruct | 4.02 | +0.7 / -0.6 | 3.39 | 4.76 | 829.87 | 931.51 | 43.42 |
| HuggingFaceTB-SmolLM2-1.7B-instruct | 1.75 | +0.5 / -0.5 | 1.28 | 2.24 | 486.11 | 473.23 | 43.1 |
| HuggingFaceTB-SmolLM2-1.7B-Instruct | 1.72 | +0.6 / -0.4 | 1.29 | 2.27 | 486.11 | 473.23 | 43.1 |
| HuggingFaceTB-SmolLM2-135M-Instruct | 0.64 | +0.3 / -0.2 | 0.4 | 0.92 | 524.63 | 589.35 | 42.94 |
| HuggingFaceTB-SmolLM2-135M-instruct | 0.55 | +0.3 / -0.2 | 0.34 | 0.83 | 524.63 | 589.35 | 42.93 |
| HuggingFaceTB-SmolLM2-360M-Instruct | 0.15 | +0.2 / -0.1 | 0.02 | 0.31 | 312.11 | 481.48 | 42.87 |
What is this?
Ru Arena Hard
This is a tool for automatically evaluating models in Russian with a strong judge LLM (GPT-4-1106-preview). It uses an ELO-style rating system.
- It is built on a fixed set of 500 prompts split across 50 topics. Each model produces its own answer to every prompt, which is then compared against the baseline model's (gpt-3.5-turbo-0125) answers to the same prompts.
The key features that distinguish Arena-Hard-Auto from a regular side-by-side (SBS) evaluation are:
- When answers are compared, three main outcomes are considered: >> (much better), > (better), and = (roughly equal); when one answer is much better than the other, the verdict's weight is tripled.
- To remove positional bias in the judge model's prompt, every comparison is run twice (the models' answers are swapped in the prompt).
- Comparison results are bootstrapped to obtain confidence intervals.
- An ELO-style rating system is used, with win rates predicted by a Bradley–Terry model (a toy sketch of this aggregation step follows this list).
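As a rough illustration of the last two points (not the repository's actual code), the sketch below bootstraps a confidence interval for the win rate against the fixed baseline; the per-battle outcome encoding and the tally at the bottom are assumptions made for the example:

```python
import numpy as np

def winrate_ci(outcomes, n_boot=1000, alpha=0.05, seed=0):
    """Bootstrap a confidence interval for the win rate against the baseline.

    `outcomes` uses an assumed encoding: 1.0 = win, 0.0 = loss, 0.5 = tie.
    "Much better" verdicts are simply repeated three times, mirroring the
    tripled verdict weight described above. Against a single fixed baseline,
    the Bradley-Terry estimate reduces to this weighted win share.
    """
    rng = np.random.default_rng(seed)
    outcomes = np.asarray(outcomes, dtype=float)
    boots = [
        rng.choice(outcomes, size=len(outcomes), replace=True).mean()
        for _ in range(n_boot)
    ]
    lower, upper = np.quantile(boots, [alpha / 2, 1 - alpha / 2])
    return outcomes.mean(), float(lower), float(upper)

# Hypothetical tally: 320 wins, 120 losses, 60 ties vs gpt-3.5-turbo-0125.
outcomes = [1.0] * 320 + [0.0] * 120 + [0.5] * 60
mean, lo, hi = winrate_ci(outcomes)
print(f"win rate ~ {mean:.1%}, 95% CI [{lo:.1%}, {hi:.1%}]")
```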
Unlike the original Arena-Hard-Auto, this version contains several changes:
- The judge model's prompt has been modified so that models are also compared on their command of Russian; the prompt itself is located in config/judge_config.yaml.
- A length-control option has been added that penalizes answers that are much longer than the baseline's (experimental; an illustrative sketch follows this list).
- gpt-3.5-turbo-0125 is used as the baseline instead of GPT-4, because models are less capable in Russian than in English.
- Generation support for gigachat and yandexgpt has been added.
- Several bugs in the original implementation have been fixed.
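The exact length-control formula behind the lc_score column is not specified here, so the snippet below is only an illustrative stand-in, not the benchmark's formula: it shrinks a score toward the 50% baseline level as a model's answers grow longer than the baseline's.

```python
import math

# Purely illustrative length penalty (an assumption, not the benchmark's formula):
# pull the score toward 50 (the baseline's win rate against itself) as the
# model's average answer length grows past the baseline's average length.
def length_controlled_score(score, avg_tokens, baseline_tokens=220.83, strength=0.2):
    ratio = max(avg_tokens / baseline_tokens, 1.0)  # only penalize longer answers
    penalty = 1.0 / (1.0 + strength * math.log(ratio))
    return 50.0 + (score - 50.0) * penalty

# Hypothetical example: a model scoring 90 with ~660-token answers,
# roughly 3x the baseline's average of ~221 tokens (see the table above).
print(round(length_controlled_score(90.0, 660.0), 2))
```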
Reproducibility
Colab - https://colab.research.google.com/drive/1w8f2kN8-JWJ_JjLvgEZAt7UDGpwOoEfy?usp=sharing
✉️✨ Submit your model here!