Compare commits


5 Commits
b6065 ... b6070

Author SHA1 Message Date
Georgi Gerganov
15e92fd337 cuda, sycl : fix batched gemm when ne02 == 1 && ne03 > 1 (#15038)
* cuda, sycl : fix batched gemm when ne02 == 1 && ne03 > 1

ggml-ci

* cont : fix cont types

ggml-ci

* cont : adopt variable names and comment from the other branch
2025-08-02 17:13:05 +03:00
Sigbjørn Skjæret
2bf3fbf0b5 ci : check that pre-tokenizer hashes are up-to-date (#15032)
* torch is not required for convert_hf_to_gguf_update

* add --check-missing parameter

* check that pre-tokenizer hashes are up-to-date
2025-08-02 14:39:01 +02:00
Douglas Hanley
711d5e6fe6 convert : fix Qwen3-Embedding pre-tokenizer hash (#15030)
2025-08-02 12:51:02 +02:00
Jhen-Jie Hong
f738989dcb chat : fix multiple tool_calls on hermes-2-pro (#14962) 2025-08-02 18:04:48 +08:00
Jeff Bolz
4cb208c93c vulkan: coopmat2 mul_mat optimizations (#14934)
- Increase tile size for k-quants, to match non-k-quants
- Choose more carefully between large and medium tiles, considering how it
  interacts with split_k
- Allow larger/non-power of two split_k, and make the splits a multiple of 256
- Use split_k==3 when >1/2 and <=2/3 of the SMs would have been used
2025-08-02 11:21:37 +02:00
9 changed files with 183 additions and 47 deletions

View File

@@ -0,0 +1,45 @@
name: Check Pre-Tokenizer Hashes

on:
  push:
    paths:
      - 'convert_hf_to_gguf.py'
      - 'convert_hf_to_gguf_update.py'
  pull_request:
    paths:
      - 'convert_hf_to_gguf.py'
      - 'convert_hf_to_gguf_update.py'

jobs:
  pre-tokenizer-hashes:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install Python dependencies
        run: |
          python3 -m venv .venv
          .venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt

      - name: Update pre-tokenizer hashes
        run: |
          cp convert_hf_to_gguf.py /tmp
          .venv/bin/python convert_hf_to_gguf_update.py --check-missing

      - name: Check if committed pre-tokenizer hashes matches generated version
        run: |
          if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
            echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
            echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
            echo "Differences found:"
            diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
            exit 1
          fi

          echo "Model pre-tokenizer hashes are up to date."

View File

@@ -1646,7 +1646,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
"|<function name=\"([^\"]+)\">" // match 5 (function name again)
);
if (auto res = builder.try_find_regex(open_regex)) {
while (auto res = builder.try_find_regex(open_regex)) {
const auto & block_start = res->groups[1];
std::string block_end = block_start.empty() ? "" : "```";
@@ -1668,7 +1668,6 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
builder.consume_literal(block_end);
builder.consume_spaces();
}
builder.add_content(builder.consume_rest());
} else {
throw common_chat_msg_partial_exception("failed to parse tool call");
}
@@ -1693,11 +1692,10 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
builder.consume_spaces();
}
}
builder.add_content(builder.consume_rest());
}
} else {
builder.add_content(builder.consume_rest());
}
builder.add_content(builder.consume_rest());
}
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
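The hunks above turn the single if (auto res = builder.try_find_regex(open_regex)) into a while loop, so each <tool_call> or <function=...> block is consumed in turn, and the per-branch builder.add_content(builder.consume_rest()) calls are consolidated into one call after the loop. A minimal standalone sketch of that pattern, not the actual common_chat_msg_parser API, just plain std::regex with hypothetical stand-in types:

#include <regex>
#include <string>
#include <vector>

// hypothetical stand-in for the parser's tool-call record
struct tool_call { std::string body; };

static void parse_tool_calls(const std::string & input,
                             std::vector<tool_call> & calls,
                             std::string & content) {
    static const std::regex open_re("<tool_call>([\\s\\S]*?)</tool_call>");
    auto it  = std::sregex_iterator(input.begin(), input.end(), open_re);
    auto end = std::sregex_iterator();
    std::size_t pos = 0;
    for (; it != end; ++it) {                 // before the fix only the first block was handled
        calls.push_back({ (*it)[1].str() });
        pos = it->position() + it->length();  // remember where the last block ended
    }
    content = input.substr(pos);              // consume_rest(): leftover text is added once, after the loop
}

The test additions at the bottom of this compare exercise exactly this case: two tool-call blocks in a single assistant response.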

View File

@@ -702,6 +702,9 @@ class TextModel(ModelBase):
if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
# ref: https://huggingface.co/moonshotai/Kimi-K2-Base
res = "kimi-k2"
if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
# ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
res = "qwen2"
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
res = "llama-bpe"
@@ -849,9 +852,6 @@ class TextModel(ModelBase):
if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
# ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
res = "exaone4"
if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
# ref: https://huggingface.co/Qwen/Qwen3-Embedding-8B
res = "qwen2"
if res is None:
logger.warning("\n")

View File

@@ -59,6 +59,10 @@ parser.add_argument(
"--full", action="store_true",
help="download full list of models - make sure you have access to all of them",
)
parser.add_argument(
"--check-missing", action="store_true",
help="only check for missing pre-tokenizer hashes",
)
parser.add_argument(
"hf_token",
help="optional HF token",
@@ -70,6 +74,10 @@ hf_token = args.hf_token if args.hf_token is not None else hf_token
if hf_token is None:
logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
if args.check_missing and args.full:
logger.warning("Downloading full list of models requested, ignoring --check-missing!")
args.check_missing = False
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
@@ -147,6 +155,7 @@ pre_computed_hashes = [
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
{"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
]
@@ -221,12 +230,13 @@ if not args.full:
all_models = models.copy()
models = [model for model in all_models if model["name"] not in existing_models]
logging.info(f"Downloading {len(models)} models...")
for model in models:
try:
download_model(model)
except Exception as e:
logger.error(f"Failed to download model {model['name']}. Error: {e}")
if not args.check_missing:
logging.info(f"Downloading {len(models)} models...")
for model in models:
try:
download_model(model)
except Exception as e:
logger.error(f"Failed to download model {model['name']}. Error: {e}")
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:

View File

@@ -1852,6 +1852,9 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
ggml_cuda_pool_alloc<cuda_t> src0_alloc(ctx.pool());
ggml_cuda_pool_alloc<cuda_t> src1_alloc(ctx.pool());
bool is_src0_cont_2 = ggml_is_contiguous_2(src0);
bool is_src1_cont_2 = ggml_is_contiguous_2(src1);
// Handle src0
src0_ptr = (const cuda_t *) src0->data;
@@ -1870,6 +1873,8 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
s11 = ne10;
s12 = ne11*s11;
s13 = ne12*s12;
is_src1_cont_2 = true;
}
// Setup destination buffer
@@ -1918,15 +1923,19 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
const int64_t r2 = ne12/ne02;
const int64_t r3 = ne13/ne03;
if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
if (r2 == 1 && r3 == 1 && is_src0_cont_2 && is_src1_cont_2) {
// with a [0, 2, 1, 3] perm. and ne02==1 the matrix strides need to be determined from dim 3:
const int64_t sma = ne02 == 1 ? nb03/nb00 : nb02/nb00;
const int64_t smb = ne12 == 1 ? s13 : s12;
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
// use cublasGemmStridedBatchedEx
CUBLAS_CHECK(
cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
alpha, src0_ptr, cu_data_type_a, nb01/nb00, nb02/nb00, // strideA
src1_ptr, cu_data_type_b, s11, s12, // strideB
beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC
alpha, src0_ptr, cu_data_type_a, nb01/nb00, sma, // strideA
src1_ptr, cu_data_type_b, s11, smb, // strideB
beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC
ne12*ne13,
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
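The comment in the hunk is the core of the fix: when, e.g., src0 is a [0, 2, 1, 3]-permuted view with ne02 == 1 and ne03 > 1, the stride between consecutive matrices in the batch lives in nb03, not nb02, so strideA must switch to nb03/nb00 (and strideB to s13 when ne12 == 1). A hedged, standalone illustration with made-up shapes:

#include <cstdint>
#include <cstdio>

int main() {
    // an f16 tensor of shape {64, 1, 16, 8}, viewed with a [0, 2, 1, 3] permutation:
    // the view has ne = {64, 16, 1, 8} (ne02 == 1, ne03 > 1) and nb = {2, 128, 128, 2048}
    const int64_t ne[4] = { 64, 16,   1,    8 };
    const int64_t nb[4] = {  2, 128, 128, 2048 };   // strides in bytes

    const int64_t old_stride_a = nb[2] / nb[0];                               // 64: one row, wrong
    const int64_t new_stride_a = ne[2] == 1 ? nb[3] / nb[0] : nb[2] / nb[0];  // 1024: one full matrix

    // the next matrix in the flattened ne02*ne03 batch starts ne00*ne01 elements
    // further on, which only the dim-3 stride reflects here
    printf("ne00*ne01 = %lld, old strideA = %lld, new strideA = %lld\n",
           (long long)(ne[0] * ne[1]), (long long) old_stride_a, (long long) new_stride_a);
    return 0;
}

The SYCL hunk below applies the same stride selection to the dpct::gemm_batch call.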

View File

@@ -2688,6 +2688,9 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
const size_t type_size_src0 = ggml_type_size(src0->type);
const size_t type_size_src1 = ggml_type_size(src1->type);
bool is_src0_cont_2 = ggml_is_contiguous_2(src0);
bool is_src1_cont_2 = ggml_is_contiguous_2(src1);
// SRC1 strides
int64_t s11 = nb11 / type_size_src1;
int64_t s12 = nb12 / type_size_src1;
@@ -2737,6 +2740,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
s11 = ne10;
s12 = ne11 * s11;
s13 = ne12 * s12;
is_src1_cont_2 = true;
}
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool());
@@ -2852,12 +2857,16 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
else
#endif
{
if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
if (r2 == 1 && r3 == 1 && is_src0_cont_2 && is_src1_cont_2) {
// with a [0, 2, 1, 3] perm. and ne02==1 the matrix strides need to be determined from dim 3:
const int64_t sma = ne02 == 1 ? nb03/nb00 : nb02/nb00;
const int64_t smb = ne12 == 1 ? s13 : s12;
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
src0_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
src1_f16, dpct::library_data_t::real_half, s11, s12, beta, dst_ddf,
src0_f16, dpct::library_data_t::real_half, nb01 / nb00, sma,
src1_f16, dpct::library_data_t::real_half, s11, smb, beta, dst_ddf,
mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
} else {
const int ne23 = ne12 * ne13;

View File

@@ -2106,12 +2106,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
s_mmq_wg_denoms = { 32, 64, 1 };
// spec constants and tile sizes for quant matmul (Qi_K)
l_warptile_mmq_k = { 256, 64, 128, 64, 1 };
m_warptile_mmq_k = { 256, 32, 64, 64, 0 };
s_warptile_mmq_k = { 256, 32, 32, 128, 0 };
l_mmq_wg_denoms_k = { 64, 128, 1 };
m_mmq_wg_denoms_k = { 32, 64, 1 };
s_mmq_wg_denoms_k = { 32, 32, 1 };
l_warptile_mmq_k = { 256, 128, 256, 64, 1 };
m_warptile_mmq_k = { 256, 128, 128, 64, 1 };
s_warptile_mmq_k = { 256, 32, 64, 128, 0 };
l_mmq_wg_denoms_k = { 128, 256, 1 };
m_mmq_wg_denoms_k = { 128, 128, 1 };
s_mmq_wg_denoms_k = { 32, 64, 1 };
// spec constants and tile sizes for quant matmul_id
l_warptile_mmqid = { 256, 128, 128, 16, 0 };
@@ -5022,26 +5022,37 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz
ggml_vk_queue_command_pools_cleanup(dst->device);
}
static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int n, int k, const vk_pipeline& pipeline) {
static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, uint32_t m, uint32_t n, uint32_t k, const vk_pipeline& pipeline) {
VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");
uint32_t split_k = 1;
if (ctx->device->shader_core_count != 0 && m >= (int)pipeline->wg_denoms[0] && n >= (int)pipeline->wg_denoms[1]) {
if (ctx->device->shader_core_count != 0 && m >= pipeline->wg_denoms[0] && n >= pipeline->wg_denoms[1]) {
// If k is 'large' and the SMs will fill less than halfway, use split_k.
uint32_t m_tiles = CEIL_DIV(m, pipeline->wg_denoms[0]);
uint32_t n_tiles = CEIL_DIV(n, pipeline->wg_denoms[1]);
if (k >= 2048 && m_tiles * n_tiles < ctx->device->shader_core_count / 2) {
split_k = ctx->device->shader_core_count / (m_tiles * n_tiles);
// Clamp to 2 or 4
split_k = std::min(split_k, 4u);
if (split_k == 3) {
split_k = 2;
if (k >= 2048) {
if (m_tiles * n_tiles <= ctx->device->shader_core_count / 2) {
split_k = ctx->device->shader_core_count / (m_tiles * n_tiles);
} else if (m_tiles * n_tiles <= ctx->device->shader_core_count * 2 / 3) {
split_k = 3;
}
if (ctx->device->coopmat2) {
// coopmat2 shader expects splits to be aligned to 256
while (split_k > 1 && ((k / split_k) % 256) != 0) {
split_k /= 2;
// Cap the split at 8x. Unless k is huge this is a lot of overhead.
split_k = std::min(split_k, 8u);
// ggml_vk_matmul will align the splits to be a multiple of 256.
// If this rounded up size would cause the last split to be empty,
// then reduce the split count.
while (true) {
if (split_k == 1) {
break;
}
uint32_t k_split = CEIL_DIV(k, split_k);
k_split = ROUNDUP_POW2(k_split, 256);
if (k_split * (split_k - 1) < k) {
break;
}
split_k--;
}
}
}
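Putting the new ggml_vk_guess_split_k logic together: split_k is no longer clamped to 2 or 4 but capped at 8, a 3-way split is used when the tiles would cover between 1/2 and 2/3 of the SMs, and the final loop walks split_k down until rounding each split up to a multiple of 256 still leaves work for the last split. A standalone sketch with made-up device numbers (CEIL_DIV and ROUNDUP_POW2 reimplemented as plain helpers):

#include <algorithm>
#include <cstdint>
#include <cstdio>

static uint32_t ceil_div(uint32_t a, uint32_t b) { return (a + b - 1) / b; }
static uint32_t roundup (uint32_t x, uint32_t m) { return ceil_div(x, m) * m; }

static uint32_t guess_split_k(uint32_t m_tiles, uint32_t n_tiles, uint32_t k, uint32_t sm_count) {
    uint32_t split_k = 1;
    if (k >= 2048) {
        if (m_tiles * n_tiles <= sm_count / 2) {
            split_k = sm_count / (m_tiles * n_tiles);
        } else if (m_tiles * n_tiles <= sm_count * 2 / 3) {
            split_k = 3;   // between 1/2 and 2/3 of the SMs occupied: use a 3-way split
        }
        split_k = std::min<uint32_t>(split_k, 8u);  // cap the split at 8x
        // each split gets rounded up to a multiple of 256; shrink split_k until the
        // last split would still receive some work
        while (split_k > 1 && roundup(ceil_div(k, split_k), 256) * (split_k - 1) >= k) {
            split_k--;
        }
    }
    return split_k;
}

int main() {
    // e.g. 4 x 2 tiles on a 60-SM device with k = 4096:
    // 8 tiles <= 30, so split_k starts at 60/8 = 7 (under the 8x cap);
    // at 7, each split rounds up to 768 and 768*6 = 4608 already covers k, so drop to 6;
    // at 6, 768*5 = 3840 < 4096, so the last split is non-empty and 6 is kept
    printf("split_k = %u\n", guess_split_k(4, 2, 4096, 60));
    return 0;
}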
@@ -5053,9 +5064,22 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")");
if (ctx->device->coopmat2) {
const uint32_t shader_core_count = ctx->device->shader_core_count;
const uint32_t tiles_l = CEIL_DIV(m, mmp->a_l->wg_denoms[0]) * CEIL_DIV(n, mmp->a_l->wg_denoms[1]);
const uint32_t tiles_m = CEIL_DIV(m, mmp->a_m->wg_denoms[0]) * CEIL_DIV(n, mmp->a_m->wg_denoms[1]);
// Use large shader when the N dimension is greater than the medium shader's tile size
uint32_t crossover_large = mmp->m->wg_denoms[1];
if ((ctx->device->mul_mat_l[src0_type] && (n > crossover_large)) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_s[src0_type])) {
// Prefer large over medium if either:
// - medium or large tiles would overfill the GPU
// - large tiles with a split_k==3 fits in the GPU and medium tiles with split_k==2 does not
// (medium with split_k==2 is probably better if it fits - more workgroups running and less split_k overhead)
bool prefer_large = tiles_m > shader_core_count || tiles_l > shader_core_count ||
// split_k==3 with large tiles likely better than medium tiles with no split_k.
(tiles_l <= shader_core_count / 3 && tiles_m > shader_core_count / 2);
if ((ctx->device->mul_mat_l[src0_type] && (n > crossover_large && prefer_large)) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_s[src0_type])) {
return aligned ? mmp->a_l : mmp->l;
}
// Use medium shader when the N dimension is greater than the small shader's tile size
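This hunk changes how the coopmat2 path picks between the large and medium pipelines: on top of the existing n > crossover_large check, the large tile is now preferred when medium (or large) tiles would overfill the GPU, or when large tiles with split_k==3 fit while medium tiles already need more than half the SMs. A sketch of just that predicate, with stand-in wg_denoms (the new *_mmq_wg_denoms_k values from the first hunk of this file) and an invented 60-SM device:

#include <cstdint>
#include <cstdio>

static uint32_t ceil_div(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

static bool prefer_large_tiles(uint32_t m, uint32_t n, uint32_t sm_count,
                               const uint32_t l_denoms[2], const uint32_t m_denoms[2]) {
    const uint32_t tiles_l = ceil_div(m, l_denoms[0]) * ceil_div(n, l_denoms[1]);
    const uint32_t tiles_m = ceil_div(m, m_denoms[0]) * ceil_div(n, m_denoms[1]);
    // prefer large if medium or large tiles would overfill the GPU, or if large tiles
    // with split_k==3 fit while medium tiles would already need more than half the SMs
    return tiles_m > sm_count || tiles_l > sm_count ||
           (tiles_l <= sm_count / 3 && tiles_m > sm_count / 2);
}

int main() {
    const uint32_t l_denoms[2] = { 128, 256 };  // cf. l_mmq_wg_denoms_k above
    const uint32_t m_denoms[2] = { 128, 128 };  // cf. m_mmq_wg_denoms_k above
    // a 512 x 2048 output on a 60-SM device: tiles_l = 4*8 = 32, tiles_m = 4*16 = 64 > 60,
    // so the large pipeline is preferred because medium tiles would overfill the GPU
    printf("prefer large: %d\n", prefer_large_tiles(512, 2048, 60, l_denoms, m_denoms));
    return 0;
}

In the real function this result feeds the existing (n > crossover_large) condition rather than replacing it.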
@@ -5099,7 +5123,11 @@ static void ggml_vk_matmul(
GGML_ASSERT(batch_stride_d == m * n);
const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, padded_n };
// Round the split size up to a multiple of 256 (k-quant alignment)
uint32_t k_split = CEIL_DIV(k, split_k);
k_split = ROUNDUP_POW2(k_split, 256);
const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k_split, ne02, ne12, broadcast2, broadcast3, padded_n };
// Make sure enough workgroups get assigned for split k to work
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
ggml_vk_sync_buffers(subctx);

View File

@@ -1,7 +1 @@
-r ./requirements-convert_legacy_llama.txt
--extra-index-url https://download.pytorch.org/whl/cpu
torch~=2.2.1; platform_machine != "s390x"
# torch s390x packages can only be found from nightly builds
--extra-index-url https://download.pytorch.org/whl/nightly
torch>=0.0.0.dev0; platform_machine == "s390x"

View File

@@ -953,6 +953,33 @@ static void test_template_output_parsers() {
/* is_partial= */ false,
{COMMON_CHAT_FORMAT_HERMES_2_PRO}));
// Test multiple tool calls
common_chat_msg message_assist_multiple_calls;
message_assist_multiple_calls.role = "assistant";
message_assist_multiple_calls.content = "";
message_assist_multiple_calls.tool_calls.push_back({"special_function", "{\"arg1\": 1}", ""});
message_assist_multiple_calls.tool_calls.push_back({"python", "{\"code\":\"print('hello')\"}", ""});
assert_msg_equals(
message_assist_multiple_calls,
common_chat_parse(
"<tool_call>\n"
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
"</tool_call>\n"
"<tool_call>\n"
"{\"name\": \"python\", \"arguments\": {\"code\":\"print('hello')\"}}\n"
"</tool_call>",
/* is_partial= */ false,
{COMMON_CHAT_FORMAT_HERMES_2_PRO}));
assert_msg_equals(
message_assist_multiple_calls,
common_chat_parse(
"<function=special_function>{\"arg1\": 1}</function>\n"
"<function=python>{\"code\":\"print('hello')\"}</function>",
/* is_partial= */ false,
{COMMON_CHAT_FORMAT_HERMES_2_PRO}));
assert_msg_equals(
simple_assist_msg(
"This is not a tool call:",
@@ -1039,6 +1066,22 @@ static void test_template_output_parsers() {
"<tool_call>\n"
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
"</tool_call>");
// Test multiple tool calls with template
common_chat_msg message_assist_multiple_calls_template;
message_assist_multiple_calls_template.role = "assistant";
message_assist_multiple_calls_template.content = "";
message_assist_multiple_calls_template.tool_calls.push_back({"special_function", "{\"arg1\": 1}", ""});
message_assist_multiple_calls_template.tool_calls.push_back({"python", "{\"code\":\"print('test')\"}", ""});
test_templates(tmpls.get(), end_tokens, message_assist_multiple_calls_template, tools,
"<tool_call>\n"
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
"</tool_call>\n"
"<tool_call>\n"
"{\"name\": \"python\", \"arguments\": {\"code\":\"print('test')\"}}\n"
"</tool_call>");
test_templates(tmpls.get(), end_tokens, message_assist_call_python_lines, tools,
"<tool_call>\n"
"{\"name\": \"python\", \"arguments\": {\"code\":\"# This is a program:\\nprint('hey')\"}}\n"