log : add common_log_get_verbosity_thold()

libs : add libllama-common-base
cont : fix build_info exports
2026-04-16 16:27:32 +03:00 · 2026-04-15 14:05:42 +03:00 · 2026-04-15 13:24:09 +03:00 · 2026-04-15 13:21:47 +03:00 · 2026-04-15 13:21:47 +03:00 · 2026-04-15 13:21:47 +03:00
76 changed files with 1091 additions and 322 deletions
--- a/.github/workflows/build-vulkan.yml
+++ b/.github/workflows/build-vulkan.yml
@@ -93,4 +93,5 @@ jobs:
          export GGML_VK_DISABLE_F16=1
          export GGML_VK_DISABLE_COOPMAT=1
          # This is using llvmpipe and runs slower than other backends
-          ctest -L main --verbose --timeout 4800
+          # test-backend-ops is too slow on llvmpipe, skip it
+          ctest -L main -E test-backend-ops --verbose --timeout 900
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -225,7 +225,7 @@ foreach(FILE_PATH ${EXTRA_LICENSES})
 endforeach()

 if (LLAMA_BUILD_COMMON)
-    license_generate(common)
+    license_generate(llama-common)
 endif()

 #
@@ -249,6 +249,10 @@ set_target_properties(llama

 install(TARGETS llama LIBRARY PUBLIC_HEADER)

+if (LLAMA_BUILD_COMMON)
+    install(TARGETS llama-common LIBRARY)
+endif()
+
 configure_package_config_file(
        ${CMAKE_CURRENT_SOURCE_DIR}/cmake/llama-config.cmake.in
        ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -1,9 +1,11 @@
-# common
-
 find_package(Threads REQUIRED)

 llama_add_compile_flags()

+#
+# llama-common-base
+#
+
 # Build info header

 if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
@@ -33,17 +35,25 @@ endif()

 set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
 set(OUTPUT_FILE   "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
+
 configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})

-set(TARGET build_info)
-add_library(${TARGET} OBJECT ${OUTPUT_FILE})
+set(TARGET llama-common-base)
+add_library(${TARGET} STATIC ${OUTPUT_FILE})
+
+target_include_directories(${TARGET} PUBLIC .)
+
 if (BUILD_SHARED_LIBS)
    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()

-set(TARGET common)
+#
+# llama-common
+#

-add_library(${TARGET} STATIC
+set(TARGET llama-common)
+
+add_library(${TARGET}
    arg.cpp
    arg.h
    base64.hpp
@@ -106,17 +116,24 @@ add_library(${TARGET} STATIC
    jinja/caps.h
    )

+set_target_properties(${TARGET} PROPERTIES
+    VERSION ${LLAMA_INSTALL_VERSION}
+    SOVERSION 0
+    MACHO_CURRENT_VERSION 0 # keep macOS linker from seeing oversized version number
+)
+
 target_include_directories(${TARGET} PUBLIC . ../vendor)
 target_compile_features   (${TARGET} PUBLIC cxx_std_17)

 if (BUILD_SHARED_LIBS)
    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+    # TODO: make fine-grained exports in the future
+    set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 endif()

-target_link_libraries(${TARGET} PRIVATE
-    build_info
-    cpp-httplib
-)
+target_link_libraries(${TARGET} PUBLIC  llama-common-base)
+target_link_libraries(${TARGET} PRIVATE cpp-httplib)

 if (LLAMA_LLGUIDANCE)
    include(ExternalProject)
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1,5 +1,6 @@
 #include "arg.h"

+#include "build-info.h"
 #include "chat.h"
 #include "common.h"
 #include "download.h"
@@ -1044,8 +1045,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--version"},
        "show version and build info",
        [](common_params &) {
-            fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
-            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+            fprintf(stderr, "version: %d (%s)\n", llama_build_number(), llama_commit());
+            fprintf(stderr, "built with %s for %s\n", llama_compiler(), llama_build_target());
            exit(0);
        }
    ));
--- a/common/build-info.cpp.in
+++ b/common/build-info.cpp.in
@@ -1,4 +1,35 @@
+#include "build-info.h"
+
+#include <cstdio>
+#include <string>
+
 int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
-char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
-char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
-char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
+char const * LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
+char const * LLAMA_COMPILER = "@BUILD_COMPILER@";
+char const * LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
+
+int llama_build_number(void) { // value substituted for @LLAMA_BUILD_NUMBER@ when this template is configured
+    return LLAMA_BUILD_NUMBER;
+}
+
+const char * llama_commit(void) { // string substituted for @LLAMA_BUILD_COMMIT@
+    return LLAMA_COMMIT;
+}
+
+const char * llama_compiler(void) { // string substituted for @BUILD_COMPILER@
+    return LLAMA_COMPILER;
+}
+
+const char * llama_build_target(void) { // string substituted for @BUILD_TARGET@
+    return LLAMA_BUILD_TARGET;
+}
+
+const char * llama_build_info(void) { // returns "b<build number>-<commit>"
+    static std::string s = "b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT; // built once, on the first call
+    return s.c_str(); // points into the function-local static above
+}
+
+void llama_print_build_info(void) { // writes two stderr lines, each prefixed with this function's own name via __func__
+    fprintf(stderr, "%s: build = %d (%s)\n",      __func__, llama_build_number(), llama_commit());
+    fprintf(stderr, "%s: built with %s for %s\n", __func__, llama_compiler(), llama_build_target());
+}
--- a/common/build-info.h
+++ b/common/build-info.h
@@ -0,0 +1,11 @@
+#pragma once
+// Read-only accessors for the build metadata defined in build-info.cpp (generated from build-info.cpp.in).
+int llama_build_number(void);          // numeric build number
+
+const char * llama_commit(void);       // commit string
+const char * llama_compiler(void);     // compiler string
+
+const char * llama_build_target(void); // build target string
+const char * llama_build_info(void);   // "b<build number>-<commit>"; points into a function-local static string
+
+void llama_print_build_info(void);     // prints build number/commit and compiler/target to stderr
--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@@ -198,10 +198,19 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
        args_field = format.function_field + "." + args_field;
    }

-    auto tools_parser = p.standard_json_tools(
-        format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
-        inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
-        format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
+    auto tools_parser = p.eps();
+    if (format.section_start.empty() && !format.per_call_start.empty()) {
+        auto single_tool_parser = p.standard_json_tools(
+            format.per_call_start, format.per_call_end, inputs.tools, inputs.parallel_tool_calls,
+            inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
+            format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
+        tools_parser = p.trigger_rule("tool-calls", p.one_or_more(single_tool_parser + p.space()));
+    } else {
+        tools_parser = p.standard_json_tools(
+            format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
+            inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
+            format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
+    }

    // Handle content wrappers if present
    if (ctx.content && ctx.content->is_always_wrapped()) {
--- a/common/chat-auto-parser.h
+++ b/common/chat-auto-parser.h
@@ -308,19 +308,23 @@ struct analyze_tools : analyze_base {

  private:
    // Extract tool calling 'haystack' for further analysis and delegate further analysis based on format
-    void analyze_tool_calls(const analyze_reasoning & reasoning);
+    void analyze_tool_calls(const analyze_reasoning & reasoning, bool supports_parallel_tool_calls);

    // Analyze format based on position of function and argument name in needle
    void analyze_tool_call_format(const std::string &       haystack,
                                  const std::string &       fun_name_needle,
                                  const std::string &       arg_name_needle,
-                                  const analyze_reasoning & reasoning);
+                                  const analyze_reasoning & reasoning,
+                                  bool                      supports_parallel_tool_calls);

    // Analyze specifics of JSON native format (entire tool call is a JSON object)
    void analyze_tool_call_format_json_native(const std::string & clean_haystack,
                                              const std::string & fun_name_needle,
                                              const std::string & arg_name_needle);

+    // Check if parallel calls in JSON native format array wrapped or tag wrapped
+    void analyze_json_native_parallel_calls();
+
    // Analyze specifics of non-JSON native format (tags for function name or for function name and arguments)
    void analyze_tool_call_format_non_json(const std::string & clean_haystack,
                                           const std::string & fun_name_needle);
--- a/common/chat-diff-analyzer.cpp
+++ b/common/chat-diff-analyzer.cpp
@@ -558,7 +558,7 @@ analyze_tools::analyze_tools(const common_chat_template & tmpl,
    : analyze_base(tmpl) {
    LOG_DBG(ANSI_ORANGE "Phase 3: Tool call analysis\n" ANSI_RESET);

-    analyze_tool_calls(reasoning);
+    analyze_tool_calls(reasoning, caps.supports_parallel_tool_calls);

    if (format.mode != tool_format::NONE && format.mode != tool_format::JSON_NATIVE) {
        if (caps.supports_parallel_tool_calls) {
@@ -577,7 +577,7 @@ analyze_tools::analyze_tools(const common_chat_template & tmpl,
    }
 }

-void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning) {
+void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning, bool supports_parallel_tool_calls) {
    json assistant_no_tools = json{
        { "role",    "assistant"   },
        { "content", ASSISTANT_MSG }
@@ -611,13 +611,14 @@ void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning) {
        return;
    }

-    analyze_tool_call_format(tool_section, FUN_FIRST, ARG_FIRST, reasoning);
+    analyze_tool_call_format(tool_section, FUN_FIRST, ARG_FIRST, reasoning, supports_parallel_tool_calls);
 }

 void analyze_tools::analyze_tool_call_format(const std::string &       haystack,
                                             const std::string &       fun_name_needle,
                                             const std::string &       arg_name_needle,
-                                             const analyze_reasoning & reasoning) {
+                                             const analyze_reasoning & reasoning,
+                                             bool                      supports_parallel_tool_calls) {
    if (fun_name_needle.empty() || arg_name_needle.empty() || haystack.empty()) {
        return;
    }
@@ -660,6 +661,9 @@ void analyze_tools::analyze_tool_call_format(const std::string &       haystack,

    if (format.mode == tool_format::JSON_NATIVE) {
        analyze_tool_call_format_json_native(clean_haystack, fun_name_needle, arg_name_needle);
+        if (supports_parallel_tool_calls) {
+            analyze_json_native_parallel_calls();
+        }
    } else {
        analyze_tool_call_format_non_json(clean_haystack, fun_name_needle);
    }
@@ -668,6 +672,42 @@ void analyze_tools::analyze_tool_call_format(const std::string &       haystack,
    format.per_call_end = trim_whitespace(format.per_call_end);
 }

+void analyze_tools::analyze_json_native_parallel_calls() { // decides whether the detected section markers wrap each call
+    json assistant_one_tool = json{
+        { "role",       "assistant" },
+        { "content",    ""          },
+        { "tool_calls", json::array({ first_tool_call }) }
+    };
+    // Same assistant turn, but carrying a second tool call.
+    json assistant_two_tools = json{
+        { "role",       "assistant" },
+        { "content",    ""          },
+        { "tool_calls", json::array({ first_tool_call, second_tool_call }) }
+    };
+    // Baseline rendering: user message followed by the single-call assistant turn.
+    template_params params;
+    params.messages              = json::array({ user_msg, assistant_one_tool });
+    params.tools                 = tools;
+    params.add_generation_prompt = false;
+    params.enable_thinking       = true;
+    // Variant rendering swaps in the two-call assistant turn; everything else stays the same.
+    auto comparison = compare_variants(
+        *tmpl, params, [&](template_params & p) { p.messages = json::array({ user_msg, assistant_two_tools }); });
+    // No comparison result: leave `format` untouched.
+    if (!comparison) {
+        LOG_DBG(ANSI_ORANGE "%s: Template application failed\n" ANSI_RESET, __func__);
+        return;
+    }
+    // NOTE(review): diff.right is presumably the text present only in the two-call rendering - confirm against compare_variants.
+    std::string & second_call = comparison->diff.right;
+    if (!format.section_start.empty() && second_call.find(format.section_start) != std::string::npos) {
+        format.per_call_start = format.section_start; // the start marker shows up in the diff, so treat the markers as per-call
+        format.per_call_end = format.section_end;
+        format.section_start.clear(); // an empty section_start plus a non-empty per_call_start marks this layout
+        format.section_end.clear();
+    }
+}
+
 void analyze_tools::analyze_tool_call_format_json_native(const std::string & clean_haystack,
                                                         const std::string & fun_name_needle,
                                                         const std::string & arg_name_needle) {
--- a/common/chat-peg-parser.cpp
+++ b/common/chat-peg-parser.cpp
@@ -676,7 +676,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
        ordered_json   params   = function.contains("parameters") ? function.at("parameters") : ordered_json::object();

        auto nested_name = literal("\"" + nested_name_field + "\"") + space() + literal(":") + space() +
-                          literal("\"") + tool_name(literal(name)) + literal("\"");
+                          atomic(literal("\"") + tool_name(literal(name)) + literal("\""));
        auto nested_args = literal("\"" + nested_args_field + "\"") + space() + literal(":") + space() +
                          tool_args(schema(json(), "tool-" + name + "-schema", params));

@@ -744,7 +744,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
        ordered_json   params   = function.contains("parameters") ? function.at("parameters") : ordered_json::object();

        auto tool_name_ = name_key_parser + space() + literal(":") + space() +
-                         literal("\"") + tool_name(literal(name)) + literal("\"");
+                         atomic(literal("\"") + tool_name(literal(name)) + literal("\""));
        auto tool_args_ = args_key_parser + space() + literal(":") + space() +
                         tool_args(schema(json(), "tool-" + name + "-schema", params));

--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1,6 +1,7 @@
 #include "ggml.h"
 #include "gguf.h"

+#include "build-info.h"
 #include "common.h"
 #include "log.h"
 #include "llama.h"
@@ -372,7 +373,7 @@ void common_init() {
    const char * build_type = " (debug)";
 #endif

-    LOG_DBG("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
+    LOG_DBG("build: %d (%s) with %s for %s%s\n", llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);
 }

 std::string common_params_get_system_info(const common_params & params) {
--- a/common/common.h
+++ b/common/common.h
@@ -2,9 +2,10 @@

 #pragma once

+#include "llama-cpp.h"
+
 #include "ggml-opt.h"
 #include "ggml.h"
-#include "llama-cpp.h"

 #include <set>
 #include <sstream>
@@ -27,11 +28,6 @@
 #define die(msg)          do { fputs("error: " msg "\n", stderr);                exit(1); } while (0)
 #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

-#define print_build_info() do {                                                                     \
-    fprintf(stderr, "%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);      \
-    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);    \
-} while(0)
-
 struct common_time_meas {
    common_time_meas(int64_t & t_acc, bool disable = false);
    ~common_time_meas();
@@ -53,14 +49,6 @@ struct common_adapter_lora_info {

 using llama_tokens = std::vector<llama_token>;

-// build info
-extern int LLAMA_BUILD_NUMBER;
-extern const char * LLAMA_COMMIT;
-extern const char * LLAMA_COMPILER;
-extern const char * LLAMA_BUILD_TARGET;
-
-const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
-
 struct common_control_vector_load_info;

 //
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -1,5 +1,6 @@
 #include "arg.h"

+#include "build-info.h"
 #include "common.h"
 #include "log.h"
 #include "download.h"
@@ -303,7 +304,7 @@ static int common_download_file_single_online(const std::string & url,
        headers.emplace(h.first, h.second);
    }
    if (headers.find("User-Agent") == headers.end()) {
-        headers.emplace("User-Agent", "llama-cpp/" + build_info);
+        headers.emplace("User-Agent", "llama-cpp/" + std::string(llama_build_info()));
    }
    if (!opts.bearer_token.empty()) {
        headers.emplace("Authorization", "Bearer " + opts.bearer_token);
@@ -441,7 +442,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
        headers.emplace(h.first, h.second);
    }
    if (headers.find("User-Agent") == headers.end()) {
-        headers.emplace("User-Agent", "llama-cpp/" + build_info);
+        headers.emplace("User-Agent", "llama-cpp/" + std::string(llama_build_info()));
    }

    if (params.timeout > 0) {
--- a/common/hf-cache.cpp
+++ b/common/hf-cache.cpp
@@ -1,5 +1,6 @@
 #include "hf-cache.h"

+#include "build-info.h"
 #include "common.h"
 #include "log.h"
 #include "http.h"
@@ -200,7 +201,7 @@ static nl::json api_get(const std::string & url,
    auto [cli, parts] = common_http_client(url);

    httplib::Headers headers = {
-        {"User-Agent", "llama-cpp/" + build_info},
+        {"User-Agent", "llama-cpp/" + std::string(llama_build_info())},
        {"Accept", "application/json"}
    };

--- a/common/log.cpp
+++ b/common/log.cpp
@@ -23,6 +23,10 @@

 int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;

+int common_log_get_verbosity_thold(void) { // read accessor for the file-scope threshold defined above
+    return common_log_verbosity_thold;
+}
+
 void common_log_set_verbosity_thold(int verbosity) {
    common_log_verbosity_thold = verbosity;
 }
--- a/common/log.h
+++ b/common/log.h
@@ -38,7 +38,7 @@ enum log_colors {

 // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
 // set via common_log_set_verbosity()
-extern int common_log_verbosity_thold;
+int  common_log_get_verbosity_thold(void);

 void common_log_set_verbosity_thold(int verbosity); // not thread-safe

@@ -98,7 +98,7 @@ void common_log_flush         (struct common_log * log);                    // f

 #define LOG_TMPL(level, verbosity, ...) \
    do { \
-        if ((verbosity) <= common_log_verbosity_thold) { \
+        if ((verbosity) <= common_log_get_verbosity_thold()) { \
            common_log_add(common_log_main(), (level), __VA_ARGS__); \
        } \
    } while (0)
--- a/examples/batched/CMakeLists.txt
+++ b/examples/batched/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-batched)
 add_executable(${TARGET} batched.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/convert-llama2c-to-ggml/CMakeLists.txt
+++ b/examples/convert-llama2c-to-ggml/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-convert-llama2c-to-ggml)
 add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/debug/CMakeLists.txt
+++ b/examples/debug/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-debug)
 add_executable(${TARGET} debug.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/diffusion/CMakeLists.txt
+++ b/examples/diffusion/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-diffusion-cli)
 add_executable(${TARGET} diffusion-cli.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama llama-common ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/diffusion/diffusion-cli.cpp
+++ b/examples/diffusion/diffusion-cli.cpp
@@ -602,8 +602,8 @@ int main(int argc, char ** argv) {

    int n_input = input_tokens.size();

-    if (n_input >= params.n_ctx) {
-        LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, params.n_ctx);
+    if (static_cast<uint32_t>(n_input) >= llama_n_ctx(ctx)) { // unsigned comparison against the context size reported by ctx
+        LOG_ERR("error: input too long (%d tokens), max context is %u\n", n_input, llama_n_ctx(ctx));
        llama_free(ctx);
        llama_model_free(model);
        return 1;
--- a/examples/embedding/CMakeLists.txt
+++ b/examples/embedding/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-embedding)
 add_executable(${TARGET} embedding.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/eval-callback/CMakeLists.txt
+++ b/examples/eval-callback/CMakeLists.txt
@@ -1,7 +1,7 @@
 set(TARGET llama-eval-callback)
 add_executable(${TARGET} eval-callback.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 if(LLAMA_BUILD_TESTS)
--- a/examples/gen-docs/CMakeLists.txt
+++ b/examples/gen-docs/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-gen-docs)
 add_executable(${TARGET} gen-docs.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/idle/CMakeLists.txt
+++ b/examples/idle/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-idle)
 add_executable(${TARGET} idle.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama llama-common ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/lookahead/CMakeLists.txt
+++ b/examples/lookahead/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-lookahead)
 add_executable(${TARGET} lookahead.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/lookup/CMakeLists.txt
+++ b/examples/lookup/CMakeLists.txt
@@ -1,23 +1,23 @@
 set(TARGET llama-lookup)
 add_executable(${TARGET} lookup.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 set(TARGET llama-lookup-create)
 add_executable(${TARGET} lookup-create.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 set(TARGET llama-lookup-merge)
 add_executable(${TARGET} lookup-merge.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 set(TARGET llama-lookup-stats)
 add_executable(${TARGET} lookup-stats.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/parallel/CMakeLists.txt
+++ b/examples/parallel/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-parallel)
 add_executable(${TARGET} parallel.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/passkey/CMakeLists.txt
+++ b/examples/passkey/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-passkey)
 add_executable(${TARGET} passkey.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/retrieval/CMakeLists.txt
+++ b/examples/retrieval/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-retrieval)
 add_executable(${TARGET} retrieval.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/save-load-state/CMakeLists.txt
+++ b/examples/save-load-state/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-save-load-state)
 add_executable(${TARGET} save-load-state.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/speculative-simple/CMakeLists.txt
+++ b/examples/speculative-simple/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-speculative-simple)
 add_executable(${TARGET} speculative-simple.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/speculative/CMakeLists.txt
+++ b/examples/speculative/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-speculative)
 add_executable(${TARGET} speculative.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/sycl/CMakeLists.txt
+++ b/examples/sycl/CMakeLists.txt
@@ -5,5 +5,5 @@
 set(TARGET llama-ls-sycl-device)
 add_executable(${TARGET} ls-sycl-device.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/training/CMakeLists.txt
+++ b/examples/training/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-finetune)
 add_executable(${TARGET} finetune.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt
+++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt
@@ -47,6 +47,7 @@ list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx)

 if (_hmx_idx GREATER_EQUAL 0)
    target_sources(${HTP_LIB} PRIVATE
+        hmx-queue.c
        hmx-matmul-ops.c
    )

--- a/ggml/src/ggml-hexagon/htp/hex-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hex-utils.h
@@ -31,6 +31,14 @@ static inline uint64_t hex_get_pktcnt() {
    return pktcnt;
 }

+static inline uint32_t hex_ceil_pow2(uint32_t x) { // smallest power of two >= x (returns 1 for x <= 1)
+    if (x <= 1) { return 1; }
+    uint32_t p = 2; // unsigned: a signed int would overflow (UB) for x > 2^30; for x > 2^31 the result wraps to 0
+    x--;
+    while (x >>= 1) { p <<= 1; } // one doubling per remaining significant bit of x - 1
+    return p;
+}
+
 static inline size_t hmx_ceil_div(size_t num, size_t den) {
    return (num + den - 1) / den;
 }
@@ -73,8 +81,7 @@ static inline void hex_l2fetch(const void * p, uint32_t width, uint32_t stride,
 #define HEX_L2_LINE_SIZE  64
 #define HEX_L2_FLUSH_SIZE (128 * 1024)

-static inline void hex_l2flush(void * addr, size_t size)
-{
+static inline void hex_l2flush(void * addr, size_t size) {
    if (size > HEX_L2_FLUSH_SIZE) {
        qurt_mem_cache_clean((qurt_addr_t) 0, 0, QURT_MEM_CACHE_FLUSH_INVALIDATE_ALL, QURT_MEM_DCACHE);
    } else {
@@ -89,4 +96,8 @@ static inline void hex_l2flush(void * addr, size_t size)
    }
 }

+static inline void hex_pause() { // NOTE(review): presumably a back-off hint for busy-wait loops - confirm at call sites
+    asm volatile(" pause(#255)\n"); // emits the pause instruction with immediate operand 255
+}
+
 #endif /* HEX_UTILS_H */
--- a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
@@ -16,14 +16,16 @@
 #include "ggml-common.h"

 #include "hex-dma.h"
+#include "worker-pool.h"
+
 #include "hvx-utils.h"
 #include "hvx-dump.h"
-#include "worker-pool.h"
 #include "htp-ctx.h"
 #include "htp-ops.h"

-#include "hmx-utils.h"
 #include "hmx-ops.h"
+#include "hmx-utils.h"
+#include "hmx-queue.h"
 #include "hmx-profile.h"

 static const __fp16 q4_0_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
@@ -47,7 +49,8 @@ static const __fp16 iq4_nl_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
 static const int32_t weight_transpose_scatter_offsets[32] __attribute__((aligned(VLEN))) = {
    0*128,  1*128,  2*128,  3*128,  4*128,  5*128,  6*128,  7*128,
    8*128,  9*128, 10*128, 11*128, 12*128, 13*128, 14*128, 15*128,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    16*128, 17*128, 18*128, 19*128, 20*128, 21*128, 22*128, 23*128,
+    24*128, 25*128, 26*128, 27*128, 28*128, 29*128, 30*128, 31*128
 };

 // Scales per x4x2 logical block: 8 × sizeof(__fp16) = 16 bytes
@@ -109,36 +112,45 @@ static inline bool hmx_add_overflow(size_t a, size_t b, size_t *out) {
    return false;
 }

-// Search for optimal (mc, nc) chunk sizes that maximize mc * nc within VTCM budget.
+// Search for optimal (mc, nc) chunk sizes within VTCM budget.
 //
-// Cost model: total = nc * per_n_cost + mc * per_m_cost + mc * nc * per_mn_cost + overhead
-//   per_n_cost:  bytes per nc column (weight + scratch buffers)
-//   per_m_cost:  bytes per mc row (activation)
-//   per_mn_cost: bytes per mc*nc element (output)
-//   overhead:    fixed bytes (scales 256B, eye_tile 2048B, etc.)
+// VTCM model: nc * per_n_cost + mc * per_m_cost + mc * nc * per_mn_cost + overhead
+//
+// Minimize ceil(m/mc) * m_block_cost + ceil(n/nc) * n_block_cost.
+// All matmul paths repeat weight processing per M-block and activation loading
+// per N-block, so discrete block counts drive total overhead.
+// Tie-break: when cost is equal, prefer larger mc * nc.
+//
+// Caller-provided coefficients:
+//   m_block_cost: penalty per extra M-block (weight redundancy, scales with n).
+//   n_block_cost: penalty per extra N-block (activation redundancy, scales with m).
 //
 // Algorithm: nc sweeps from n_max down by 32, analytically solving for mc_max.
 // Returns 0 on success, -1 if VTCM is insufficient.
-static int hmx_compute_chunks(
-    size_t vtcm_total, size_t overhead,
-    size_t per_n_cost, size_t per_m_cost, size_t per_mn_cost,
-    int m, int n,
-    size_t *m_chunk_out, size_t *n_chunk_out,
-    size_t *total_out)
-{
+static int hmx_compute_chunks(size_t   vtcm_total,
+                              size_t   overhead,
+                              size_t   per_n_cost,
+                              size_t   per_m_cost,
+                              size_t   per_mn_cost,
+                              int      m,
+                              int      n,
+                              size_t   m_block_cost,
+                              size_t   n_block_cost,
+                              size_t * m_chunk_out,
+                              size_t * n_chunk_out,
+                              size_t * total_out) {
    if (m <= 0 || n <= 0) return -1;
    if (vtcm_total <= overhead) return -1;
    if (per_n_cost == 0 || per_m_cost == 0 || per_mn_cost == 0) return -1;

    const size_t usable = vtcm_total - overhead;
-    size_t best_mn = 0, best_m = 0, best_n = 0;
+
+    size_t best_cost = SIZE_MAX;
+    size_t best_mn   = 0;
+    size_t best_m = 0, best_n = 0;

    const size_t n_max = hex_align_down((size_t)n, HMX_FP16_TILE_N_COLS);
    for (size_t nc = n_max; nc >= HMX_FP16_TILE_N_COLS; nc -= HMX_FP16_TILE_N_COLS) {
-        // Early exit: if nc * m_max cannot beat best, smaller nc won't either
-        if (nc * hex_align_down((size_t)m, HMX_FP16_TILE_N_ROWS) <= best_mn)
-            break;
-
        size_t n_fixed = 0, ncmn = 0, mc_denom = 0;
        if (hmx_mul_overflow(nc, per_n_cost, &n_fixed)) continue;
        if (n_fixed >= usable) goto next_nc;
@@ -152,10 +164,19 @@ static int hmx_compute_chunks(
            mc = hex_align_down(mc, HMX_FP16_TILE_N_ROWS);
            mc = hex_smin(mc, (size_t)m);

-            if (mc > 0 && mc * nc > best_mn) {
-                best_mn = mc * nc;
-                best_m  = mc;
-                best_n  = nc;
+            if (mc == 0) {
+                goto next_nc;
+            }
+
+            size_t mblocks = ((size_t) m + mc - 1) / mc;
+            size_t nblocks = ((size_t) n + nc - 1) / nc;
+            size_t cost    = mblocks * m_block_cost + nblocks * n_block_cost;
+            size_t mn      = mc * nc;
+            if (cost < best_cost || (cost == best_cost && mn > best_mn)) {
+                best_cost = cost;
+                best_mn   = mn;
+                best_m    = mc;
+                best_n    = nc;
            }
        }

@@ -233,7 +254,7 @@ static inline HVX_Vector dequantize_x4x2_q4_0_group_hvx(
    const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
    HVX_Vector v_scales = hvx_vec_splat_f16(*scale);
    // q4x4x2 stores two int4 values per byte. Keep only the selected nibble.
-    HVX_Vector v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq;
+    HVX_Vector v_quants =  Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
    v_quants = Q6_V_vand_VV(v_quants, mask_h4);
    // Shuffle before LUT
    v_quants = Q6_Vb_vshuff_Vb(v_quants);
@@ -257,7 +278,7 @@ static inline void dequantize_x4x2_q4_0_x4groups_hvx(
    // Load all 128 packed bytes (4 contiguous 32-byte groups)
    HVX_Vector vq = hvx_vmemu(packed_128);
    const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
-    HVX_Vector v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq;
+    HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
    v_quants = Q6_V_vand_VV(v_quants, mask_h4);

    // Shuffle before LUT
@@ -277,10 +298,8 @@ static inline void dequantize_x4x2_q4_0_x4groups_hvx(
    v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));

    // Extract individual groups: scatter uses q_mask64 so only first 64 bytes matter
-    out[0] = v_lo;                      // group0 already in [0:63]
-    out[1] = Q6_V_vror_VR(v_lo, 64);    // group1 rotated to [0:63]
-    out[2] = v_hi;                      // group2 already in [0:63]
-    out[3] = Q6_V_vror_VR(v_hi, 64);    // group3 rotated to [0:63]
+    out[0] = v_lo; // group0 already in [0:63]
+    out[1] = v_hi; // group2 already in [0:63]
 }

 // Dequantize one x4x2 Q8_0 group (32 int8 quants) -> 32 FP16 in first 64 bytes.
@@ -384,8 +403,9 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
        size_t row_stride, int weight_type,
        int start_tile, int end_tile) {

-    const int n_k_tiles = k_block / HMX_FP16_TILE_N_COLS;
-    const int qrow_size = (weight_type == HTP_TYPE_Q8_0) ? k_block : (k_block / 2);
+    const int n_k_tiles = (unsigned)k_block / HMX_FP16_TILE_N_COLS;
+    const bool is_q4 = (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL);
+    const int qrow_size = is_q4 ? ((unsigned)k_block / 2) : k_block;

    const HVX_Vector vlut_cvt = (weight_type == HTP_TYPE_IQ4_NL) ? hvx_vmem(iq4_nl_to_fp16_lut) :
                                (weight_type == HTP_TYPE_MXFP4)  ? hvx_vmem(mxfp4_to_fp16_lut) :
@@ -398,47 +418,46 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
    const HVX_Vector v_scat_step = Q6_V_vsplat_R(4);  // 4 bytes = 1 column step
    const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64);  // first 16 words (64 bytes)

-    for (int t = start_tile; t < end_tile; ) {
-        int ct = t / n_k_tiles;  // column tile index
-        int kt = t % n_k_tiles;  // K tile index
+    unsigned ct = (unsigned)start_tile / n_k_tiles;  // column tile index
+    unsigned kt = (unsigned)start_tile % n_k_tiles;  // K tile index
+    for (unsigned t = start_tile; t < end_tile; ) {
+        if (kt >= n_k_tiles) { kt = 0; ct++; }

-        // --- Batch-4 fast path for Q4_0/IQ4_NL: process 4 contiguous K-tiles with one vlut16 per row ---
-        if ((weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL) && (kt % 4 == 0) && (t + 4 <= end_tile) &&
-            ((t + 3) / n_k_tiles == ct)) {
-            int blk_idx      = (kt * 32) / QK_Q4_0x4x2;
-            int sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32;  // 0 or 4
-            bool upper       = (sub_blk_base >= 4);
-            int packed_off   = blk_idx * (QK_Q4_0x4x2 / 2);     // 128 contiguous packed bytes
-            int scale_off    = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE
-                             + sub_blk_base * (int)sizeof(__fp16);   // 4 consecutive scales
+        // --- Batch-4 fast path for Q4: process 4 contiguous K-tiles with one vlut16 per row ---
+        if (is_q4 && (kt % 4 == 0) && (t + 4 <= end_tile) && ((t + 3) / n_k_tiles == ct)) {
+            unsigned blk_idx      = (kt * 32) / QK_Q4_0x4x2;
+            unsigned sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32;  // 0 or 4
+            bool upper            = (sub_blk_base >= 4);
+            unsigned packed_off   = blk_idx * (QK_Q4_0x4x2 / 2);     // 128 contiguous packed bytes
+            unsigned scale_off    = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE
+                                  + sub_blk_base * (int)sizeof(__fp16);   // 4 consecutive scales

            __fp16 *tile_bases[4];
-            for (int g = 0; g < 4; g++) { tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS; }
+            for (unsigned g = 0; g < 4; g++) { tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS; }

            HVX_Vector v_off = v_scat_base;
-            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
-                int row0 = ct * HMX_FP16_TILE_N_COLS + r;
-                int row1 = row0 + 1;
-                const uint8_t *r0 = vtcm_src + row0 * row_stride;
-                const uint8_t *r1 = vtcm_src + row1 * row_stride;

-                HVX_Vector v0[4], v1[4];
+            unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride;
+            unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;
+
+            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
+                HVX_Vector v0[2];
+                const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
                dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt, v0);
-                if (row1 < n_cols) {
-                    dequantize_x4x2_q4_0_x4groups_hvx(r1 + packed_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt, v1);
-                } else {
-                    v1[0] = v1[1] = v1[2] = v1[3] = Q6_V_vzero();
-                }
-
-                for (int g = 0; g < 4; g++) { Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v0[g]); }
+                Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[0]);
+                Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[1]);
                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
-                for (int g = 0; g < 4; g++) { Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v1[g]); }
+
+
+                r0 = vtcm_src + row_offset; row_offset += row_stride;
+                dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt, v0);
+                Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[0]);
+                Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[1]);
                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
            }

            for (int g = 0; g < 4; g++) { (void) *(volatile HVX_Vector *)(tile_bases[g]); }
-
-            t += 4;
+            t += 4; kt += 4;
            continue;
        }

@@ -495,20 +514,19 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
        // --- Single-tile fallback ---
        __fp16 *tile_base = vtcm_dst + t * HMX_FP16_TILE_N_ELMS;

-        if (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL) {
-            int blk_idx  = (kt * 32) / QK_Q4_0x4x2;
-            int sub_blk  = ((kt * 32) % QK_Q4_0x4x2) / 32;
-            bool upper   = (sub_blk >= 4);
-            int byte_off  = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32;
-            int scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16);
+        if (is_q4) {
+            unsigned blk_idx   = (kt * 32) / QK_Q4_0x4x2;
+            unsigned sub_blk   = ((kt * 32) % QK_Q4_0x4x2) / 32;
+            bool upper         = (sub_blk >= 4);
+            unsigned byte_off  = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32;
+            unsigned scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16);

            HVX_Vector v_off = v_scat_base;  // reset to column 0
-            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
-                int row0 = ct * HMX_FP16_TILE_N_COLS + r;
-                int row1 = row0 + 1;
-
-                const uint8_t *r0 = vtcm_src + row0 * row_stride;
-                const uint8_t *r1 = vtcm_src + row1 * row_stride;
+            unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride;
+            unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;
+            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
+                const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
+                const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;

                HVX_Vector v0 = dequantize_x4x2_q4_0_group_hvx(
                    r0 + byte_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
@@ -585,7 +603,7 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
            }
            (void) *(volatile HVX_Vector *)(tile_base);
        }
-        ++t;
+        ++t; ++kt;
    }

    // Drain HVX scatter write buffer: a vmem load on the same HW thread retires
@@ -653,9 +671,13 @@ static void dequantize_x4x2_weight_chunk_to_fp16_tiles(
 // --- End x4x2 dequantizers ---

 // requires external HMX lock
-static void core_dot_chunk_fp16(__fp16 *output, const __fp16 *activation, const __fp16 *weight, const __fp16 *scales,
+static void core_dot_chunk_fp16(__fp16 *restrict output, const __fp16 *restrict activation, const __fp16 *restrict weight, const __fp16 *restrict scales,
                                int n_row_tiles, int n_col_tiles, int n_dot_tiles) {
-    hmx_set_output_scales(scales);
+    __builtin_assume(n_row_tiles > 0);
+    __builtin_assume(n_col_tiles > 0);
+    __builtin_assume(n_dot_tiles > 0);
+
+    Q6_bias_mxmem2_A((void *)scales);

    for (int r = 0; r < n_row_tiles; ++r) {
        for (int c = 0; c < n_col_tiles; ++c) {
@@ -665,16 +687,55 @@ static void core_dot_chunk_fp16(__fp16 *output, const __fp16 *activation, const
            const __fp16 *col_tiles = weight + c * n_dot_tiles * HMX_FP16_TILE_N_ELMS;

            for (int k = 0; k < n_dot_tiles; ++k) {
-                int offset = k * HMX_FP16_TILE_N_ELMS;
-                hmx_load_tile_pair_fp16(row_tiles + offset, col_tiles + offset);
+                Q6_activation_hf_mxmem_RR((unsigned int)row_tiles, 2047);
+                Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, 2047);
+                row_tiles += HMX_FP16_TILE_N_ELMS;
+                col_tiles += HMX_FP16_TILE_N_ELMS;
            }

            __fp16 *out_tile = output + (r * n_col_tiles + c) * HMX_FP16_TILE_N_ELMS;
-            hmx_consume_accumulator_fp16(out_tile);
+            Q6_mxmem_AR_after_hf(out_tile, 0);
        }
    }
 }

+// --- Async HMX matmul job (for pipeline overlap) ---
+
+typedef struct { // argument bundle for one core_dot_chunk_fp16() call made by hmx_matmul_worker_fn()
+    __fp16 *       output;      // passed through as the `output` argument
+    const __fp16 * activation;  // passed through as the `activation` argument
+    const __fp16 * weight;      // passed through as the `weight` argument
+    const __fp16 * scales;      // passed through as the `scales` argument
+    uint32_t       n_row_tiles; // tile counts; passed to the int parameters of the same names
+    uint32_t       n_col_tiles;
+    uint32_t       n_dot_tiles;
+} hmx_matmul_job_t;
+
+static void hmx_matmul_worker_fn(void * data) { // job callback handed to hmx_queue_make_desc(); `data` must point to a hmx_matmul_job_t
+    hmx_matmul_job_t * job = (hmx_matmul_job_t *) data;
+    FARF(HIGH, "hmx-mm-job: n_row_tiles %u n_col_tiles %u n_dot_tiles %u", job->n_row_tiles, job->n_col_tiles, job->n_dot_tiles);
+    core_dot_chunk_fp16(job->output, job->activation, job->weight, job->scales, job->n_row_tiles, job->n_col_tiles, job->n_dot_tiles); // NOTE(review): callee is marked "requires external HMX lock" - presumably held by the queue's worker; confirm
+}
+
+static inline void hmx_matmul_job_init(hmx_matmul_job_t * job,  // descriptor to fill; every field is overwritten
+                                       __fp16 *           output,
+                                       const __fp16 *     activation,
+                                       const __fp16 *     weight,
+                                       const __fp16 *     scales,
+                                       int                n_row_tiles,  // the three counts are stored into uint32_t fields,
+                                       int                n_col_tiles,  // so a negative value would wrap rather than be rejected
+                                       int                n_dot_tiles) {
+    job->output      = output;
+    job->activation  = activation;
+    job->weight      = weight;
+    job->scales      = scales;
+    job->n_row_tiles = n_row_tiles;
+    job->n_col_tiles = n_col_tiles;
+    job->n_dot_tiles = n_dot_tiles;
+}
+
+// --- End async HMX matmul job ---
+
 static void transfer_output_chunk_fp16_to_fp32(float *restrict dst, const __fp16 *restrict vtcm_src, int n_rows, int n_cols, int n) {
    assert(n_cols % HMX_FP16_TILE_N_COLS == 0);
    const int n_col_tiles = n_cols / HMX_FP16_TILE_N_COLS;
@@ -832,12 +893,13 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
    const size_t f32_scratch_per_m = use_dma_activation ? (size_t) params->k * sizeof(float) : 0;

    size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0;
+    // FP16 weight: interleave and activation load have similar per-element cost.
    if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256,
-                             /*per_n=*/3 * vec_dot_size,
-                             /*per_m=*/group_size * vec_dot_size + f32_scratch_per_m,
-                             /*per_mn=*/sizeof(__fp16),
-                             params->m, params->n,
-                             &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
+                           /*per_n=*/3 * vec_dot_size,
+                           /*per_m=*/group_size * vec_dot_size + f32_scratch_per_m,
+                           /*per_mn=*/sizeof(__fp16), params->m, params->n,
+                           /*m_block_cost=*/(size_t) params->n,
+                           /*n_block_cost=*/(size_t) params->m, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
        FARF(HIGH, "%s: grouped path does not fit VTCM, falling back to legacy batched loop", __func__);
        return hmx_mat_mul_permuted_w16a32_batched_legacy(ctx, params);
    }
@@ -1006,13 +1068,15 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co
    const size_t f32_scratch_per_m = use_dma_activation ? (size_t) k * sizeof(float) : 0;

    size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0;
+    // FP16 weight: interleave and activation load have similar per-element cost.
    if (hmx_compute_chunks(vtcm_budget,
-                              /*overhead=*/ 256,
-                              /*per_n=*/    3 * vec_dot_size,  // W + S0 + S1
-                              /*per_m=*/    vec_dot_size + f32_scratch_per_m,  // A + optional F32 scratch
-                              /*per_mn=*/   sizeof(__fp16),     // O
-                              m, n,
-                              &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
+                           /*overhead=*/256,
+                           /*per_n=*/3 * vec_dot_size,                  // W + S0 + S1
+                           /*per_m=*/vec_dot_size + f32_scratch_per_m,  // A + optional F32 scratch
+                           /*per_mn=*/sizeof(__fp16),                   // O
+                           m, n,
+                           /*m_block_cost=*/(size_t) n,
+                           /*n_block_cost=*/(size_t) m, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
        FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget);
        return -1;
    }
@@ -1157,6 +1221,8 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co
 int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict out, const float *restrict x, const uint8_t *restrict w, int m,
                                       int k, int n, int w_type);

+#define FALLBACK_TO_STANDARD 1
+
 int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict dst, const float *restrict activation,
                                     const uint8_t *restrict permuted_weight, int m, int k, int n,
                                     int weight_type) {
@@ -1169,9 +1235,12 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds

    // for large m, k (e.g. prefill FFN Down), use out-stationary version
    if (m >= 128 && k > n && n > 1024) {
-        FARF(MEDIUM, "hmx_matmul_qk: OUT-STATIONARY path m=%d k=%d n=%d type=%d (K_BLOCK=512, %d K-iters with fp16 intermediate)",
-             m, k, n, weight_type, (k + 511) / 512);
-        return mat_mul_qk_0_d16a32_out_stationary(ctx, dst, activation, permuted_weight, m, k, n, weight_type);
+        int rc = mat_mul_qk_0_d16a32_out_stationary(ctx, dst, activation, permuted_weight, m, k, n, weight_type);
+        if (rc != FALLBACK_TO_STANDARD) {
+            return rc;  // 0 success, -1 error
+        }
+        FARF(MEDIUM, "hmx_matmul_qk: out-stationary fallback to standard m=%d k=%d n=%d", m, k, n);
+        // fall through to standard path
    }

    size_t row_stride = get_x4x2_row_stride(weight_type, k);
@@ -1197,9 +1266,10 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
    }

    size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0;
-    if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256,
-                              per_n_cost, /*per_m=*/vec_dot_size, per_mn_cost,
-                              m, n, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
+    // Quantized weight: dequant ~1.5x more expensive per element than activation load.
+    if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, per_n_cost, /*per_m=*/vec_dot_size, per_mn_cost, m, n,
+                           /*m_block_cost=*/(size_t) n * 3,
+                           /*n_block_cost=*/(size_t) m * 2, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
        FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d pipe=%d budget=%zu)",
             __func__, m, k, n, use_pipeline, vtcm_budget);
        return -1;
@@ -1256,9 +1326,8 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
         use_pipeline ? "PIPELINE" : "SEQUENTIAL", m_chunk_n_rows, n_chunk_n_cols,
         (size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget);

-    HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
-
    if (!use_pipeline) {
+        HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
        for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
            // transfer activation matrix chunk into VTCM
            size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
@@ -1318,20 +1387,22 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
                TIMER_STOP(output_store);
            }
        }
+        HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
    } else {
        // 4-stage pipeline: DMA load (A), dequantize (B), HMX matmul (C), store (D)
-        // stage B and D (dequantize and store) are expected to be on the critical path
+        // HMX compute (C) runs on dedicated worker thread, overlapping with HVX stages (B, D).

        // A --> B: vtcm_qweight, 1 buffer
        // B --> C: vtcm_weight0/vtcm_weight1, 2 buffers
        // C --> D: vtcm_output0/vtcm_output1, 2 buffers

-        //
-        // LD ||A3|  | B3 ||
-        // MM ||    C2    ||
-        // ST || D1 |     ||
+        // Async timeline (C overlaps B+D):
+        //   main+HVX:   [A0][Act][B0][A1][sub C0][B1‖C0][A2][wait,sub C1][D0+B2‖C1][wait,sub C2][D1‖C2][wait][D2]
+        //   HMX queue:                   [████ C0 ████████][████ C1 ████████████][████ C2 ████████]

        int n_chunk_cnt = hmx_ceil_div(n, n_chunk_n_cols);
+        hmx_matmul_job_t job_slots[2];  // persistent double-buffered job descriptors
+
        for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
            const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);

@@ -1352,31 +1423,34 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
                transfer_activation_chunk_threaded(ctx, vtcm_activation, activation_chunk, n_rows, k, k);
            }

-            // prologue: B0, A1, C0, B1
+            // prologue: B0, A1, submit C0 (async), B1 (overlaps C0)
            {
-                // B0
+                // B0: wait for DMA, dequant weight chunk 0
                dma_queue_pop(ctx->dma[0]);
                dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type);

-                // A1
+                // A1: issue DMA for weight chunk 1
                const size_t n_cols_A1 = hex_smin(n - 1 * n_chunk_n_cols, n_chunk_n_cols);
                if (1 < n_chunk_cnt) {
                    const uint8_t *qweight_chunk_A1 = permuted_weight + n_chunk_n_cols * row_stride;
                    dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A1), row_stride, row_stride, row_stride, n_cols_A1);
                }

-                // C0
-                core_dot_chunk_fp16((__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[0], vtcm_scales,
-                         hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
+                // submit C0 (non-blocking — HMX worker executes in parallel)
+                hmx_matmul_job_init(&job_slots[0], (__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation,
+                                    (__fp16 *) vtcm_weight_bufs[0], vtcm_scales,
+                                    hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS),
+                                    hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
+                hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[0]));

-                // B1
+                // B1: DMA pop + dequant (runs in parallel with C0 on HMX worker)
                if (1 < n_chunk_cnt) {
                    dma_queue_pop(ctx->dma[0]);
                    dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type);
                }
            }

-            // main loop
+            // main loop: wait C_i → submit C_{i+1} → D_i + B_{i+2} (parallel with C_{i+1})
            for (int i = 0; i < n_chunk_cnt; ++i) {
                const size_t nc    = i * n_chunk_n_cols;
                const size_t nc_p1 = nc + 1 * n_chunk_n_cols;
@@ -1386,36 +1460,41 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
                const size_t n_cols_p1 = hex_smin(n - nc_p1, n_chunk_n_cols);
                const size_t n_cols_p2 = hex_smin(n - nc_p2, n_chunk_n_cols);

-                // issue A_{i+2}
+                // issue A_{i+2}: DMA push (non-blocking)
                if (i + 2 < n_chunk_cnt) {
                    const uint8_t *qweight_chunk_p2 = permuted_weight + nc_p2 * row_stride;
                    dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_p2), row_stride, row_stride, row_stride, n_cols_p2);
                }

-                // wait for HMX (C_{i}) -- C_{i} is done
+                // wait C_i: block until prologue/previous C completes
+                hmx_queue_pop(ctx->hmx_queue);

-                // result of B_{i+1} (input of C_{i+1}) should be ready now
-
-                // issue C_{i+1}
+                // submit C_{i+1} (non-blocking, overlaps with D_i + B_{i+2} below)
+                // job_slots[(i+1)%2] is safe to reuse: that slot was last used by
+                // C_{i-1}, which had already been waited on before C_i was submitted
+                // (C_i itself occupied slot i%2).
                if (i + 1 < n_chunk_cnt) {
-                    core_dot_chunk_fp16((__fp16 *) vtcm_output_bufs[(i + 1) % 2], (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[(i + 1) % 2], vtcm_scales,
-                        hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
+                    hmx_matmul_job_init(&job_slots[(i + 1) % 2], (__fp16 *) vtcm_output_bufs[(i + 1) % 2],
+                                        (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[(i + 1) % 2],
+                                        vtcm_scales, hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS),
+                                        hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
+                    hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[(i + 1) % 2]));
                }

-                // compute D_{i}
+                // D_i: store output (multi-thread HVX, parallel with C_{i+1})
                float *output_chunk = dst + (mr * n + nc);
                transfer_output_chunk_threaded(ctx, output_chunk, vtcm_output_bufs[i % 2], n_rows, n_cols, n);

-                // wait for DMA (A_{i+2}), compute B_{i+2}
+                // B_{i+2}: DMA pop + dequant (multi-thread HVX, parallel with C_{i+1})
                if (i + 2 < n_chunk_cnt) {
                    dma_queue_pop(ctx->dma[0]);
                    dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type);
                }
            }
        }
-    }

-    HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
+        hmx_queue_suspend(ctx->hmx_queue);
+    }

    TIMER_STOP(total);

@@ -1434,10 +1513,13 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
 }

 // C += AB
-void core_mma_chunk_fp16(__fp16 *c, const __fp16 *a, const __fp16 *b, const __fp16 *col_scales, const __fp16 *eye_tile,
+void core_mma_chunk_fp16(__fp16 *restrict c, const __fp16 *restrict a, const __fp16 *restrict b, const __fp16 *restrict col_scales, const __fp16 *restrict eye_tile,
                         int n_row_tiles, int n_col_tiles, int n_dot_tiles, bool zero_init) {
+    __builtin_assume(n_row_tiles > 0);
+    __builtin_assume(n_col_tiles > 0);
+    __builtin_assume(n_dot_tiles > 0);

-    hmx_set_output_scales(col_scales);
+    Q6_bias_mxmem2_A((void *)col_scales);

    for (int i = 0; i < n_row_tiles; ++i) {
        for (int j = 0; j < n_col_tiles; ++j) {
@@ -1448,15 +1530,17 @@ void core_mma_chunk_fp16(__fp16 *c, const __fp16 *a, const __fp16 *b, const __fp

            __fp16 *accum_tile = c + (i * n_col_tiles + j) * HMX_FP16_TILE_N_ELMS;
            if (!zero_init) {
-                hmx_load_tile_pair_fp16(accum_tile, eye_tile);
+                Q6_activation_hf_mxmem_RR((unsigned int)accum_tile, 2047);
+                Q6_weight_hf_mxmem_RR((unsigned int)eye_tile, 2047);
            }

            for (int k = 0; k < n_dot_tiles; ++k) {
-                int offset = k * HMX_FP16_TILE_N_ELMS;
-                hmx_load_tile_pair_fp16(row_tiles + offset, col_tiles + offset);
+                Q6_activation_hf_mxmem_RR((unsigned int)row_tiles, 2047);
+                Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, 2047);
+                row_tiles += HMX_FP16_TILE_N_ELMS;
+                col_tiles += HMX_FP16_TILE_N_ELMS;
            }
-
-            hmx_consume_accumulator_fp16(accum_tile);
+            Q6_mxmem_AR_after_hf(accum_tile, 0);
        }
    }
 }
@@ -1540,12 +1624,41 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict

    const size_t vtcm_budget = ctx->vtcm_size;

-    const size_t M_BLOCK_SIZE = 512;
-    const size_t N_BLOCK_SIZE = 512;
-    const size_t K_BLOCK_SIZE = 512;
+    const size_t K_BLOCK_SIZE = 1024;

-    // Compute precise buffer sizes
+    // Fallback: if k doesn't need K-blocking, out-stationary has no advantage
+    const size_t k_iters_check = (k + K_BLOCK_SIZE - 1) / K_BLOCK_SIZE;
+    if (k_iters_check <= 1) {
+        FARF(MEDIUM, "%s: K_BLK=%zu >= k=%d, fallback to standard path", __func__, K_BLOCK_SIZE, k);
+        return FALLBACK_TO_STANDARD;
+    }
+
+    // Dynamic M,N search via hmx_compute_chunks
    const size_t sub_row_stride_alloc = get_x4x2_row_stride(weight_type, K_BLOCK_SIZE);
+    const size_t per_m                = K_BLOCK_SIZE * sizeof(float)  // scratch1: M×K×4 (act DMA staging F32)
+                         + K_BLOCK_SIZE * sizeof(__fp16);             // activation: M×K×2 (F16 tiles)
+    const size_t per_n = sub_row_stride_alloc                         // scratch0: N×sub_row(K) (packed quant)
+                         + K_BLOCK_SIZE * sizeof(__fp16);             // weight: N×K×2 (F16 tiles)
+    const size_t per_mn       = sizeof(__fp16);                       // output: M×N×2 (out-stationary)
+    // Alignment margin: hex_align_up can add up to 2047 bytes per buffer;
+    // scratch1 (mc×6144) is naturally 2048-aligned, remaining 4 buffers need margin
+    const size_t align_margin = 4 * HMX_FP16_TILE_SIZE;
+    const size_t overhead     = HMX_FP16_TILE_SIZE + 256 + align_margin;  // eye_tile + scales + alignment
+
+    size_t       M_BLOCK_SIZE, N_BLOCK_SIZE, vtcm_used;
+    // Cost-based search: minimize ceil(m/mc)*m_block_cost + ceil(n/nc)*n_block_cost.
+    // From profiling: wt_dequant per element ≈ 1.5× activation load per element.
+    // m_block_cost = n*3: each extra M-block re-dequants all N×K weight (expensive).
+    // n_block_cost = m*2: each extra N-block re-loads all M×K activation (cheaper).
+    const size_t m_block_cost = (size_t) n * 3;
+    const size_t n_block_cost = (size_t) m * 2;
+    if (hmx_compute_chunks(vtcm_budget, overhead, per_n, per_m, per_mn, m, n, m_block_cost, n_block_cost, &M_BLOCK_SIZE,
+                           &N_BLOCK_SIZE, &vtcm_used) != 0) {
+        FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget);
+        return -1;
+    }
+
+    // Compute precise buffer sizes from searched M,N and fixed K
    const size_t weight_size  = hex_align_up(N_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
    const size_t act_size     = hex_align_up(M_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
    const size_t out_size     = hex_align_up(M_BLOCK_SIZE * N_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
@@ -1554,7 +1667,8 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict

    const size_t total_vtcm = weight_size + act_size + out_size + scratch0_sz + scratch1_sz + HMX_FP16_TILE_SIZE + 256;
    if (total_vtcm > vtcm_budget) {
-        FARF(HIGH, "%s: VTCM too small: need %zu have %zu (m=%d k=%d n=%d)", __func__, total_vtcm, vtcm_budget, m, k, n);
+        FARF(HIGH, "%s: VTCM overflow after search: need %zu have %zu (M=%zu N=%zu K=%zu)", __func__, total_vtcm,
+             vtcm_budget, M_BLOCK_SIZE, N_BLOCK_SIZE, K_BLOCK_SIZE);
        return -1;
    }

@@ -1568,8 +1682,8 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
    __fp16  *vtcm_scales     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256);
    assert((size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base) <= vtcm_budget);

-    FARF(MEDIUM, "%s: m=%d k=%d n=%d wtype=%d vtcm=%zu/%zu", __func__, m, k, n, weight_type,
-         (size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget);
+    FARF(HIGH, "hmx-mm: m=%d k=%d n=%d wtype=%d block M=%zu N=%zu K=%zu vtcm=%zu/%zu", m, k, n, weight_type,
+         M_BLOCK_SIZE, N_BLOCK_SIZE, K_BLOCK_SIZE, (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget);

    // initialize eye tile (32x32 identity matrix)
    {
--- a/ggml/src/ggml-hexagon/htp/hmx-queue.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-queue.c
@@ -0,0 +1,158 @@
+#pragma clang diagnostic ignored "-Wunused-function"
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <qurt_thread.h>
+#include <qurt_futex.h>
+
+#include <HAP_compute_res.h>
+
+#include "hmx-queue.h"
+
+#define QURT_LOWEST_PRIO (254)
+
+static inline void hmx_lock(struct hmx_queue *q)
+{
+    if (!q->hmx_locked) {
+        HAP_compute_res_hmx_lock(q->hap_rctx);
+        q->hmx_locked = true;
+    }
+}
+
+static inline void hmx_unlock(struct hmx_queue *q)
+{
+    if (q->hmx_locked) {
+        HAP_compute_res_hmx_unlock(q->hap_rctx);
+        q->hmx_locked = false;
+    }
+}
+
+static inline void hmx_queue_process(struct hmx_queue *q, bool* killed) { // drain descriptors in [idx_read, idx_write)
+    unsigned int ir = atomic_load(&q->idx_read);
+    // Re-read idx_write on every iteration so descriptors pushed while we run are handled in this pass too.
+    while (ir != atomic_load(&q->idx_write)) {
+        struct hmx_queue_desc *d = &q->desc[ir];
+        if (!d->done) {
+            FARF(HIGH, "hmx-queue-process: ir %u func %p data %p", ir, d->func, d->data);
+            // The func field doubles as a signal: the enum values below are control requests, anything else is called.
+            enum hmx_queue_signal sig = (enum hmx_queue_signal) (unsigned int) d->func;
+            switch (sig) {
+                case HMX_QUEUE_NOOP:    /* noop */;     break;
+                case HMX_QUEUE_KILL:    *killed = true; break; // reported to the caller through *killed
+                case HMX_QUEUE_SUSPEND: hmx_unlock(q);  break; // drop the HMX lock if it is currently held
+                default:
+                    hmx_lock(q); // taken on first real work item; stays held until a SUSPEND signal
+                    d->func(d->data);
+                    break;
+            }
+            // Mark the descriptor complete; hmx_queue_pop() spins on this flag.
+            atomic_fetch_add(&d->done, 1);
+        }
+        // Advance and publish the read index after every descriptor.
+        ir = (ir + 1) & q->idx_mask;
+        atomic_store(&q->idx_read, ir);
+    }
+}
+
+static void hmx_queue_thread(void * arg) { // worker thread entry point; arg is the struct hmx_queue to serve
+    struct hmx_queue * q = (struct hmx_queue *) arg;
+
+    FARF(HIGH, "hmx-queue-thread: started");
+
+    bool killed = false; // set by hmx_queue_process() when it handles HMX_QUEUE_KILL
+    // Poll seqn for up to HMX_QUEUE_POLL_COUNT iterations before blocking on the futex.
+    unsigned int poll_cnt  = HMX_QUEUE_POLL_COUNT;
+    unsigned int prev_seqn = 0;
+    while (!killed) {
+        unsigned int seqn = atomic_load(&q->seqn);
+        if (seqn == prev_seqn) { // no push since the last pass
+            if (--poll_cnt) { hex_pause(); continue; }
+            FARF(HIGH, "hmx-queue-thread: sleeping");
+            qurt_futex_wait(&q->seqn, prev_seqn); // NOTE(review): poll_cnt is 0 here; if this returns with seqn unchanged, --poll_cnt wraps to UINT_MAX - confirm
+            continue;
+        }
+        prev_seqn = seqn;
+        poll_cnt  = HMX_QUEUE_POLL_COUNT; // new work seen: restart the polling budget
+
+        FARF(HIGH, "hmx-queue-thread: new work");
+
+        hmx_queue_process(q, &killed);
+    }
+
+    FARF(HIGH, "hmx-queue-thread: stopped");
+}
+
+struct hmx_queue * hmx_queue_create(size_t capacity, uint32_t hap_rctx) {
+    // Returns NULL on any failure, after releasing whatever was already allocated (no leaks).
+    capacity = hex_ceil_pow2(capacity);
+    struct hmx_queue * q = (struct hmx_queue *) memalign(32, sizeof(struct hmx_queue));
+    if (q == NULL) {
+        FARF(ERROR, "%s: failed to allocate HMX queue\n", __FUNCTION__);
+        return NULL;
+    }
+    memset(q, 0, sizeof(struct hmx_queue)); // desc/stack start out NULL, so the fail path can free them blindly
+    q->capacity = capacity;
+    q->idx_mask = capacity - 1;
+    q->hap_rctx = hap_rctx;
+
+    q->desc = (struct hmx_queue_desc *) memalign(64, capacity * sizeof(struct hmx_queue_desc));
+    if (!q->desc) {
+        FARF(ERROR, "hmx-queue: failed to allocate HMX queue descriptors\n");
+        goto fail;
+    }
+    memset(q->desc, 0, capacity * sizeof(struct hmx_queue_desc));
+
+    const size_t stack_size = HMX_QUEUE_THREAD_STACK_SIZE;
+    q->stack = (unsigned char *) memalign(64, stack_size);
+    if (!q->stack) {
+        FARF(ERROR, "hmx-queue: thread stack allocation failed (%zu bytes)", stack_size);
+        goto fail;
+    }
+    memset(q->stack, 0, stack_size);
+
+    // Match caller thread priority (same pattern as worker-pool.c), clamped to [1, QURT_LOWEST_PRIO].
+    int prio = qurt_thread_get_priority(qurt_thread_get_id());
+    if (prio < 1)                { prio = 1; }
+    if (prio > QURT_LOWEST_PRIO) { prio = QURT_LOWEST_PRIO; }
+
+    qurt_thread_attr_t attr;
+    qurt_thread_attr_init(&attr);
+    qurt_thread_attr_set_stack_addr(&attr, q->stack);
+    qurt_thread_attr_set_stack_size(&attr, stack_size);
+    qurt_thread_attr_set_priority(&attr, prio);
+    qurt_thread_attr_set_name(&attr, "hmx-queue");
+
+    int err = qurt_thread_create(&q->thread, &attr, hmx_queue_thread, q);
+    if (err) {
+        FARF(ERROR, "hmx-worker: thread create failed (%d)", err);
+        goto fail;
+    }
+    FARF(HIGH, "hmx-queue: capacity %u\n", (unsigned int) capacity);
+    return q;
+
+fail: // single cleanup path: free(NULL) is a no-op, so members not yet allocated are safe to pass
+    free(q->stack);
+    free(q->desc);
+    free(q);
+    return NULL;
+}
+
+void hmx_queue_delete(struct hmx_queue * q) {
+    if (!q) {
+        return;
+    }
+
+    // Tell the worker to exit.
+    hmx_queue_flush(q);
+    hmx_queue_signal(q, HMX_QUEUE_KILL);
+    hmx_queue_flush(q);
+
+    int status;
+    qurt_thread_join(q->thread, &status);
+
+    free(q->desc);
+    free(q->stack);
+    free(q);
+}
--- a/ggml/src/ggml-hexagon/htp/hmx-queue.h
+++ b/ggml/src/ggml-hexagon/htp/hmx-queue.h
@@ -0,0 +1,134 @@
+#ifndef HMX_QUEUE_H
+#define HMX_QUEUE_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdatomic.h>
+
+#include <hexagon_types.h>
+#include <qurt_thread.h>
+#include <qurt_futex.h>
+#include <HAP_farf.h>
+
+#include "hex-utils.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define HMX_QUEUE_THREAD_STACK_SIZE (16 * 1024)
+#define HMX_QUEUE_POLL_COUNT        2000
+
+typedef void (*hmx_queue_func)(void *);
+
+// Dummy funcs used as signals
+enum hmx_queue_signal {
+    HMX_QUEUE_NOOP = 0, // aka NULL
+    HMX_QUEUE_SUSPEND,
+    HMX_QUEUE_KILL
+};
+
+struct hmx_queue_desc {
+    hmx_queue_func   func;
+    void *           data;
+    atomic_uint      done;
+};
+
+struct hmx_queue {
+    struct hmx_queue_desc * desc;
+    atomic_uint      idx_write; // updated by producer (push)
+    atomic_uint      idx_read;  // updated by consumer (process)
+    unsigned int     idx_pop;   // updated by producer (pop)
+    uint32_t         idx_mask;
+    uint32_t         capacity;
+
+    atomic_uint      seqn;      // incremented for all pushes, used with futex
+    qurt_thread_t    thread;
+    void *           stack;
+    uint32_t         hap_rctx;
+    bool             hmx_locked;
+};
+
+struct hmx_queue * hmx_queue_create(size_t capacity, uint32_t hap_rctx);
+void hmx_queue_delete(struct hmx_queue * q);
+
+static inline struct hmx_queue_desc hmx_queue_make_desc(hmx_queue_func func, void * data) {
+    struct hmx_queue_desc d = { func, data };
+    return d;
+}
+
+static inline bool hmx_queue_push(struct hmx_queue * q, struct hmx_queue_desc d) { // enqueue d; false if the ring is full
+    unsigned int ir = atomic_load(&q->idx_read);
+    unsigned int iw = q->idx_write;
+    // One slot is always left unused so that "full" can be told apart from "empty".
+    if (((iw + 1) & q->idx_mask) == ir) { // NOTE(review): compares against idx_read, not idx_pop - a processed but un-popped slot can be reused; confirm callers pop in time
+        FARF(HIGH, "hmx-queue-push: queue is full\n");
+        return false;
+    }
+    // Clear the completion flag on the local copy before it is copied into the ring.
+    atomic_store(&d.done, 0);
+
+    FARF(HIGH, "hmx-queue-push: iw %u func %p data %p\n", iw, d.func, d.data);
+    // Fill the slot first, then publish the new write index.
+    q->desc[iw] = d;
+    atomic_store(&q->idx_write, (iw + 1) & q->idx_mask);
+    // bump the sequence number the worker thread polls / sleeps on, then wake it
+    atomic_fetch_add(&q->seqn, 1);
+    qurt_futex_wake(&q->seqn, 1);
+
+    return true;
+}
+
+static inline bool hmx_queue_signal(struct hmx_queue *q, enum hmx_queue_signal sig) {
+    return hmx_queue_push(q, hmx_queue_make_desc((hmx_queue_func) sig, NULL));
+}
+
+static inline bool hmx_queue_empty(struct hmx_queue * q) {
+    return q->idx_pop == q->idx_write;
+}
+
+static inline uint32_t hmx_queue_depth(struct hmx_queue * q) { // number of descriptors pushed but not yet processed
+    return (atomic_load(&q->idx_write) - atomic_load(&q->idx_read)) & q->idx_mask; // masked difference handles index wrap-around
+}
+
+static inline uint32_t hmx_queue_capacity(struct hmx_queue * q) {
+    return q->capacity;
+}
+
+static inline struct hmx_queue_desc hmx_queue_pop(struct hmx_queue * q) { // retire the oldest pushed descriptor, waiting for it to complete
+    unsigned int ip = q->idx_pop;
+    unsigned int iw = q->idx_write;
+    // Nothing outstanding: return a descriptor with func == NULL and data == NULL.
+    struct hmx_queue_desc rd = { NULL, NULL }; // NOTE(review): a HMX_QUEUE_NOOP signal also has func == 0, so hmx_queue_flush() would stop at it - confirm intended
+    if (ip == iw) {
+        return rd;
+    }
+
+    // Spin until the worker thread marks this descriptor done
+    struct hmx_queue_desc * d = &q->desc[ip];
+    while (!atomic_load(&d->done)) {
+        FARF(HIGH, "hmx-queue-pop: waiting for HMX queue : %u\n", ip);
+        hex_pause();
+    }
+    // Return a copy of the descriptor and advance the pop index.
+    rd = *d;
+    q->idx_pop = (ip + 1) & q->idx_mask;
+
+    FARF(HIGH, "hmx-queue-pop: ip %u func %p data %p\n", ip, rd.func, rd.data);
+    return rd;
+}
+
+static inline void hmx_queue_flush(struct hmx_queue * q) {
+    while (hmx_queue_pop(q).func != NULL) ;
+}
+
+static inline void hmx_queue_suspend(struct hmx_queue *q) {
+    hmx_queue_signal(q, HMX_QUEUE_SUSPEND);
+    hmx_queue_flush(q);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif /* HMX_QUEUE_H */
--- a/ggml/src/ggml-hexagon/htp/hmx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hmx-utils.h
@@ -14,10 +14,6 @@

 #define HMX_INLINE_ALWAYS inline __attribute__((unused, always_inline))

-static HMX_INLINE_ALWAYS void hmx_set_output_scales(const void *scales) {
-    asm volatile("bias = mxmem2(%0)" :: "r"(scales));
-}
-
 // Initialise aligned 256-byte area with scale vector + zero padding.
 static HMX_INLINE_ALWAYS void hmx_init_column_scales(void *out_scales, HVX_Vector v_scale) {
    HVX_Vector *pv = (HVX_Vector *)out_scales;
@@ -25,58 +21,6 @@ static HMX_INLINE_ALWAYS void hmx_init_column_scales(void *out_scales, HVX_Vecto
    *pv   = Q6_V_vzero();
 }

-// Load multiple contiguous tiles with :deep streaming.
-// Rt = total region size - 1; the hardware streams through [Rs, Rs + Rt].
-// IMPORTANT: the tile region [Rs, Rs + Rt] must NOT cross a VTCM 4 MB bank
-// boundary, otherwise the mxmem instruction will raise a precise bus error.
-// Callers must ensure their VTCM layout satisfies this constraint.
-static HMX_INLINE_ALWAYS void hmx_load_tiles_fp16(const __fp16 *row_tiles,
-                                                   const __fp16 *col_tiles,
-                                                   size_t n_tiles) {
-    size_t limit = n_tiles * HMX_FP16_TILE_SIZE - 1;
-    asm volatile(
-        "{ activation.hf = mxmem(%0, %1):deep\n"
-        "weight.hf = mxmem(%2, %3) }\n"
-        :: "r"(row_tiles), "r"(limit), "r"(col_tiles), "r"(limit)
-        : "memory");
-}
-
-// Load a single activation+weight tile pair (no :deep streaming).
-// Rt defines the accessible region [Rs, Rs+Rt].  Following the reference formula
-// (limit = n_tiles * HMX_FP16_TILE_SIZE - 1), for a single tile Rt = 2047.
-// The original code used Rt=0x7FFF (32 KB region); when dynamic VTCM allocation
-// places a tile near a 4 MB bank boundary, the oversized region crosses it and
-// triggers a precise bus error (0x2601).  Rt=2047 confines accesses to exactly
-// one 2048-byte tile while covering all 16 HVX vectors (offsets 0..2047).
-static HMX_INLINE_ALWAYS void hmx_load_tile_pair_fp16(const __fp16 *act_tile,
-                                                       const __fp16 *wt_tile) {
-    asm volatile(
-        "{ activation.hf = mxmem(%0, %1)\n"
-        "weight.hf = mxmem(%2, %3) }\n"
-        :: "r"(act_tile), "r"(2047),
-           "r"(wt_tile),  "r"(2047)
-        : "memory");
-}
-
-static HMX_INLINE_ALWAYS void hmx_consume_accumulator_fp16(__fp16 *out) {
-    // Use the combined convert-and-store instruction (matches the reference
-    // Q6_mxmem_AR_after_hf intrinsic).  The previous two-instruction sequence
-    // "cvt.hf = acc(2); mxmem = cvt" used an undocumented Rs=2 parameter.
-    asm volatile(
-        "mxmem(%0, %1):after.hf = acc\n"
-        :: "r"(out), "r"(0)
-        : "memory");
-}
-
-// Compute inner product of two vectors of tiles and store result.
-static HMX_INLINE_ALWAYS void hmx_dot_fp16(__fp16 *out,
-                                            const __fp16 *row_tiles,
-                                            const __fp16 *col_tiles,
-                                            size_t n_tiles) {
-    hmx_load_tiles_fp16(row_tiles, col_tiles, n_tiles);
-    hmx_consume_accumulator_fp16(out);
-}
-
 // --- VTCM sequential allocator (from htp-ops-lib/include/dsp/vtcm_mgr.h) ---

 static inline uint8_t *vtcm_seq_alloc(uint8_t **vtcm_ptr, size_t size) {
--- a/ggml/src/ggml-hexagon/htp/htp-ctx.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h
@@ -2,6 +2,7 @@
 #define HTP_CTX_H

 #include "hex-dma.h"
+#include "hmx-queue.h"
 #include "htp-ops.h"
 #include "worker-pool.h"

@@ -30,6 +31,8 @@ struct htp_spad {
    uint32_t                  size_per_thread; // size per thread
 };

+struct htp_context;
+
 // Context while processing an Op
 // TODO: fold this into the main context
 struct htp_ops_context {
@@ -72,6 +75,10 @@ struct htp_context {
    atomic_bool            vtcm_needs_release;

    struct htp_ops_context octx;
+
+#ifdef HTP_HAS_HMX
+    struct hmx_queue *     hmx_queue; // Async HMX queue for pipeline overlap
+#endif
 };

 int op_matmul(struct htp_ops_context * octx);
--- a/ggml/src/ggml-hexagon/htp/htp-ops.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ops.h
@@ -91,7 +91,12 @@ enum htp_op_code {
 #define HTP_OP_MAX_BUFS    8
 #define HTP_OP_MAX_REQS    256
 #define HTP_OP_MAX_TENSORS (HTP_OP_MAX_REQS * HTP_OP_MAX_INPUTS + HTP_OP_MAX_REQS)
+
+#if __HVX_ARCH__ < 75
+#define HTP_OP_MAX_VMEM    (3167538380u)
+#else
 #define HTP_OP_MAX_VMEM    (3221225472u)
+#endif

 enum htp_tensor_flags {
    HTP_TENSOR_COMPUTE = (1U << 0), // Tensor buffer temporal compute data (not weights)
--- a/ggml/src/ggml-hexagon/htp/hvx-base.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-base.h
@@ -116,9 +116,14 @@ static inline HVX_VectorPred hvx_vec_is_nan_f16(HVX_Vector v) {
 }

 static inline HVX_Vector hvx_vec_f32_to_f16_shuff(HVX_Vector v0, HVX_Vector v1) {
+#if __HVX_ARCH__ >= 81
+    HVX_Vector q0 = Q6_Vqf32_equals_Vsf(v0);
+    HVX_Vector q1 = Q6_Vqf32_equals_Vsf(v1);
+#else
    const HVX_Vector zero = Q6_V_vzero();
    HVX_Vector q0 = Q6_Vqf32_vadd_VsfVsf(v0, zero);
    HVX_Vector q1 = Q6_Vqf32_vadd_VsfVsf(v1, zero);
+#endif
    return Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(q1, q0));
 }

--- a/ggml/src/ggml-hexagon/htp/main.c
+++ b/ggml/src/ggml-hexagon/htp/main.c
@@ -18,8 +18,9 @@
 #include <remote.h>
 #include <string.h>

-#include "hex-dma.h"
 #include "hex-utils.h"
+#include "hex-dma.h"
+#include "hmx-queue.h"

 #define GGML_COMMON_DECL_C
 #include "ggml-common.h"
@@ -324,6 +325,14 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que

 #ifdef HTP_HAS_HMX
    ctx->hmx_enabled = use_hmx;
+    ctx->hmx_queue   = NULL;
+    if (use_hmx) {
+        ctx->hmx_queue = hmx_queue_create(16, ctx->vtcm_rctx);
+        if (!ctx->hmx_queue) {
+            FARF(ERROR, "hmx-queue-create failed");
+            ctx->hmx_enabled = false;
+        }
+    }
    FARF(HIGH, "HMX %s (use_hmx=%d)", ctx->hmx_enabled ? "enabled" : "disabled", use_hmx);
 #endif

@@ -389,7 +398,11 @@ AEEResult htp_iface_stop(remote_handle64 handle) {
    }

 #ifdef HTP_HAS_HMX
-    ctx->hmx_enabled = 0;
+    if (ctx->hmx_queue) {
+        hmx_queue_delete(ctx->hmx_queue);
+        ctx->hmx_queue = NULL;
+    }
+    ctx->hmx_enabled = false;
 #endif

    vtcm_free(ctx);
--- a/models/templates/Reka-Edge.jinja
+++ b/models/templates/Reka-Edge.jinja
@@ -0,0 +1,161 @@
+{%- macro render_content(content, num_img_tokens, num_video_frames) -%}
+    {%- if content is string -%}
+        {{- content -}}
+    {%- elif content is sequence -%}
+        {%- set ns = namespace(out="", prev_was_text=false) -%}
+        {%- for item in content -%}
+            {%- set item_type = item.get("type") -%}
+            {%- if item_type == "text" or item.get("text") is not none -%}
+                {%- set text = item.get("text", "") -%}
+                {%- if text -%}
+                    {%- if ns.prev_was_text -%}
+                        {%- set ns.out = ns.out ~ " " -%}
+                    {%- endif -%}
+                    {%- set ns.out = ns.out ~ text -%}
+                {%- endif -%}
+                {%- set ns.prev_was_text = text != "" -%}
+            {%- elif item_type in ["image", "image_url"] or item.get("image") is not none or item.get("image_url") is not none -%}
+                {%- set ns.out = ns.out ~ "<image>" ~ ("<REKA_IMG_TOKEN>" * num_img_tokens) ~ "</image>" -%}
+                {%- set ns.prev_was_text = false -%}
+            {%- elif item_type in ["video", "video_url"] or item.get("video") is not none or item.get("video_url") is not none -%}
+                {%- set repeat_tokens = num_img_tokens * num_video_frames -%}
+                {%- set ns.out = ns.out ~ "<video>" ~ ("<REKA_IMG_TOKEN>" * repeat_tokens) ~ "</video>" -%}
+                {%- set ns.prev_was_text = false -%}
+            {%- endif -%}
+        {%- endfor -%}
+        {{- ns.out -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- set ns = namespace(out="", last_query_index=messages|length - 1) -%}
+{%- for msg in messages[::-1] -%}
+    {%- set idx = messages|length - 1 - loop.index0 -%}
+    {%- if msg.get("role") == "user" -%}
+        {%- set content = msg.get("content", "") -%}
+        {%- if not (content is string and content.startswith("<tool_response>") and content.endswith("</tool_response>")) -%}
+            {%- set ns.last_query_index = idx -%}
+            {%- break -%}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+{%- set last_query_index = ns.last_query_index -%}
+{%- set num_img_tokens = num_img_tokens | default(64, true) | int -%}
+{%- set num_video_frames = num_video_frames | default(6, true) | int -%}
+{%- set start_idx = 0 -%}
+{%- set system_text = "" -%}
+{%- if messages|length > 0 and messages[0].get("role") in ["system", "developer"] -%}
+    {%- set system_text = render_content(messages[0].get("content", ""), num_img_tokens, num_video_frames) -%}
+    {%- set start_idx = 1 -%}
+{%- endif -%}
+{%- if tools or system_text -%}
+    {%- set preamble_ns = namespace(text="") -%}
+    {%- if system_text -%}
+        {%- set preamble_ns.text = "system: " ~ system_text -%}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- if preamble_ns.text -%}
+            {%- set preamble_ns.text = preamble_ns.text ~ "\n\n" -%}
+        {%- else -%}
+            {%- set preamble_ns.text = "system: " -%}
+        {%- endif -%}
+        {%- set preamble_ns.text = preamble_ns.text
+            ~ "# Tools\n\n"
+            ~ "You may call one or more functions to assist with the user query.\n\n"
+            ~ "You are provided with function signatures within <tools></tools> XML tags:\n"
+            ~ "<tools>" -%}
+        {%- for tool in tools -%}
+            {%- set preamble_ns.text = preamble_ns.text ~ "\n" ~ (tool | tojson(ensure_ascii=True)) -%}
+        {%- endfor -%}
+        {%- set preamble_ns.text = preamble_ns.text
+            ~ "\n</tools>\n\n"
+            ~ "For each function call, return a json object with function name and arguments "
+            ~ "within <tool_call></tool_call> XML tags:\n"
+            ~ "<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>" -%}
+    {%- endif -%}
+    {%- set ns.out = ns.out ~ preamble_ns.text ~ "\n\n<sep>" -%}
+{%- endif -%}
+{%- for idx in range(start_idx, messages|length) -%}
+    {%- set message = messages[idx] -%}
+    {%- set role = message.get("role") -%}
+    {%- set content = message.get("content") -%}
+    {%- if role == "user" -%}
+        {%- set prefix_ns = namespace(value="human: ") -%}
+        {%- if content is sequence and content is not string -%}
+            {%- for item in content -%}
+                {%- if item.get("type") == "text" or item.get("text") is not none -%}
+                    {%- set text = item.get("text", "") -%}
+                    {%- if text -%}
+                        {%- break -%}
+                    {%- endif -%}
+                {%- elif item.get("type") in ["image", "image_url", "video", "video_url"] -%}
+                    {%- set prefix_ns.value = "human:" -%}
+                    {%- break -%}
+                {%- endif -%}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set ns.out = ns.out ~ prefix_ns.value ~ render_content(content, num_img_tokens, num_video_frames) ~ "<sep>" -%}
+    {%- elif role == "assistant" -%}
+        {%- set tool_calls = message.get("tool_calls") -%}
+        {%- set content_text = render_content(content, num_img_tokens, num_video_frames) -%}
+        {%- set reasoning_text = "" -%}
+        {%- if message.get("reasoning_content") is string -%}
+            {%- set reasoning_text = message.get("reasoning_content") -%}
+        {%- elif "</think>" in content_text -%}
+            {%- set reasoning_text = content_text.split("</think>", 1)[0].rstrip("\n").split("<think>")[-1].lstrip("\n") -%}
+            {%- set content_text = content_text.split("</think>", 1)[1].lstrip("\n") -%}
+        {%- endif -%}
+        {%- set ns.out = ns.out ~ "assistant: " -%}
+        {%- set include_thinking = enable_thinking is true
+            and idx > last_query_index
+            and (idx == messages|length - 1 or reasoning_text)
+        -%}
+        {%- if include_thinking -%}
+            {%- set ns.out = ns.out ~ "<think>\n" ~ (reasoning_text.strip() ) ~ "\n</think>\n\n" -%}
+        {%- endif -%}
+        {%- set ns.out = ns.out ~ content_text -%}
+        {%- if tool_calls -%}
+            {%- if content_text and not ns.out.endswith("\n") -%}
+                {%- set ns.out = ns.out ~ "\n" -%}
+            {%- endif -%}
+            {%- for tool_call in tool_calls -%}
+                {%- if tool_call.get("function") is not none -%}
+                    {%- set tool_call = tool_call.get("function") -%}
+                {%- endif -%}
+                {%- set arguments = tool_call.get("arguments", {}) -%}
+                {%- if arguments is string -%}
+                    {%- set arguments_json = arguments -%}
+                {%- elif arguments is mapping -%}
+                    {%- set arguments_json = arguments | tojson(ensure_ascii=True) -%}
+                {%- else -%}
+                    {%- set arguments_json = arguments | tojson(ensure_ascii=True) -%}
+                {%- endif -%}
+                {%- set ns.out = ns.out
+                    ~ "<tool_call>\n"
+                    ~ "{\"name\": \"" ~ tool_call.get("name", "") ~ "\", \"arguments\": "
+                    ~ arguments_json
+                    ~ "}\n</tool_call>" -%}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- if not (continue_final_message and idx == messages|length - 1) -%}
+            {%- set ns.out = ns.out ~ "\n\n<sep>" -%}
+        {%- endif -%}
+    {%- elif role == "tool" -%}
+        {%- if idx == start_idx or messages[idx - 1].get("role") != "tool" -%}
+            {%- set ns.out = ns.out ~ "human: " -%}
+        {%- endif -%}
+        {%- set response_text = render_content(content, num_img_tokens, num_video_frames) -%}
+        {%- set ns.out = ns.out ~ "<tool_response>\n" ~ response_text ~ "\n</tool_response>" -%}
+        {%- if idx == messages|length - 1 or messages[idx + 1].get("role") != "tool" -%}
+            {%- set ns.out = ns.out ~ "<sep>" -%}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+{%- if add_generation_prompt
+    and (messages|length == 0 or messages[-1].get("role") != "assistant")
+-%}
+    {%- if enable_thinking is true -%}
+        {%- set ns.out = ns.out ~ "assistant: <think>\n" -%}
+    {%- else -%}
+        {%- set ns.out = ns.out ~ "assistant:" -%}
+    {%- endif -%}
+{%- endif -%}
+{{- ns.out -}}
--- a/pocs/vdot/CMakeLists.txt
+++ b/pocs/vdot/CMakeLists.txt
@@ -1,9 +1,9 @@
 set(TARGET llama-vdot)
 add_executable(${TARGET} vdot.cpp)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 set(TARGET llama-q8dot)
 add_executable(${TARGET} q8dot.cpp)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -10,7 +10,7 @@ function(llama_build source)
    endif()

    add_executable(${TEST_TARGET} ${TEST_SOURCES})
-    target_link_libraries(${TEST_TARGET} PRIVATE common)
+    target_link_libraries(${TEST_TARGET} PRIVATE llama llama-common)
    if (LLAMA_TESTS_INSTALL)
        install(TARGETS ${TEST_TARGET} RUNTIME)
    endif()
@@ -105,7 +105,7 @@ function(llama_build_and_test source)
    if (LLAMA_TESTS_INSTALL)
        install(TARGETS ${TEST_TARGET} RUNTIME)
    endif()
-    target_link_libraries(${TEST_TARGET} PRIVATE common)
+    target_link_libraries(${TEST_TARGET} PRIVATE llama-common)

    add_test(
        NAME ${TEST_TARGET}
@@ -269,11 +269,11 @@ if (TARGET cpp-httplib)
    get_target_property(_cpp_httplib_defs cpp-httplib INTERFACE_COMPILE_DEFINITIONS)
    if (_cpp_httplib_defs MATCHES "CPPHTTPLIB_OPENSSL_SUPPORT")
        add_library(gguf-model-data STATIC gguf-model-data.cpp)
-        target_link_libraries(gguf-model-data PRIVATE common cpp-httplib)
+        target_link_libraries(gguf-model-data PRIVATE llama-common cpp-httplib)
        target_include_directories(gguf-model-data PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

        add_executable(test-gguf-model-data test-gguf-model-data.cpp)
-        target_link_libraries(test-gguf-model-data PRIVATE gguf-model-data common)
+        target_link_libraries(test-gguf-model-data PRIVATE gguf-model-data llama-common)
        llama_test(test-gguf-model-data LABEL "model")

        # test-quant-type-selection requires gguf-model-data for remote model metadata
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@@ -2164,7 +2164,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {

        tst.test(
               "<tool_call>\n"
-               "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
+               "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}"
               "</tool_call>")
            .tools({ special_function_tool })
            .expect(message_assist_call)
@@ -2172,7 +2172,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {

        tst.test(
               "Hello, world!\nWhat's up?<tool_call>\n"
-               "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
+               "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}"
               "</tool_call>")
            .tools({ special_function_tool })
            .expect(message_assist_call_content)
@@ -3329,6 +3329,92 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
            .run();
    }

+    // Reka-Edge tests - uses native JSON format with per-call wrapper
+    {
+        auto tst = peg_tester("models/templates/Reka-Edge.jinja", detailed_debug);
+
+        // Basic content only
+        tst.test("Hello, world!\nWhat's up?").enable_thinking(false).expect(message_assist).run();
+
+        // Single tool call without reasoning
+        tst.test("<tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}</tool_call>")
+            .enable_thinking(false)
+            .tools({ special_function_tool })
+            .expect(message_assist_call)
+            .run();
+
+        // Tool call with string argument
+        tst.test("<tool_call>\n{\"name\": \"get_time\", \"arguments\": {\"city\": \"XYZCITY\"}}</tool_call>")
+            .enable_thinking(false)
+            .tools({ get_time_tool })
+            .expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}"))
+            .run();
+
+        // Tool call with reasoning (enable_thinking=true)
+        tst.test("I'm\nthinking</think><tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}</tool_call>")
+            .enable_thinking(true)
+            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+            .tools({ special_function_tool })
+            .expect(message_assist_call_thoughts)
+            .run();
+
+        // Multiple tool calls (parallel)
+        tst.test(
+            "<tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}</tool_call>"
+            "<tool_call>\n{\"name\": \"special_function_with_opt\", \"arguments\": {\"arg1\": 1, \"arg2\": 2}}</tool_call>"
+        )
+            .enable_thinking(false)
+            .parallel_tool_calls(true)
+            .tools({
+                special_function_tool, special_function_tool_with_optional_param
+            })
+            .expect_tool_calls({
+                { "special_function", R"({"arg1": 1})", {} },
+                { "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
+            })
+            .run();
+
+        // Tool call with reasoning and content
+        tst.test("I need to call a function</think>"
+                 "Let me check the time.<tool_call>\n{\"name\": \"get_time\", \"arguments\": {\"city\": \"XYZCITY\"}}</tool_call>")
+            .enable_thinking(true)
+            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+            .tools({ get_time_tool })
+            .expect(message_with_reasoning_content_and_multiple_tool_calls(
+                "I need to call a function", "Let me check the time.", { { "get_time", "{\"city\":\"XYZCITY\"}" } }
+            ))
+            .run();
+
+        // Partial tool call (streaming)
+        tst.test("<tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\":")
+            .tools({ special_function_tool })
+            .enable_thinking(false)
+            .is_partial(true)
+            .expect(simple_assist_msg("", "", "special_function", "{\"arg1\": "))
+            .run();
+
+        // Tool call with empty arguments
+        tst.test("<tool_call>\n{\"name\": \"empty_args\", \"arguments\": {}}</tool_call>")
+            .enable_thinking(false)
+            .tools({ empty_args_tool })
+            .expect(simple_assist_msg("", "", "empty_args", "{}"))
+            .run();
+
+        // fake tool call marker in reasoning
+        tst.test(
+               "Let me think about <tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 2}}</tool_call> hmm</think>"
+               "<tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}</tool_call>")
+            .enable_thinking(true)
+            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+            .tools({ special_function_tool })
+            .expect_reasoning("Let me think about <tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 2}}</tool_call> hmm")
+            .expect_tool_calls({
+                { "special_function", R"({"arg1": 1})", {} },
+            })
+            .run();
+    }
+
+
    // Apertus-8B-Instruct tests - FUNC_NAME_AS_KEY format
    // Format: <|tools_prefix|>[{"function_name": {...arguments...}}]<|tools_suffix|>
    {
--- a/tests/test-quantize-stats.cpp
+++ b/tests/test-quantize-stats.cpp
@@ -1,10 +1,13 @@
-#include "ggml.h"
-#include "ggml-cpu.h"
 #include "llama.h"
+
+#include "build-info.h"
 #include "common.h"

 #include "../src/llama-model.h"

+#include "ggml.h"
+#include "ggml-cpu.h"
+
 #include <algorithm>
 #include <cassert>
 #include <cinttypes>
@@ -298,7 +301,7 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    print_build_info();
+    llama_print_build_info();

    // load the model
    fprintf(stderr, "Loading model\n");
--- a/tools/batched-bench/CMakeLists.txt
+++ b/tools/batched-bench/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-batched-bench)
 add_executable(${TARGET} batched-bench.cpp)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 if(LLAMA_TOOLS_INSTALL)
--- a/tools/cli/CMakeLists.txt
+++ b/tools/cli/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-cli)
 add_executable(${TARGET} cli.cpp)
-target_link_libraries(${TARGET} PRIVATE server-context PUBLIC common ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE server-context PUBLIC llama-common ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 include_directories(../server)
--- a/tools/completion/CMakeLists.txt
+++ b/tools/completion/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-completion)
 add_executable(${TARGET} completion.cpp)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 if(LLAMA_TOOLS_INSTALL)
--- a/tools/cvector-generator/CMakeLists.txt
+++ b/tools/cvector-generator/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-cvector-generator)
 add_executable(${TARGET} cvector-generator.cpp pca.hpp)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 if(LLAMA_TOOLS_INSTALL)
--- a/tools/cvector-generator/cvector-generator.cpp
+++ b/tools/cvector-generator/cvector-generator.cpp
@@ -2,6 +2,7 @@
 #include "gguf.h"

 #include "arg.h"
+#include "build-info.h"
 #include "common.h"
 #include "llama.h"
 #include "pca.hpp"
@@ -420,7 +421,7 @@ int main(int argc, char ** argv) {
    params.cb_eval_user_data = &cb_data;
    params.warmup = false;

-    print_build_info();
+    llama_print_build_info();
    llama_backend_init();
    llama_numa_init(params.numa);

--- a/tools/export-lora/CMakeLists.txt
+++ b/tools/export-lora/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-export-lora)
 add_executable(${TARGET} export-lora.cpp)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 if(LLAMA_TOOLS_INSTALL)
--- a/tools/fit-params/CMakeLists.txt
+++ b/tools/fit-params/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-fit-params)
 add_executable(${TARGET} fit-params.cpp)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 if(LLAMA_TOOLS_INSTALL)
--- a/tools/gguf-split/CMakeLists.txt
+++ b/tools/gguf-split/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-gguf-split)
 add_executable(${TARGET} gguf-split.cpp)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 if(LLAMA_TOOLS_INSTALL)
--- a/tools/gguf-split/gguf-split.cpp
+++ b/tools/gguf-split/gguf-split.cpp
@@ -1,7 +1,10 @@
+#include "llama.h"
+
+#include "build-info.h"
+#include "common.h"
+
 #include "ggml.h"
 #include "gguf.h"
-#include "llama.h"
-#include "common.h"

 #include <algorithm>
 #include <cinttypes>
@@ -101,8 +104,8 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
            split_print_usage(argv[0]);
            exit(0);
        } else if (arg == "--version") {
-            fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
-            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+            fprintf(stderr, "version: %d (%s)\n", llama_build_number(), llama_commit());
+            fprintf(stderr, "built with %s for %s\n", llama_compiler(), llama_build_target());
            exit(0);
        } else if (arg == "--dry-run") {
            arg_found = true;
--- a/tools/imatrix/CMakeLists.txt
+++ b/tools/imatrix/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-imatrix)
 add_executable(${TARGET} imatrix.cpp)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 if(LLAMA_TOOLS_INSTALL)
--- a/tools/llama-bench/CMakeLists.txt
+++ b/tools/llama-bench/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-bench)
 add_executable(${TARGET} llama-bench.cpp)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 if(LLAMA_TOOLS_INSTALL)
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -19,6 +19,7 @@
 #include <vector>
 #include <unordered_set>

+#include "build-info.h"
 #include "common.h"
 #include "download.h"
 #include "ggml.h"
@@ -1624,8 +1625,8 @@ struct test {
    }
 };

-const std::string test::build_commit = LLAMA_COMMIT;
-const int         test::build_number = LLAMA_BUILD_NUMBER;
+const std::string test::build_commit = llama_commit();
+const int         test::build_number = llama_build_number();

 struct printer {
    virtual ~printer() {}
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -86,12 +86,12 @@ if (TARGET BUILD_INFO)
    add_dependencies(mtmd-helper BUILD_INFO)
 endif()

-# if mtmd is linked against common, we throw an error
+# if mtmd is linked against llama-common, we throw an error
 if (TARGET mtmd)
    get_target_property(libs mtmd LINK_LIBRARIES)
-    if (libs AND "common" IN_LIST libs)
+    if (libs AND "llama-common" IN_LIST libs)
        message(FATAL_ERROR "mtmd is designed to be a public library.\n"
-                            "It must not link against common")
+                            "It must not link against llama-common")
    endif()
 endif()

@@ -106,11 +106,11 @@ set_target_properties  (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
 if(LLAMA_TOOLS_INSTALL)
    install(TARGETS ${TARGET} RUNTIME)
 endif()
-target_link_libraries  (${TARGET} PRIVATE common mtmd Threads::Threads)
+target_link_libraries  (${TARGET} PRIVATE llama-common mtmd Threads::Threads)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 # mtmd-debug tool
 add_executable(llama-mtmd-debug debug/mtmd-debug.cpp)
 set_target_properties(llama-mtmd-debug PROPERTIES OUTPUT_NAME llama-mtmd-debug)
-target_link_libraries(llama-mtmd-debug PRIVATE common mtmd Threads::Threads)
+target_link_libraries(llama-mtmd-debug PRIVATE llama-common mtmd Threads::Threads)
 target_compile_features(llama-mtmd-debug PRIVATE cxx_std_17)
--- a/tools/parser/CMakeLists.txt
+++ b/tools/parser/CMakeLists.txt
@@ -2,7 +2,7 @@ if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
    # this tool is disabled on Windows when building with shared libraries because it uses internal functions not exported with LLAMA_API
    set(TARGET llama-debug-template-parser)
    add_executable(${TARGET} debug-template-parser.cpp)
-    target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+    target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
    target_compile_features(${TARGET} PRIVATE cxx_std_17)

    if(LLAMA_TOOLS_INSTALL)
@@ -12,7 +12,7 @@ endif()

 set(TARGET llama-template-analysis)
 add_executable(${TARGET} template-analysis.cpp)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 if(LLAMA_TOOLS_INSTALL)
--- a/tools/perplexity/CMakeLists.txt
+++ b/tools/perplexity/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-perplexity)
 add_executable(${TARGET} perplexity.cpp)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 if(LLAMA_TOOLS_INSTALL)
--- a/tools/quantize/CMakeLists.txt
+++ b/tools/quantize/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-quantize)
 add_executable(${TARGET} quantize.cpp)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -1,5 +1,8 @@
-#include "common.h"
 #include "llama.h"
+
+#include "build-info.h"
+#include "common.h"
+
 #include "gguf.h"

 #include <algorithm>
@@ -709,7 +712,7 @@ int main(int argc, char ** argv) {
        }
    }

-    print_build_info();
+    llama_print_build_info();

    if (params.dry_run) {
        fprintf(stderr, "%s: calculating quantization size for '%s' as %s", __func__, fname_inp.c_str(), ftype_str.c_str());
--- a/tools/results/CMakeLists.txt
+++ b/tools/results/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-results)
 add_executable(${TARGET} results.cpp)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 if(LLAMA_TOOLS_INSTALL)
--- a/tools/server/CMakeLists.txt
+++ b/tools/server/CMakeLists.txt
@@ -23,7 +23,7 @@ endif()

 target_include_directories(${TARGET} PRIVATE ../mtmd)
 target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
-target_link_libraries(${TARGET} PUBLIC common mtmd ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PUBLIC llama-common mtmd ${CMAKE_THREAD_LIBS_INIT})


 # llama-server executable
@@ -68,6 +68,6 @@ install(TARGETS ${TARGET} RUNTIME)

 target_include_directories(${TARGET} PRIVATE ../mtmd)
 target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
-target_link_libraries(${TARGET} PRIVATE server-context PUBLIC common cpp-httplib ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE server-context PUBLIC llama-common cpp-httplib ${CMAKE_THREAD_LIBS_INIT})

 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -4,6 +4,7 @@
 #include "server-task.h"
 #include "server-queue.h"

+#include "build-info.h"
 #include "common.h"
 #include "llama.h"
 #include "log.h"
@@ -3009,7 +3010,7 @@ server_context_meta server_context::get_meta() const {
    auto eos_token_str = eos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, eos_id, true) : "";

    return server_context_meta {
-        /* build_info             */ build_info,
+        /* build_info             */ std::string(llama_build_info()),
        /* model_name             */ impl->model_name,
        /* model_aliases          */ impl->model_aliases,
        /* model_tags             */ impl->model_tags,
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -1,6 +1,7 @@
 #include "server-common.h"
 #include "server-models.h"

+#include "build-info.h"
 #include "preset.h"
 #include "download.h"

@@ -936,7 +937,7 @@ void server_models_routes::init_routes() {
                    {"n_ctx",  0},
                }},
                {"webui_settings", webui_settings},
-                {"build_info",     build_info},
+                {"build_info",     std::string(llama_build_info())},
            });
            return res;
        }
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -1,5 +1,6 @@
 #include "server-task.h"

+#include "build-info.h"
 #include "chat.h"
 #include "common.h"
 #include "json-schema-to-grammar.h"
@@ -791,7 +792,7 @@ json server_task_result_cmpl_final::to_json_oaicompat() {
        })},
        {"created",            t},
        {"model",              oaicompat_model},
-        {"system_fingerprint", build_info},
+        {"system_fingerprint", std::string(llama_build_info())},
        {"object",             "text_completion"},
        {"usage",              usage_json_oaicompat()},
        {"id", oaicompat_cmpl_id}
@@ -839,7 +840,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat() {
        {"choices",            json::array({choice})},
        {"created",            t},
        {"model",              oaicompat_model},
-        {"system_fingerprint", build_info},
+        {"system_fingerprint", std::string(llama_build_info())},
        {"object",             "chat.completion"},
        {"usage",              usage_json_oaicompat()},
        {"id", oaicompat_cmpl_id}
@@ -876,7 +877,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
            {"created", t},
            {"id", oaicompat_cmpl_id},
            {"model", oaicompat_model},
-            {"system_fingerprint", build_info},
+            {"system_fingerprint", std::string(llama_build_info())},
            {"object", "chat.completion.chunk"},
        });
    }
@@ -892,7 +893,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
        {"created",            t},
        {"id",                 oaicompat_cmpl_id},
        {"model",              oaicompat_model},
-        {"system_fingerprint", build_info},
+        {"system_fingerprint", std::string(llama_build_info())},
        {"object",             "chat.completion.chunk"},
    });

@@ -904,7 +905,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
            {"created",            t},
            {"id",                 oaicompat_cmpl_id},
            {"model",              oaicompat_model},
-            {"system_fingerprint", build_info},
+            {"system_fingerprint", std::string(llama_build_info())},
            {"object",             "chat.completion.chunk"},
            {"usage",              usage_json_oaicompat()},
        });
@@ -1469,7 +1470,7 @@ json server_task_result_cmpl_partial::to_json_oaicompat() {
        })},
        {"created",            t},
        {"model",              oaicompat_model},
-        {"system_fingerprint", build_info},
+        {"system_fingerprint", std::string(llama_build_info())},
        {"object",             "text_completion"},
        {"id",                 oaicompat_cmpl_id}
    };
@@ -1506,7 +1507,7 @@ json server_task_result_cmpl_partial::to_json_oaicompat_chat() {
            {"created", t},
            {"id", oaicompat_cmpl_id},
            {"model", oaicompat_model},
-            {"system_fingerprint", build_info},
+            {"system_fingerprint", std::string(llama_build_info())},
            {"object", "chat.completion.chunk"},
        });
    };
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -5,6 +5,7 @@
 #include "server-tools.h"

 #include "arg.h"
+#include "build-info.h"
 #include "common.h"
 #include "llama.h"
 #include "log.h"
@@ -108,7 +109,7 @@ int main(int argc, char ** argv) {
    llama_backend_init();
    llama_numa_init(params.numa);

-    LOG_INF("build_info: %s\n", build_info.c_str());
+    LOG_INF("build_info: %s\n", llama_build_info());
    LOG_INF("%s\n", common_params_get_system_info(params).c_str());

    server_http_context ctx_http;
--- a/tools/tokenize/CMakeLists.txt
+++ b/tools/tokenize/CMakeLists.txt
@@ -3,5 +3,5 @@ add_executable(${TARGET} tokenize.cpp)
 if(LLAMA_TOOLS_INSTALL)
    install(TARGETS ${TARGET} RUNTIME)
 endif()
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/tools/tts/CMakeLists.txt
+++ b/tools/tts/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-tts)
 add_executable(${TARGET} tts.cpp)
-target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama llama-common ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 if(LLAMA_TOOLS_INSTALL)
--- a/vendor/cpp-httplib/CMakeLists.txt
+++ b/vendor/cpp-httplib/CMakeLists.txt
@@ -5,6 +5,8 @@ find_package(Threads REQUIRED)

 llama_add_compile_flags()

+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
 add_library(${TARGET} STATIC httplib.cpp httplib.h)

 # disable warnings in 3rd party code
Author	SHA1	Message	Date
Georgi Gerganov	aefc0d1653	log : add common_log_get_verbosity_thold()	2026-04-15 14:05:42 +03:00
Georgi Gerganov	64c8c88ac9	libs : add libllama-common-base	2026-04-15 13:24:09 +03:00
Georgi Gerganov	35f69bd4b2	cont : fix build_info exports	2026-04-15 13:21:47 +03:00
Georgi Gerganov	a9e852d21a	cont : export all symbols	2026-04-15 13:21:47 +03:00
Georgi Gerganov	48cb5bcb0e	cont : set -fPIC for httplib	2026-04-15 13:21:47 +03:00
Georgi Gerganov	639b199eb2	cmake : rename libcommon to libllama-common	2026-04-15 13:21:47 +03:00
Georgi Gerganov	742a584ccb	cmake : allow libcommon to be shared	2026-04-15 13:21:45 +03:00
Ruben Ortlam	8dc530b86d	ci: disable test-backend-ops on Vulkan llvmpipe run and restore default timeout (#21901 )	2026-04-15 10:55:21 +02:00
Piotr Wilkin (ilintar)	e1a9a6dcbe	autoparser: support case of JSON_NATIVE with per-call markers (test case: Reka-Edge) (#21892 )	2026-04-15 10:51:50 +02:00
Matt	e39eba26f3	read n_ctx back after making llama_context (#21939 )	2026-04-15 15:24:57 +08:00
Yiwei Shao	5d14e5d19b	hexagon: optimization for HMX mat_mul (#21554 ) * hexagon: add async HMX worker Introduce hmx-worker (dedicated thread for HMX compute) to overlap HMX matmul with HVX dequant/DMA stages in the pipeline path, replacing the previous synchronous HMX calls that blocked the main thread. * hexagon: cost-based VTCM chunk search for out-stationary matmul * hexagon: fix futex race in hmx_worker_drain Store the boolean in a local variable to avoid loading the atomic twice * hex-mm: hmx optimize scatter/transpose and use HMX intrinsics * hex-vmem: drop vmem limit a touch under 3GB on v73 * hexagon: add fwd declaration of htp_context * hex-hmx: replace hmx-worker with hmx-queue that mimics dma-queue interface Simplifies the overall implementation, reduces thread wakeup roundtrips. * hex-mm: add debug log to hmx work func called from hmx-queue * Update hmx-queue.h Co-authored-by: Max Krasnyansky <max.krasnyansky@gmail.com> --------- Co-authored-by: Kim-Chyan Gan <kgan@qti.qualcomm.com> Co-authored-by: Max Krasnyansky <maxk@qti.qualcomm.com> Co-authored-by: Max Krasnyansky <max.krasnyansky@gmail.com>	2026-04-14 14:09:03 -07:00