Compare commits

..

11 Commits

Author SHA1 Message Date
Georgi Gerganov
aefc0d1653 log : add common_log_get_verbosity_thold() 2026-04-15 14:05:42 +03:00
Georgi Gerganov
64c8c88ac9 libs : add libllama-common-base 2026-04-15 13:24:09 +03:00
Georgi Gerganov
35f69bd4b2 cont : fix build_info exports 2026-04-15 13:21:47 +03:00
Georgi Gerganov
a9e852d21a cont : export all symbols 2026-04-15 13:21:47 +03:00
Georgi Gerganov
48cb5bcb0e cont : set -fPIC for httplib 2026-04-15 13:21:47 +03:00
Georgi Gerganov
639b199eb2 cmake : rename libcommon to libllama-common 2026-04-15 13:21:47 +03:00
Georgi Gerganov
742a584ccb cmake : allow libcommon to be shared 2026-04-15 13:21:45 +03:00
Ruben Ortlam
8dc530b86d ci: disable test-backend-ops on Vulkan llvmpipe run and resture default timeout (#21901) 2026-04-15 10:55:21 +02:00
Piotr Wilkin (ilintar)
e1a9a6dcbe autoparser: support case of JSON_NATIVE with per-call markers (test case: Reka-Edge) (#21892) 2026-04-15 10:51:50 +02:00
Matt
e39eba26f3 read n_ctx back after making llama_context (#21939) 2026-04-15 15:24:57 +08:00
Yiwei Shao
5d14e5d19b hexagon: optimization for HMX mat_mul (#21554)
* hexagon: add async HMX worker

Introduce hmx-worker (dedicated thread for HMX compute) to overlap HMX
matmul with HVX dequant/DMA stages in the pipeline path, replacing the
previous synchronous HMX calls that blocked the main thread.

* hexagon: cost-based VTCM chunk search for out-stationary matmul

* hexagon: fix futex race in hmx_worker_drain
Store the boolean to local variable avoid atomic load twice

* hex-mm: hmx optimize scatter/transpose and use HMX intrinsics

* hex-vmem: drop vmem limit a touch under 3GB on v73

* hexagon: add fwd declaration of htp_context

* hex-hmx: replace hmx-worker with hmx-queue that mimics dma-queue interface

Simplifies the overall implemantion, reduces thread wakeup roundtrips.

* hex-mm: add debug log to hmx work func called from hmx-queue

* Update hmx-queue.h

Co-authored-by: Max Krasnyansky <max.krasnyansky@gmail.com>

---------

Co-authored-by: Kim-Chyan Gan <kgan@qti.qualcomm.com>
Co-authored-by: Max Krasnyansky <maxk@qti.qualcomm.com>
Co-authored-by: Max Krasnyansky <max.krasnyansky@gmail.com>
2026-04-14 14:09:03 -07:00
76 changed files with 1091 additions and 322 deletions

View File

@@ -93,4 +93,5 @@ jobs:
export GGML_VK_DISABLE_F16=1
export GGML_VK_DISABLE_COOPMAT=1
# This is using llvmpipe and runs slower than other backends
ctest -L main --verbose --timeout 4800
# test-backend-ops is too slow on llvmpipe, skip it
ctest -L main -E test-backend-ops --verbose --timeout 900

View File

@@ -225,7 +225,7 @@ foreach(FILE_PATH ${EXTRA_LICENSES})
endforeach()
if (LLAMA_BUILD_COMMON)
license_generate(common)
license_generate(llama-common)
endif()
#
@@ -249,6 +249,10 @@ set_target_properties(llama
install(TARGETS llama LIBRARY PUBLIC_HEADER)
if (LLAMA_BUILD_COMMON)
install(TARGETS llama-common LIBRARY)
endif()
configure_package_config_file(
${CMAKE_CURRENT_SOURCE_DIR}/cmake/llama-config.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake

View File

@@ -1,9 +1,11 @@
# common
find_package(Threads REQUIRED)
llama_add_compile_flags()
#
# llama-common-base
#
# Build info header
if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
@@ -33,17 +35,25 @@ endif()
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
set(TARGET build_info)
add_library(${TARGET} OBJECT ${OUTPUT_FILE})
set(TARGET llama-common-base)
add_library(${TARGET} STATIC ${OUTPUT_FILE})
target_include_directories(${TARGET} PUBLIC .)
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
set(TARGET common)
#
# llama-common
#
add_library(${TARGET} STATIC
set(TARGET llama-common)
add_library(${TARGET}
arg.cpp
arg.h
base64.hpp
@@ -106,17 +116,24 @@ add_library(${TARGET} STATIC
jinja/caps.h
)
set_target_properties(${TARGET} PROPERTIES
VERSION ${LLAMA_INSTALL_VERSION}
SOVERSION 0
MACHO_CURRENT_VERSION 0 # keep macOS linker from seeing oversized version number
)
target_include_directories(${TARGET} PUBLIC . ../vendor)
target_compile_features (${TARGET} PUBLIC cxx_std_17)
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
# TODO: make fine-grained exports in the future
set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif()
target_link_libraries(${TARGET} PRIVATE
build_info
cpp-httplib
)
target_link_libraries(${TARGET} PUBLIC llama-common-base)
target_link_libraries(${TARGET} PRIVATE cpp-httplib)
if (LLAMA_LLGUIDANCE)
include(ExternalProject)

View File

@@ -1,5 +1,6 @@
#include "arg.h"
#include "build-info.h"
#include "chat.h"
#include "common.h"
#include "download.h"
@@ -1044,8 +1045,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--version"},
"show version and build info",
[](common_params &) {
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
fprintf(stderr, "version: %d (%s)\n", llama_build_number(), llama_commit());
fprintf(stderr, "built with %s for %s\n", llama_compiler(), llama_build_target());
exit(0);
}
));

View File

@@ -1,4 +1,35 @@
#include "build-info.h"
#include <cstdio>
#include <string>
int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
char const * LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
char const * LLAMA_COMPILER = "@BUILD_COMPILER@";
char const * LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
int llama_build_number(void) {
return LLAMA_BUILD_NUMBER;
}
const char * llama_commit(void) {
return LLAMA_COMMIT;
}
const char * llama_compiler(void) {
return LLAMA_COMPILER;
}
const char * llama_build_target(void) {
return LLAMA_BUILD_TARGET;
}
const char * llama_build_info(void) {
static std::string s = "b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT;
return s.c_str();
}
void llama_print_build_info(void) {
fprintf(stderr, "%s: build = %d (%s)\n", __func__, llama_build_number(), llama_commit());
fprintf(stderr, "%s: built with %s for %s\n", __func__, llama_compiler(), llama_build_target());
}

11
common/build-info.h Normal file
View File

@@ -0,0 +1,11 @@
#pragma once
int llama_build_number(void);
const char * llama_commit(void);
const char * llama_compiler(void);
const char * llama_build_target(void);
const char * llama_build_info(void);
void llama_print_build_info(void);

View File

@@ -198,10 +198,19 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
args_field = format.function_field + "." + args_field;
}
auto tools_parser = p.standard_json_tools(
format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
auto tools_parser = p.eps();
if (format.section_start.empty() && !format.per_call_start.empty()) {
auto single_tool_parser = p.standard_json_tools(
format.per_call_start, format.per_call_end, inputs.tools, inputs.parallel_tool_calls,
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
tools_parser = p.trigger_rule("tool-calls", p.one_or_more(single_tool_parser + p.space()));
} else {
tools_parser = p.standard_json_tools(
format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
}
// Handle content wrappers if present
if (ctx.content && ctx.content->is_always_wrapped()) {

View File

@@ -308,19 +308,23 @@ struct analyze_tools : analyze_base {
private:
// Extract tool calling 'haystack' for further analysis and delegate further analysis based on format
void analyze_tool_calls(const analyze_reasoning & reasoning);
void analyze_tool_calls(const analyze_reasoning & reasoning, bool supports_parallel_tool_calls);
// Analyze format based on position of function and argument name in needle
void analyze_tool_call_format(const std::string & haystack,
const std::string & fun_name_needle,
const std::string & arg_name_needle,
const analyze_reasoning & reasoning);
const analyze_reasoning & reasoning,
bool supports_parallel_tool_calls);
// Analyze specifics of JSON native format (entire tool call is a JSON object)
void analyze_tool_call_format_json_native(const std::string & clean_haystack,
const std::string & fun_name_needle,
const std::string & arg_name_needle);
// Check if parallel calls in JSON native format array wrapped or tag wrapped
void analyze_json_native_parallel_calls();
// Analyze specifics of non-JSON native format (tags for function name or for function name and arguments)
void analyze_tool_call_format_non_json(const std::string & clean_haystack,
const std::string & fun_name_needle);

View File

@@ -558,7 +558,7 @@ analyze_tools::analyze_tools(const common_chat_template & tmpl,
: analyze_base(tmpl) {
LOG_DBG(ANSI_ORANGE "Phase 3: Tool call analysis\n" ANSI_RESET);
analyze_tool_calls(reasoning);
analyze_tool_calls(reasoning, caps.supports_parallel_tool_calls);
if (format.mode != tool_format::NONE && format.mode != tool_format::JSON_NATIVE) {
if (caps.supports_parallel_tool_calls) {
@@ -577,7 +577,7 @@ analyze_tools::analyze_tools(const common_chat_template & tmpl,
}
}
void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning) {
void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning, bool supports_parallel_tool_calls) {
json assistant_no_tools = json{
{ "role", "assistant" },
{ "content", ASSISTANT_MSG }
@@ -611,13 +611,14 @@ void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning) {
return;
}
analyze_tool_call_format(tool_section, FUN_FIRST, ARG_FIRST, reasoning);
analyze_tool_call_format(tool_section, FUN_FIRST, ARG_FIRST, reasoning, supports_parallel_tool_calls);
}
void analyze_tools::analyze_tool_call_format(const std::string & haystack,
const std::string & fun_name_needle,
const std::string & arg_name_needle,
const analyze_reasoning & reasoning) {
const analyze_reasoning & reasoning,
bool supports_parallel_tool_calls) {
if (fun_name_needle.empty() || arg_name_needle.empty() || haystack.empty()) {
return;
}
@@ -660,6 +661,9 @@ void analyze_tools::analyze_tool_call_format(const std::string & haystack,
if (format.mode == tool_format::JSON_NATIVE) {
analyze_tool_call_format_json_native(clean_haystack, fun_name_needle, arg_name_needle);
if (supports_parallel_tool_calls) {
analyze_json_native_parallel_calls();
}
} else {
analyze_tool_call_format_non_json(clean_haystack, fun_name_needle);
}
@@ -668,6 +672,42 @@ void analyze_tools::analyze_tool_call_format(const std::string & haystack,
format.per_call_end = trim_whitespace(format.per_call_end);
}
void analyze_tools::analyze_json_native_parallel_calls() {
json assistant_one_tool = json{
{ "role", "assistant" },
{ "content", "" },
{ "tool_calls", json::array({ first_tool_call }) }
};
json assistant_two_tools = json{
{ "role", "assistant" },
{ "content", "" },
{ "tool_calls", json::array({ first_tool_call, second_tool_call }) }
};
template_params params;
params.messages = json::array({ user_msg, assistant_one_tool });
params.tools = tools;
params.add_generation_prompt = false;
params.enable_thinking = true;
auto comparison = compare_variants(
*tmpl, params, [&](template_params & p) { p.messages = json::array({ user_msg, assistant_two_tools }); });
if (!comparison) {
LOG_DBG(ANSI_ORANGE "%s: Template application failed\n" ANSI_RESET, __func__);
return;
}
std::string & second_call = comparison->diff.right;
if (!format.section_start.empty() && second_call.find(format.section_start) != std::string::npos) {
format.per_call_start = format.section_start;
format.per_call_end = format.section_end;
format.section_start.clear();
format.section_end.clear();
}
}
void analyze_tools::analyze_tool_call_format_json_native(const std::string & clean_haystack,
const std::string & fun_name_needle,
const std::string & arg_name_needle) {

View File

@@ -676,7 +676,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
auto nested_name = literal("\"" + nested_name_field + "\"") + space() + literal(":") + space() +
literal("\"") + tool_name(literal(name)) + literal("\"");
atomic(literal("\"") + tool_name(literal(name)) + literal("\""));
auto nested_args = literal("\"" + nested_args_field + "\"") + space() + literal(":") + space() +
tool_args(schema(json(), "tool-" + name + "-schema", params));
@@ -744,7 +744,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
auto tool_name_ = name_key_parser + space() + literal(":") + space() +
literal("\"") + tool_name(literal(name)) + literal("\"");
atomic(literal("\"") + tool_name(literal(name)) + literal("\""));
auto tool_args_ = args_key_parser + space() + literal(":") + space() +
tool_args(schema(json(), "tool-" + name + "-schema", params));

View File

@@ -1,6 +1,7 @@
#include "ggml.h"
#include "gguf.h"
#include "build-info.h"
#include "common.h"
#include "log.h"
#include "llama.h"
@@ -372,7 +373,7 @@ void common_init() {
const char * build_type = " (debug)";
#endif
LOG_DBG("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
LOG_DBG("build: %d (%s) with %s for %s%s\n", llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);
}
std::string common_params_get_system_info(const common_params & params) {

View File

@@ -2,9 +2,10 @@
#pragma once
#include "llama-cpp.h"
#include "ggml-opt.h"
#include "ggml.h"
#include "llama-cpp.h"
#include <set>
#include <sstream>
@@ -27,11 +28,6 @@
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
#define print_build_info() do { \
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
} while(0)
struct common_time_meas {
common_time_meas(int64_t & t_acc, bool disable = false);
~common_time_meas();
@@ -53,14 +49,6 @@ struct common_adapter_lora_info {
using llama_tokens = std::vector<llama_token>;
// build info
extern int LLAMA_BUILD_NUMBER;
extern const char * LLAMA_COMMIT;
extern const char * LLAMA_COMPILER;
extern const char * LLAMA_BUILD_TARGET;
const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
struct common_control_vector_load_info;
//

View File

@@ -1,5 +1,6 @@
#include "arg.h"
#include "build-info.h"
#include "common.h"
#include "log.h"
#include "download.h"
@@ -303,7 +304,7 @@ static int common_download_file_single_online(const std::string & url,
headers.emplace(h.first, h.second);
}
if (headers.find("User-Agent") == headers.end()) {
headers.emplace("User-Agent", "llama-cpp/" + build_info);
headers.emplace("User-Agent", "llama-cpp/" + std::string(llama_build_info()));
}
if (!opts.bearer_token.empty()) {
headers.emplace("Authorization", "Bearer " + opts.bearer_token);
@@ -441,7 +442,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
headers.emplace(h.first, h.second);
}
if (headers.find("User-Agent") == headers.end()) {
headers.emplace("User-Agent", "llama-cpp/" + build_info);
headers.emplace("User-Agent", "llama-cpp/" + std::string(llama_build_info()));
}
if (params.timeout > 0) {

View File

@@ -1,5 +1,6 @@
#include "hf-cache.h"
#include "build-info.h"
#include "common.h"
#include "log.h"
#include "http.h"
@@ -200,7 +201,7 @@ static nl::json api_get(const std::string & url,
auto [cli, parts] = common_http_client(url);
httplib::Headers headers = {
{"User-Agent", "llama-cpp/" + build_info},
{"User-Agent", "llama-cpp/" + std::string(llama_build_info())},
{"Accept", "application/json"}
};

View File

@@ -23,6 +23,10 @@
int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
int common_log_get_verbosity_thold(void) {
return common_log_verbosity_thold;
}
void common_log_set_verbosity_thold(int verbosity) {
common_log_verbosity_thold = verbosity;
}

View File

@@ -38,7 +38,7 @@ enum log_colors {
// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
// set via common_log_set_verbosity()
extern int common_log_verbosity_thold;
int common_log_get_verbosity_thold(void);
void common_log_set_verbosity_thold(int verbosity); // not thread-safe
@@ -98,7 +98,7 @@ void common_log_flush (struct common_log * log); // f
#define LOG_TMPL(level, verbosity, ...) \
do { \
if ((verbosity) <= common_log_verbosity_thold) { \
if ((verbosity) <= common_log_get_verbosity_thold()) { \
common_log_add(common_log_main(), (level), __VA_ARGS__); \
} \
} while (0)

View File

@@ -1,5 +1,5 @@
set(TARGET llama-batched)
add_executable(${TARGET} batched.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,5 +1,5 @@
set(TARGET llama-convert-llama2c-to-ggml)
add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,5 +1,5 @@
set(TARGET llama-debug)
add_executable(${TARGET} debug.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,5 +1,5 @@
set(TARGET llama-diffusion-cli)
add_executable(${TARGET} diffusion-cli.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama llama-common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -602,8 +602,8 @@ int main(int argc, char ** argv) {
int n_input = input_tokens.size();
if (n_input >= params.n_ctx) {
LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, params.n_ctx);
if (static_cast<uint32_t>(n_input) >= llama_n_ctx(ctx)) {
LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, llama_n_ctx(ctx));
llama_free(ctx);
llama_model_free(model);
return 1;

View File

@@ -1,5 +1,5 @@
set(TARGET llama-embedding)
add_executable(${TARGET} embedding.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,7 +1,7 @@
set(TARGET llama-eval-callback)
add_executable(${TARGET} eval-callback.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_BUILD_TESTS)

View File

@@ -1,5 +1,5 @@
set(TARGET llama-gen-docs)
add_executable(${TARGET} gen-docs.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,5 +1,5 @@
set(TARGET llama-idle)
add_executable(${TARGET} idle.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama llama-common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -1,5 +1,5 @@
set(TARGET llama-lookahead)
add_executable(${TARGET} lookahead.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,23 +1,23 @@
set(TARGET llama-lookup)
add_executable(${TARGET} lookup.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-lookup-create)
add_executable(${TARGET} lookup-create.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-lookup-merge)
add_executable(${TARGET} lookup-merge.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-lookup-stats)
add_executable(${TARGET} lookup-stats.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,5 +1,5 @@
set(TARGET llama-parallel)
add_executable(${TARGET} parallel.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,5 +1,5 @@
set(TARGET llama-passkey)
add_executable(${TARGET} passkey.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,5 +1,5 @@
set(TARGET llama-retrieval)
add_executable(${TARGET} retrieval.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,5 +1,5 @@
set(TARGET llama-save-load-state)
add_executable(${TARGET} save-load-state.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,5 +1,5 @@
set(TARGET llama-speculative-simple)
add_executable(${TARGET} speculative-simple.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,5 +1,5 @@
set(TARGET llama-speculative)
add_executable(${TARGET} speculative.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -5,5 +5,5 @@
set(TARGET llama-ls-sycl-device)
add_executable(${TARGET} ls-sycl-device.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,5 +1,5 @@
set(TARGET llama-finetune)
add_executable(${TARGET} finetune.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -47,6 +47,7 @@ list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx)
if (_hmx_idx GREATER_EQUAL 0)
target_sources(${HTP_LIB} PRIVATE
hmx-queue.c
hmx-matmul-ops.c
)

View File

@@ -31,6 +31,14 @@ static inline uint64_t hex_get_pktcnt() {
return pktcnt;
}
static inline uint32_t hex_ceil_pow2(uint32_t x) {
if (x <= 1) { return 1; }
int p = 2;
x--;
while (x >>= 1) { p <<= 1; }
return p;
}
static inline size_t hmx_ceil_div(size_t num, size_t den) {
return (num + den - 1) / den;
}
@@ -73,8 +81,7 @@ static inline void hex_l2fetch(const void * p, uint32_t width, uint32_t stride,
#define HEX_L2_LINE_SIZE 64
#define HEX_L2_FLUSH_SIZE (128 * 1024)
static inline void hex_l2flush(void * addr, size_t size)
{
static inline void hex_l2flush(void * addr, size_t size) {
if (size > HEX_L2_FLUSH_SIZE) {
qurt_mem_cache_clean((qurt_addr_t) 0, 0, QURT_MEM_CACHE_FLUSH_INVALIDATE_ALL, QURT_MEM_DCACHE);
} else {
@@ -89,4 +96,8 @@ static inline void hex_l2flush(void * addr, size_t size)
}
}
static inline void hex_pause() {
asm volatile(" pause(#255)\n");
}
#endif /* HEX_UTILS_H */

View File

@@ -16,14 +16,16 @@
#include "ggml-common.h"
#include "hex-dma.h"
#include "worker-pool.h"
#include "hvx-utils.h"
#include "hvx-dump.h"
#include "worker-pool.h"
#include "htp-ctx.h"
#include "htp-ops.h"
#include "hmx-utils.h"
#include "hmx-ops.h"
#include "hmx-utils.h"
#include "hmx-queue.h"
#include "hmx-profile.h"
static const __fp16 q4_0_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
@@ -47,7 +49,8 @@ static const __fp16 iq4_nl_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
static const int32_t weight_transpose_scatter_offsets[32] __attribute__((aligned(VLEN))) = {
0*128, 1*128, 2*128, 3*128, 4*128, 5*128, 6*128, 7*128,
8*128, 9*128, 10*128, 11*128, 12*128, 13*128, 14*128, 15*128,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
16*128, 17*128, 18*128, 19*128, 20*128, 21*128, 22*128, 23*128,
24*128, 25*128, 26*128, 27*128, 28*128, 29*128, 30*128, 31*128
};
// Scales per x4x2 logical block: 8 × sizeof(__fp16) = 16 bytes
@@ -109,36 +112,45 @@ static inline bool hmx_add_overflow(size_t a, size_t b, size_t *out) {
return false;
}
// Search for optimal (mc, nc) chunk sizes that maximize mc * nc within VTCM budget.
// Search for optimal (mc, nc) chunk sizes within VTCM budget.
//
// Cost model: total = nc * per_n_cost + mc * per_m_cost + mc * nc * per_mn_cost + overhead
// per_n_cost: bytes per nc column (weight + scratch buffers)
// per_m_cost: bytes per mc row (activation)
// per_mn_cost: bytes per mc*nc element (output)
// overhead: fixed bytes (scales 256B, eye_tile 2048B, etc.)
// VTCM model: nc * per_n_cost + mc * per_m_cost + mc * nc * per_mn_cost + overhead
//
// Minimize ceil(m/mc) * m_block_cost + ceil(n/nc) * n_block_cost.
// All matmul paths repeat weight processing per M-block and activation loading
// per N-block, so discrete block counts drive total overhead.
// Tie-break: when cost is equal, prefer larger mc * nc.
//
// Caller-provided coefficients:
// m_block_cost: penalty per extra M-block (weight redundancy, scales with n).
// n_block_cost: penalty per extra N-block (activation redundancy, scales with m).
//
// Algorithm: nc sweeps from n_max down by 32, analytically solving for mc_max.
// Returns 0 on success, -1 if VTCM is insufficient.
static int hmx_compute_chunks(
size_t vtcm_total, size_t overhead,
size_t per_n_cost, size_t per_m_cost, size_t per_mn_cost,
int m, int n,
size_t *m_chunk_out, size_t *n_chunk_out,
size_t *total_out)
{
static int hmx_compute_chunks(size_t vtcm_total,
size_t overhead,
size_t per_n_cost,
size_t per_m_cost,
size_t per_mn_cost,
int m,
int n,
size_t m_block_cost,
size_t n_block_cost,
size_t * m_chunk_out,
size_t * n_chunk_out,
size_t * total_out) {
if (m <= 0 || n <= 0) return -1;
if (vtcm_total <= overhead) return -1;
if (per_n_cost == 0 || per_m_cost == 0 || per_mn_cost == 0) return -1;
const size_t usable = vtcm_total - overhead;
size_t best_mn = 0, best_m = 0, best_n = 0;
size_t best_cost = SIZE_MAX;
size_t best_mn = 0;
size_t best_m = 0, best_n = 0;
const size_t n_max = hex_align_down((size_t)n, HMX_FP16_TILE_N_COLS);
for (size_t nc = n_max; nc >= HMX_FP16_TILE_N_COLS; nc -= HMX_FP16_TILE_N_COLS) {
// Early exit: if nc * m_max cannot beat best, smaller nc won't either
if (nc * hex_align_down((size_t)m, HMX_FP16_TILE_N_ROWS) <= best_mn)
break;
size_t n_fixed = 0, ncmn = 0, mc_denom = 0;
if (hmx_mul_overflow(nc, per_n_cost, &n_fixed)) continue;
if (n_fixed >= usable) goto next_nc;
@@ -152,10 +164,19 @@ static int hmx_compute_chunks(
mc = hex_align_down(mc, HMX_FP16_TILE_N_ROWS);
mc = hex_smin(mc, (size_t)m);
if (mc > 0 && mc * nc > best_mn) {
best_mn = mc * nc;
best_m = mc;
best_n = nc;
if (mc == 0) {
goto next_nc;
}
size_t mblocks = ((size_t) m + mc - 1) / mc;
size_t nblocks = ((size_t) n + nc - 1) / nc;
size_t cost = mblocks * m_block_cost + nblocks * n_block_cost;
size_t mn = mc * nc;
if (cost < best_cost || (cost == best_cost && mn > best_mn)) {
best_cost = cost;
best_mn = mn;
best_m = mc;
best_n = nc;
}
}
@@ -233,7 +254,7 @@ static inline HVX_Vector dequantize_x4x2_q4_0_group_hvx(
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
HVX_Vector v_scales = hvx_vec_splat_f16(*scale);
// q4x4x2 stores two int4 values per byte. Keep only the selected nibble.
HVX_Vector v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq;
HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
v_quants = Q6_V_vand_VV(v_quants, mask_h4);
// Shuffle before LUT
v_quants = Q6_Vb_vshuff_Vb(v_quants);
@@ -257,7 +278,7 @@ static inline void dequantize_x4x2_q4_0_x4groups_hvx(
// Load all 128 packed bytes (4 contiguous 32-byte groups)
HVX_Vector vq = hvx_vmemu(packed_128);
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
HVX_Vector v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq;
HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
v_quants = Q6_V_vand_VV(v_quants, mask_h4);
// Shuffle before LUT
@@ -277,10 +298,8 @@ static inline void dequantize_x4x2_q4_0_x4groups_hvx(
v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));
// Extract individual groups: scatter uses q_mask64 so only first 64 bytes matter
out[0] = v_lo; // group0 already in [0:63]
out[1] = Q6_V_vror_VR(v_lo, 64); // group1 rotated to [0:63]
out[2] = v_hi; // group2 already in [0:63]
out[3] = Q6_V_vror_VR(v_hi, 64); // group3 rotated to [0:63]
out[0] = v_lo; // group0 already in [0:63]
out[1] = v_hi; // group2 already in [0:63]
}
// Dequantize one x4x2 Q8_0 group (32 int8 quants) -> 32 FP16 in first 64 bytes.
@@ -384,8 +403,9 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
size_t row_stride, int weight_type,
int start_tile, int end_tile) {
const int n_k_tiles = k_block / HMX_FP16_TILE_N_COLS;
const int qrow_size = (weight_type == HTP_TYPE_Q8_0) ? k_block : (k_block / 2);
const int n_k_tiles = (unsigned)k_block / HMX_FP16_TILE_N_COLS;
const bool is_q4 = (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL);
const int qrow_size = is_q4 ? ((unsigned)k_block / 2) : k_block;
const HVX_Vector vlut_cvt = (weight_type == HTP_TYPE_IQ4_NL) ? hvx_vmem(iq4_nl_to_fp16_lut) :
(weight_type == HTP_TYPE_MXFP4) ? hvx_vmem(mxfp4_to_fp16_lut) :
@@ -398,47 +418,46 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
const HVX_Vector v_scat_step = Q6_V_vsplat_R(4); // 4 bytes = 1 column step
const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64); // first 16 words (64 bytes)
for (int t = start_tile; t < end_tile; ) {
int ct = t / n_k_tiles; // column tile index
int kt = t % n_k_tiles; // K tile index
unsigned ct = (unsigned)start_tile / n_k_tiles; // column tile index
unsigned kt = (unsigned)start_tile % n_k_tiles; // K tile index
for (unsigned t = start_tile; t < end_tile; ) {
if (kt >= n_k_tiles) { kt = 0; ct++; }
// --- Batch-4 fast path for Q4_0/IQ4_NL: process 4 contiguous K-tiles with one vlut16 per row ---
if ((weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL) && (kt % 4 == 0) && (t + 4 <= end_tile) &&
((t + 3) / n_k_tiles == ct)) {
int blk_idx = (kt * 32) / QK_Q4_0x4x2;
int sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32; // 0 or 4
bool upper = (sub_blk_base >= 4);
int packed_off = blk_idx * (QK_Q4_0x4x2 / 2); // 128 contiguous packed bytes
int scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE
+ sub_blk_base * (int)sizeof(__fp16); // 4 consecutive scales
// --- Batch-4 fast path for Q4: process 4 contiguous K-tiles with one vlut16 per row ---
if (is_q4 && (kt % 4 == 0) && (t + 4 <= end_tile) && ((t + 3) / n_k_tiles == ct)) {
unsigned blk_idx = (kt * 32) / QK_Q4_0x4x2;
unsigned sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32; // 0 or 4
bool upper = (sub_blk_base >= 4);
unsigned packed_off = blk_idx * (QK_Q4_0x4x2 / 2); // 128 contiguous packed bytes
unsigned scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE
+ sub_blk_base * (int)sizeof(__fp16); // 4 consecutive scales
__fp16 *tile_bases[4];
for (int g = 0; g < 4; g++) { tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS; }
for (unsigned g = 0; g < 4; g++) { tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS; }
HVX_Vector v_off = v_scat_base;
for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
int row0 = ct * HMX_FP16_TILE_N_COLS + r;
int row1 = row0 + 1;
const uint8_t *r0 = vtcm_src + row0 * row_stride;
const uint8_t *r1 = vtcm_src + row1 * row_stride;
HVX_Vector v0[4], v1[4];
unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride;
unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;
for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
HVX_Vector v0[2];
const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt, v0);
if (row1 < n_cols) {
dequantize_x4x2_q4_0_x4groups_hvx(r1 + packed_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt, v1);
} else {
v1[0] = v1[1] = v1[2] = v1[3] = Q6_V_vzero();
}
for (int g = 0; g < 4; g++) { Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v0[g]); }
Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[0]);
Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[1]);
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
for (int g = 0; g < 4; g++) { Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v1[g]); }
r0 = vtcm_src + row_offset; row_offset += row_stride;
dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt, v0);
Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[0]);
Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[1]);
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
}
for (int g = 0; g < 4; g++) { (void) *(volatile HVX_Vector *)(tile_bases[g]); }
t += 4;
t += 4; kt += 4;
continue;
}
@@ -495,20 +514,19 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
// --- Single-tile fallback ---
__fp16 *tile_base = vtcm_dst + t * HMX_FP16_TILE_N_ELMS;
if (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL) {
int blk_idx = (kt * 32) / QK_Q4_0x4x2;
int sub_blk = ((kt * 32) % QK_Q4_0x4x2) / 32;
bool upper = (sub_blk >= 4);
int byte_off = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32;
int scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16);
if (is_q4) {
unsigned blk_idx = (kt * 32) / QK_Q4_0x4x2;
unsigned sub_blk = ((kt * 32) % QK_Q4_0x4x2) / 32;
bool upper = (sub_blk >= 4);
unsigned byte_off = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32;
unsigned scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16);
HVX_Vector v_off = v_scat_base; // reset to column 0
for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
int row0 = ct * HMX_FP16_TILE_N_COLS + r;
int row1 = row0 + 1;
const uint8_t *r0 = vtcm_src + row0 * row_stride;
const uint8_t *r1 = vtcm_src + row1 * row_stride;
unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride;
unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;
for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;
HVX_Vector v0 = dequantize_x4x2_q4_0_group_hvx(
r0 + byte_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
@@ -585,7 +603,7 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
}
(void) *(volatile HVX_Vector *)(tile_base);
}
++t;
++t; ++kt;
}
// Drain HVX scatter write buffer: a vmem load on the same HW thread retires
@@ -653,9 +671,13 @@ static void dequantize_x4x2_weight_chunk_to_fp16_tiles(
// --- End x4x2 dequantizers ---
// requires external HMX lock
static void core_dot_chunk_fp16(__fp16 *output, const __fp16 *activation, const __fp16 *weight, const __fp16 *scales,
static void core_dot_chunk_fp16(__fp16 *restrict output, const __fp16 *restrict activation, const __fp16 *restrict weight, const __fp16 *restrict scales,
int n_row_tiles, int n_col_tiles, int n_dot_tiles) {
hmx_set_output_scales(scales);
__builtin_assume(n_row_tiles > 0);
__builtin_assume(n_col_tiles > 0);
__builtin_assume(n_dot_tiles > 0);
Q6_bias_mxmem2_A((void *)scales);
for (int r = 0; r < n_row_tiles; ++r) {
for (int c = 0; c < n_col_tiles; ++c) {
@@ -665,16 +687,55 @@ static void core_dot_chunk_fp16(__fp16 *output, const __fp16 *activation, const
const __fp16 *col_tiles = weight + c * n_dot_tiles * HMX_FP16_TILE_N_ELMS;
for (int k = 0; k < n_dot_tiles; ++k) {
int offset = k * HMX_FP16_TILE_N_ELMS;
hmx_load_tile_pair_fp16(row_tiles + offset, col_tiles + offset);
Q6_activation_hf_mxmem_RR((unsigned int)row_tiles, 2047);
Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, 2047);
row_tiles += HMX_FP16_TILE_N_ELMS;
col_tiles += HMX_FP16_TILE_N_ELMS;
}
__fp16 *out_tile = output + (r * n_col_tiles + c) * HMX_FP16_TILE_N_ELMS;
hmx_consume_accumulator_fp16(out_tile);
Q6_mxmem_AR_after_hf(out_tile, 0);
}
}
}
// --- Async HMX matmul job (for pipeline overlap) ---
typedef struct {
__fp16 * output;
const __fp16 * activation;
const __fp16 * weight;
const __fp16 * scales;
uint32_t n_row_tiles;
uint32_t n_col_tiles;
uint32_t n_dot_tiles;
} hmx_matmul_job_t;
static void hmx_matmul_worker_fn(void * data) {
hmx_matmul_job_t * job = (hmx_matmul_job_t *) data;
FARF(HIGH, "hmx-mm-job: n_row_tiles %u n_col_tiles %u n_dot_tiles %u", job->n_row_tiles, job->n_col_tiles, job->n_dot_tiles);
core_dot_chunk_fp16(job->output, job->activation, job->weight, job->scales, job->n_row_tiles, job->n_col_tiles, job->n_dot_tiles);
}
static inline void hmx_matmul_job_init(hmx_matmul_job_t * job,
__fp16 * output,
const __fp16 * activation,
const __fp16 * weight,
const __fp16 * scales,
int n_row_tiles,
int n_col_tiles,
int n_dot_tiles) {
job->output = output;
job->activation = activation;
job->weight = weight;
job->scales = scales;
job->n_row_tiles = n_row_tiles;
job->n_col_tiles = n_col_tiles;
job->n_dot_tiles = n_dot_tiles;
}
// --- End async HMX matmul job ---
static void transfer_output_chunk_fp16_to_fp32(float *restrict dst, const __fp16 *restrict vtcm_src, int n_rows, int n_cols, int n) {
assert(n_cols % HMX_FP16_TILE_N_COLS == 0);
const int n_col_tiles = n_cols / HMX_FP16_TILE_N_COLS;
@@ -832,12 +893,13 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
const size_t f32_scratch_per_m = use_dma_activation ? (size_t) params->k * sizeof(float) : 0;
size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0;
// FP16 weight: interleave and activation load have similar per-element cost.
if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256,
/*per_n=*/3 * vec_dot_size,
/*per_m=*/group_size * vec_dot_size + f32_scratch_per_m,
/*per_mn=*/sizeof(__fp16),
params->m, params->n,
&m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
/*per_n=*/3 * vec_dot_size,
/*per_m=*/group_size * vec_dot_size + f32_scratch_per_m,
/*per_mn=*/sizeof(__fp16), params->m, params->n,
/*m_block_cost=*/(size_t) params->n,
/*n_block_cost=*/(size_t) params->m, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
FARF(HIGH, "%s: grouped path does not fit VTCM, falling back to legacy batched loop", __func__);
return hmx_mat_mul_permuted_w16a32_batched_legacy(ctx, params);
}
@@ -1006,13 +1068,15 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co
const size_t f32_scratch_per_m = use_dma_activation ? (size_t) k * sizeof(float) : 0;
size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0;
// FP16 weight: interleave and activation load have similar per-element cost.
if (hmx_compute_chunks(vtcm_budget,
/*overhead=*/ 256,
/*per_n=*/ 3 * vec_dot_size, // W + S0 + S1
/*per_m=*/ vec_dot_size + f32_scratch_per_m, // A + optional F32 scratch
/*per_mn=*/ sizeof(__fp16), // O
m, n,
&m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
/*overhead=*/256,
/*per_n=*/3 * vec_dot_size, // W + S0 + S1
/*per_m=*/vec_dot_size + f32_scratch_per_m, // A + optional F32 scratch
/*per_mn=*/sizeof(__fp16), // O
m, n,
/*m_block_cost=*/(size_t) n,
/*n_block_cost=*/(size_t) m, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget);
return -1;
}
@@ -1157,6 +1221,8 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co
int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict out, const float *restrict x, const uint8_t *restrict w, int m,
int k, int n, int w_type);
#define FALLBACK_TO_STANDARD 1
int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict dst, const float *restrict activation,
const uint8_t *restrict permuted_weight, int m, int k, int n,
int weight_type) {
@@ -1169,9 +1235,12 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
// for large m, k (e.g. prefill FFN Down), use out-stationary version
if (m >= 128 && k > n && n > 1024) {
FARF(MEDIUM, "hmx_matmul_qk: OUT-STATIONARY path m=%d k=%d n=%d type=%d (K_BLOCK=512, %d K-iters with fp16 intermediate)",
m, k, n, weight_type, (k + 511) / 512);
return mat_mul_qk_0_d16a32_out_stationary(ctx, dst, activation, permuted_weight, m, k, n, weight_type);
int rc = mat_mul_qk_0_d16a32_out_stationary(ctx, dst, activation, permuted_weight, m, k, n, weight_type);
if (rc != FALLBACK_TO_STANDARD) {
return rc; // 0 success, -1 error
}
FARF(MEDIUM, "hmx_matmul_qk: out-stationary fallback to standard m=%d k=%d n=%d", m, k, n);
// fall through to standard path
}
size_t row_stride = get_x4x2_row_stride(weight_type, k);
@@ -1197,9 +1266,10 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
}
size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0;
if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256,
per_n_cost, /*per_m=*/vec_dot_size, per_mn_cost,
m, n, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
// Quantized weight: dequant ~1.5x more expensive per element than activation load.
if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, per_n_cost, /*per_m=*/vec_dot_size, per_mn_cost, m, n,
/*m_block_cost=*/(size_t) n * 3,
/*n_block_cost=*/(size_t) m * 2, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d pipe=%d budget=%zu)",
__func__, m, k, n, use_pipeline, vtcm_budget);
return -1;
@@ -1256,9 +1326,8 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
use_pipeline ? "PIPELINE" : "SEQUENTIAL", m_chunk_n_rows, n_chunk_n_cols,
(size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget);
HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
if (!use_pipeline) {
HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
// transfer activation matrix chunk into VTCM
size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
@@ -1318,20 +1387,22 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
TIMER_STOP(output_store);
}
}
HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
} else {
// 4-stage pipeline: DMA load (A), dequantize (B), HMX matmul (C), store (D)
// stage B and D (dequantize and store) are expected to be on the critical path
// HMX compute (C) runs on dedicated worker thread, overlapping with HVX stages (B, D).
// A --> B: vtcm_qweight, 1 buffer
// B --> C: vtcm_weight0/vtcm_weight1, 2 buffers
// C --> D: vtcm_output0/vtcm_output1, 2 buffers
//
// LD ||A3| | B3 ||
// MM || C2 ||
// ST || D1 | ||
// Async timeline (C overlaps B+D):
// main+HVX: [A0][Act][B0][A1][sub C0][B1‖C0][A2][wait,sub C1][D0+B2‖C1][wait,sub C2][D1‖C2][wait][D2]
// HMX queue: [████ C0 ████████][████ C1 ████████████][████ C2 ████████]
int n_chunk_cnt = hmx_ceil_div(n, n_chunk_n_cols);
hmx_matmul_job_t job_slots[2]; // persistent double-buffered job descriptors
for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
@@ -1352,31 +1423,34 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
transfer_activation_chunk_threaded(ctx, vtcm_activation, activation_chunk, n_rows, k, k);
}
// prologue: B0, A1, C0, B1
// prologue: B0, A1, submit C0 (async), B1 (overlaps C0)
{
// B0
// B0: wait for DMA, dequant weight chunk 0
dma_queue_pop(ctx->dma[0]);
dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type);
// A1
// A1: issue DMA for weight chunk 1
const size_t n_cols_A1 = hex_smin(n - 1 * n_chunk_n_cols, n_chunk_n_cols);
if (1 < n_chunk_cnt) {
const uint8_t *qweight_chunk_A1 = permuted_weight + n_chunk_n_cols * row_stride;
dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A1), row_stride, row_stride, row_stride, n_cols_A1);
}
// C0
core_dot_chunk_fp16((__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[0], vtcm_scales,
hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
// submit C0 (non-blocking — HMX worker executes in parallel)
hmx_matmul_job_init(&job_slots[0], (__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation,
(__fp16 *) vtcm_weight_bufs[0], vtcm_scales,
hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS),
hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[0]));
// B1
// B1: DMA pop + dequant (runs in parallel with C0 on HMX worker)
if (1 < n_chunk_cnt) {
dma_queue_pop(ctx->dma[0]);
dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type);
}
}
// main loop
// main loop: wait C_i → submit C_{i+1} → D_i + B_{i+2} (parallel with C_{i+1})
for (int i = 0; i < n_chunk_cnt; ++i) {
const size_t nc = i * n_chunk_n_cols;
const size_t nc_p1 = nc + 1 * n_chunk_n_cols;
@@ -1386,36 +1460,41 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
const size_t n_cols_p1 = hex_smin(n - nc_p1, n_chunk_n_cols);
const size_t n_cols_p2 = hex_smin(n - nc_p2, n_chunk_n_cols);
// issue A_{i+2}
// issue A_{i+2}: DMA push (non-blocking)
if (i + 2 < n_chunk_cnt) {
const uint8_t *qweight_chunk_p2 = permuted_weight + nc_p2 * row_stride;
dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_p2), row_stride, row_stride, row_stride, n_cols_p2);
}
// wait for HMX (C_{i}) -- C_{i} is done
// wait C_i: block until prologue/previous C completes
hmx_queue_pop(ctx->hmx_queue);
// result of B_{i+1} (input of C_{i+1}) should be ready now
// issue C_{i+1}
// submit C_{i+1} (non-blocking, overlaps with D_i + B_{i+2} below)
// job_slots[(i+1)%2] is safe: C_i just completed, freeing slot i%2's
// counterpart — and (i+1)%2 was last used by C_{i-1} which completed
// before C_i was submitted.
if (i + 1 < n_chunk_cnt) {
core_dot_chunk_fp16((__fp16 *) vtcm_output_bufs[(i + 1) % 2], (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[(i + 1) % 2], vtcm_scales,
hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
hmx_matmul_job_init(&job_slots[(i + 1) % 2], (__fp16 *) vtcm_output_bufs[(i + 1) % 2],
(__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[(i + 1) % 2],
vtcm_scales, hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS),
hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[(i + 1) % 2]));
}
// compute D_{i}
// D_i: store output (multi-thread HVX, parallel with C_{i+1})
float *output_chunk = dst + (mr * n + nc);
transfer_output_chunk_threaded(ctx, output_chunk, vtcm_output_bufs[i % 2], n_rows, n_cols, n);
// wait for DMA (A_{i+2}), compute B_{i+2}
// B_{i+2}: DMA pop + dequant (multi-thread HVX, parallel with C_{i+1})
if (i + 2 < n_chunk_cnt) {
dma_queue_pop(ctx->dma[0]);
dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type);
}
}
}
}
HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
hmx_queue_suspend(ctx->hmx_queue);
}
TIMER_STOP(total);
@@ -1434,10 +1513,13 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
}
// C += AB
void core_mma_chunk_fp16(__fp16 *c, const __fp16 *a, const __fp16 *b, const __fp16 *col_scales, const __fp16 *eye_tile,
void core_mma_chunk_fp16(__fp16 *restrict c, const __fp16 *restrict a, const __fp16 *restrict b, const __fp16 *restrict col_scales, const __fp16 *restrict eye_tile,
int n_row_tiles, int n_col_tiles, int n_dot_tiles, bool zero_init) {
__builtin_assume(n_row_tiles > 0);
__builtin_assume(n_col_tiles > 0);
__builtin_assume(n_dot_tiles > 0);
hmx_set_output_scales(col_scales);
Q6_bias_mxmem2_A((void *)col_scales);
for (int i = 0; i < n_row_tiles; ++i) {
for (int j = 0; j < n_col_tiles; ++j) {
@@ -1448,15 +1530,17 @@ void core_mma_chunk_fp16(__fp16 *c, const __fp16 *a, const __fp16 *b, const __fp
__fp16 *accum_tile = c + (i * n_col_tiles + j) * HMX_FP16_TILE_N_ELMS;
if (!zero_init) {
hmx_load_tile_pair_fp16(accum_tile, eye_tile);
Q6_activation_hf_mxmem_RR((unsigned int)accum_tile, 2047);
Q6_weight_hf_mxmem_RR((unsigned int)eye_tile, 2047);
}
for (int k = 0; k < n_dot_tiles; ++k) {
int offset = k * HMX_FP16_TILE_N_ELMS;
hmx_load_tile_pair_fp16(row_tiles + offset, col_tiles + offset);
Q6_activation_hf_mxmem_RR((unsigned int)row_tiles, 2047);
Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, 2047);
row_tiles += HMX_FP16_TILE_N_ELMS;
col_tiles += HMX_FP16_TILE_N_ELMS;
}
hmx_consume_accumulator_fp16(accum_tile);
Q6_mxmem_AR_after_hf(accum_tile, 0);
}
}
}
@@ -1540,12 +1624,41 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
const size_t vtcm_budget = ctx->vtcm_size;
const size_t M_BLOCK_SIZE = 512;
const size_t N_BLOCK_SIZE = 512;
const size_t K_BLOCK_SIZE = 512;
const size_t K_BLOCK_SIZE = 1024;
// Compute precise buffer sizes
// Fallback: if k doesn't need K-blocking, out-stationary has no advantage
const size_t k_iters_check = (k + K_BLOCK_SIZE - 1) / K_BLOCK_SIZE;
if (k_iters_check <= 1) {
FARF(MEDIUM, "%s: K_BLK=%zu >= k=%d, fallback to standard path", __func__, K_BLOCK_SIZE, k);
return FALLBACK_TO_STANDARD;
}
// Dynamic M,N search via hmx_compute_chunks
const size_t sub_row_stride_alloc = get_x4x2_row_stride(weight_type, K_BLOCK_SIZE);
const size_t per_m = K_BLOCK_SIZE * sizeof(float) // scratch1: M×K×4 (act DMA staging F32)
+ K_BLOCK_SIZE * sizeof(__fp16); // activation: M×K×2 (F16 tiles)
const size_t per_n = sub_row_stride_alloc // scratch0: N×sub_row(K) (packed quant)
+ K_BLOCK_SIZE * sizeof(__fp16); // weight: N×K×2 (F16 tiles)
const size_t per_mn = sizeof(__fp16); // output: M×N×2 (out-stationary)
// Alignment margin: hex_align_up can add up to 2047 bytes per buffer;
// scratch1 (mc×6144) is naturally 2048-aligned, remaining 4 buffers need margin
const size_t align_margin = 4 * HMX_FP16_TILE_SIZE;
const size_t overhead = HMX_FP16_TILE_SIZE + 256 + align_margin; // eye_tile + scales + alignment
size_t M_BLOCK_SIZE, N_BLOCK_SIZE, vtcm_used;
// Cost-based search: minimize ceil(m/mc)*m_block_cost + ceil(n/nc)*n_block_cost.
// From profiling: wt_dequant per element ≈ 1.5× activation load per element.
// m_block_cost = n*3: each extra M-block re-dequants all N×K weight (expensive).
// n_block_cost = m*2: each extra N-block re-loads all M×K activation (cheaper).
const size_t m_block_cost = (size_t) n * 3;
const size_t n_block_cost = (size_t) m * 2;
if (hmx_compute_chunks(vtcm_budget, overhead, per_n, per_m, per_mn, m, n, m_block_cost, n_block_cost, &M_BLOCK_SIZE,
&N_BLOCK_SIZE, &vtcm_used) != 0) {
FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget);
return -1;
}
// Compute precise buffer sizes from searched M,N and fixed K
const size_t weight_size = hex_align_up(N_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
const size_t act_size = hex_align_up(M_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
const size_t out_size = hex_align_up(M_BLOCK_SIZE * N_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
@@ -1554,7 +1667,8 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
const size_t total_vtcm = weight_size + act_size + out_size + scratch0_sz + scratch1_sz + HMX_FP16_TILE_SIZE + 256;
if (total_vtcm > vtcm_budget) {
FARF(HIGH, "%s: VTCM too small: need %zu have %zu (m=%d k=%d n=%d)", __func__, total_vtcm, vtcm_budget, m, k, n);
FARF(HIGH, "%s: VTCM overflow after search: need %zu have %zu (M=%zu N=%zu K=%zu)", __func__, total_vtcm,
vtcm_budget, M_BLOCK_SIZE, N_BLOCK_SIZE, K_BLOCK_SIZE);
return -1;
}
@@ -1568,8 +1682,8 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
__fp16 *vtcm_scales = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256);
assert((size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base) <= vtcm_budget);
FARF(MEDIUM, "%s: m=%d k=%d n=%d wtype=%d vtcm=%zu/%zu", __func__, m, k, n, weight_type,
(size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget);
FARF(HIGH, "hmx-mm: m=%d k=%d n=%d wtype=%d block M=%zu N=%zu K=%zu vtcm=%zu/%zu", __func__, m, k, n, weight_type,
M_BLOCK_SIZE, N_BLOCK_SIZE, K_BLOCK_SIZE, (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget);
// initialize eye tile (32x32 identity matrix)
{

View File

@@ -0,0 +1,158 @@
#pragma clang diagnostic ignored "-Wunused-function"
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <qurt_thread.h>
#include <qurt_futex.h>
#include <HAP_compute_res.h>
#include "hmx-queue.h"
#define QURT_LOWEST_PRIO (254)
static inline void hmx_lock(struct hmx_queue *q)
{
if (!q->hmx_locked) {
HAP_compute_res_hmx_lock(q->hap_rctx);
q->hmx_locked = true;
}
}
static inline void hmx_unlock(struct hmx_queue *q)
{
if (q->hmx_locked) {
HAP_compute_res_hmx_unlock(q->hap_rctx);
q->hmx_locked = false;
}
}
static inline void hmx_queue_process(struct hmx_queue *q, bool* killed) {
unsigned int ir = atomic_load(&q->idx_read);
while (ir != atomic_load(&q->idx_write)) {
struct hmx_queue_desc *d = &q->desc[ir];
if (!d->done) {
FARF(HIGH, "hmx-queue-process: ir %u func %p data %p", ir, d->func, d->data);
enum hmx_queue_signal sig = (enum hmx_queue_signal) (unsigned int) d->func;
switch (sig) {
case HMX_QUEUE_NOOP: /* noop */; break;
case HMX_QUEUE_KILL: *killed = true; break;
case HMX_QUEUE_SUSPEND: hmx_unlock(q); break;
default:
hmx_lock(q);
d->func(d->data);
break;
}
atomic_fetch_add(&d->done, 1);
}
ir = (ir + 1) & q->idx_mask;
atomic_store(&q->idx_read, ir);
}
}
static void hmx_queue_thread(void * arg) {
struct hmx_queue * q = (struct hmx_queue *) arg;
FARF(HIGH, "hmx-queue-thread: started");
bool killed = false;
unsigned int poll_cnt = HMX_QUEUE_POLL_COUNT;
unsigned int prev_seqn = 0;
while (!killed) {
unsigned int seqn = atomic_load(&q->seqn);
if (seqn == prev_seqn) {
if (--poll_cnt) { hex_pause(); continue; }
FARF(HIGH, "hmx-queue-thread: sleeping");
qurt_futex_wait(&q->seqn, prev_seqn);
continue;
}
prev_seqn = seqn;
poll_cnt = HMX_QUEUE_POLL_COUNT;
FARF(HIGH, "hmx-queue-thread: new work");
hmx_queue_process(q, &killed);
}
FARF(HIGH, "hmx-queue-thread: stopped");
}
struct hmx_queue * hmx_queue_create(size_t capacity, uint32_t hap_rctx) {
capacity = hex_ceil_pow2(capacity);
struct hmx_queue * q = (struct hmx_queue *) memalign(32, sizeof(struct hmx_queue));
if (q == NULL) {
FARF(ERROR, "%s: failed to allocate DMA queue\n", __FUNCTION__);
return NULL;
}
memset(q, 0, sizeof(struct hmx_queue));
q->capacity = capacity;
q->idx_mask = capacity - 1;
q->hap_rctx = hap_rctx;
q->desc = (struct hmx_queue_desc *) memalign(64, capacity * sizeof(struct hmx_queue_desc));
if (!q->desc) {
FARF(ERROR, "hmx-queue: failed to allocate HMX queue descriptors\n");
return NULL;
}
memset(q->desc, 0, capacity * sizeof(struct hmx_queue_desc));
const size_t stack_size = HMX_QUEUE_THREAD_STACK_SIZE;
q->stack = (unsigned char *) memalign(64, stack_size);
if (!q->stack) {
FARF(ERROR, "hmx-queue: thread stack allocation failed (%zu bytes)", stack_size);
return NULL;
}
memset(q->stack, 0, stack_size);
// Match caller thread priority (same pattern as worker-pool.c).
int prio = qurt_thread_get_priority(qurt_thread_get_id());
if (prio < 1) {
prio = 1;
}
if (prio > QURT_LOWEST_PRIO) {
prio = QURT_LOWEST_PRIO;
}
qurt_thread_attr_t attr;
qurt_thread_attr_init(&attr);
qurt_thread_attr_set_stack_addr(&attr, q->stack);
qurt_thread_attr_set_stack_size(&attr, stack_size);
qurt_thread_attr_set_priority(&attr, prio);
qurt_thread_attr_set_name(&attr, "hmx-queue");
int err = qurt_thread_create(&q->thread, &attr, hmx_queue_thread, q);
if (err) {
FARF(ERROR, "hmx-worker: thread create failed (%d)", err);
return NULL;
}
FARF(HIGH, "hmx-queue: capacity %u\n", capacity);
return q;
}
void hmx_queue_delete(struct hmx_queue * q) {
if (!q) {
return;
}
// Tell the worker to exit.
hmx_queue_flush(q);
hmx_queue_signal(q, HMX_QUEUE_KILL);
hmx_queue_flush(q);
int status;
qurt_thread_join(q->thread, &status);
free(q->desc);
free(q->stack);
free(q);
}

View File

@@ -0,0 +1,134 @@
#ifndef HMX_QUEUE_H
#define HMX_QUEUE_H
#include <stdbool.h>
#include <stdint.h>
#include <stdatomic.h>
#include <hexagon_types.h>
#include <qurt_thread.h>
#include <qurt_futex.h>
#include <HAP_farf.h>
#include "hex-utils.h"
#ifdef __cplusplus
extern "C" {
#endif
#define HMX_QUEUE_THREAD_STACK_SIZE (16 * 1024)
#define HMX_QUEUE_POLL_COUNT 2000
typedef void (*hmx_queue_func)(void *);
// Dummy funcs used as signals
enum hmx_queue_signal {
HMX_QUEUE_NOOP = 0, // aka NULL
HMX_QUEUE_SUSPEND,
HMX_QUEUE_KILL
};
struct hmx_queue_desc {
hmx_queue_func func;
void * data;
atomic_uint done;
};
struct hmx_queue {
struct hmx_queue_desc * desc;
atomic_uint idx_write; // updated by producer (push)
atomic_uint idx_read; // updated by consumer (process)
unsigned int idx_pop; // updated by producer (pop)
uint32_t idx_mask;
uint32_t capacity;
atomic_uint seqn; // incremented for all pushes, used with futex
qurt_thread_t thread;
void * stack;
uint32_t hap_rctx;
bool hmx_locked;
};
struct hmx_queue * hmx_queue_create(size_t capacity, uint32_t hap_rctx);
void hmx_queue_delete(struct hmx_queue * q);
static inline struct hmx_queue_desc hmx_queue_make_desc(hmx_queue_func func, void * data) {
struct hmx_queue_desc d = { func, data };
return d;
}
static inline bool hmx_queue_push(struct hmx_queue * q, struct hmx_queue_desc d) {
unsigned int ir = atomic_load(&q->idx_read);
unsigned int iw = q->idx_write;
if (((iw + 1) & q->idx_mask) == ir) {
FARF(HIGH, "hmx-queue-push: queue is full\n");
return false;
}
atomic_store(&d.done, 0);
FARF(HIGH, "hmx-queue-push: iw %u func %p data %p\n", iw, d.func, d.data);
q->desc[iw] = d;
atomic_store(&q->idx_write, (iw + 1) & q->idx_mask);
// wake up our thread
atomic_fetch_add(&q->seqn, 1);
qurt_futex_wake(&q->seqn, 1);
return true;
}
static inline bool hmx_queue_signal(struct hmx_queue *q, enum hmx_queue_signal sig) {
return hmx_queue_push(q, hmx_queue_make_desc((hmx_queue_func) sig, NULL));
}
static inline bool hmx_queue_empty(struct hmx_queue * q) {
return q->idx_pop == q->idx_write;
}
static inline uint32_t hmx_queue_depth(struct hmx_queue * q) {
return (q->idx_read - q->idx_read) & q->idx_mask;
}
static inline uint32_t hmx_queue_capacity(struct hmx_queue * q) {
return q->capacity;
}
static inline struct hmx_queue_desc hmx_queue_pop(struct hmx_queue * q) {
unsigned int ip = q->idx_pop;
unsigned int iw = q->idx_write;
struct hmx_queue_desc rd = { NULL, NULL };
if (ip == iw) {
return rd;
}
// Wait for desc to complete
struct hmx_queue_desc * d = &q->desc[ip];
while (!atomic_load(&d->done)) {
FARF(HIGH, "hmx-queue-pop: waiting for HMX queue : %u\n", ip);
hex_pause();
}
rd = *d;
q->idx_pop = (ip + 1) & q->idx_mask;
FARF(HIGH, "hmx-queue-pop: ip %u func %p data %p\n", ip, rd.func, rd.data);
return rd;
}
static inline void hmx_queue_flush(struct hmx_queue * q) {
while (hmx_queue_pop(q).func != NULL) ;
}
static inline void hmx_queue_suspend(struct hmx_queue *q) {
hmx_queue_signal(q, HMX_QUEUE_SUSPEND);
hmx_queue_flush(q);
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif /* HMX_QUEUE_H */

View File

@@ -14,10 +14,6 @@
#define HMX_INLINE_ALWAYS inline __attribute__((unused, always_inline))
static HMX_INLINE_ALWAYS void hmx_set_output_scales(const void *scales) {
asm volatile("bias = mxmem2(%0)" :: "r"(scales));
}
// Initialise aligned 256-byte area with scale vector + zero padding.
static HMX_INLINE_ALWAYS void hmx_init_column_scales(void *out_scales, HVX_Vector v_scale) {
HVX_Vector *pv = (HVX_Vector *)out_scales;
@@ -25,58 +21,6 @@ static HMX_INLINE_ALWAYS void hmx_init_column_scales(void *out_scales, HVX_Vecto
*pv = Q6_V_vzero();
}
// Load multiple contiguous tiles with :deep streaming.
// Rt = total region size - 1; the hardware streams through [Rs, Rs + Rt].
// IMPORTANT: the tile region [Rs, Rs + Rt] must NOT cross a VTCM 4 MB bank
// boundary, otherwise the mxmem instruction will raise a precise bus error.
// Callers must ensure their VTCM layout satisfies this constraint.
static HMX_INLINE_ALWAYS void hmx_load_tiles_fp16(const __fp16 *row_tiles,
const __fp16 *col_tiles,
size_t n_tiles) {
size_t limit = n_tiles * HMX_FP16_TILE_SIZE - 1;
asm volatile(
"{ activation.hf = mxmem(%0, %1):deep\n"
"weight.hf = mxmem(%2, %3) }\n"
:: "r"(row_tiles), "r"(limit), "r"(col_tiles), "r"(limit)
: "memory");
}
// Load a single activation+weight tile pair (no :deep streaming).
// Rt defines the accessible region [Rs, Rs+Rt]. Following the reference formula
// (limit = n_tiles * HMX_FP16_TILE_SIZE - 1), for a single tile Rt = 2047.
// The original code used Rt=0x7FFF (32 KB region); when dynamic VTCM allocation
// places a tile near a 4 MB bank boundary, the oversized region crosses it and
// triggers a precise bus error (0x2601). Rt=2047 confines accesses to exactly
// one 2048-byte tile while covering all 16 HVX vectors (offsets 0..2047).
static HMX_INLINE_ALWAYS void hmx_load_tile_pair_fp16(const __fp16 *act_tile,
const __fp16 *wt_tile) {
asm volatile(
"{ activation.hf = mxmem(%0, %1)\n"
"weight.hf = mxmem(%2, %3) }\n"
:: "r"(act_tile), "r"(2047),
"r"(wt_tile), "r"(2047)
: "memory");
}
static HMX_INLINE_ALWAYS void hmx_consume_accumulator_fp16(__fp16 *out) {
// Use the combined convert-and-store instruction (matches the reference
// Q6_mxmem_AR_after_hf intrinsic). The previous two-instruction sequence
// "cvt.hf = acc(2); mxmem = cvt" used an undocumented Rs=2 parameter.
asm volatile(
"mxmem(%0, %1):after.hf = acc\n"
:: "r"(out), "r"(0)
: "memory");
}
// Compute inner product of two vectors of tiles and store result.
static HMX_INLINE_ALWAYS void hmx_dot_fp16(__fp16 *out,
const __fp16 *row_tiles,
const __fp16 *col_tiles,
size_t n_tiles) {
hmx_load_tiles_fp16(row_tiles, col_tiles, n_tiles);
hmx_consume_accumulator_fp16(out);
}
// --- VTCM sequential allocator (from htp-ops-lib/include/dsp/vtcm_mgr.h) ---
static inline uint8_t *vtcm_seq_alloc(uint8_t **vtcm_ptr, size_t size) {

View File

@@ -2,6 +2,7 @@
#define HTP_CTX_H
#include "hex-dma.h"
#include "hmx-queue.h"
#include "htp-ops.h"
#include "worker-pool.h"
@@ -30,6 +31,8 @@ struct htp_spad {
uint32_t size_per_thread; // size per thread
};
struct htp_context;
// Context while processing an Op
// TODO: fold this into the main context
struct htp_ops_context {
@@ -72,6 +75,10 @@ struct htp_context {
atomic_bool vtcm_needs_release;
struct htp_ops_context octx;
#ifdef HTP_HAS_HMX
struct hmx_queue * hmx_queue; // Async HMX queue for pipeline overlap
#endif
};
int op_matmul(struct htp_ops_context * octx);

View File

@@ -91,7 +91,12 @@ enum htp_op_code {
#define HTP_OP_MAX_BUFS 8
#define HTP_OP_MAX_REQS 256
#define HTP_OP_MAX_TENSORS (HTP_OP_MAX_REQS * HTP_OP_MAX_INPUTS + HTP_OP_MAX_REQS)
#if __HVX_ARCH__ < 75
#define HTP_OP_MAX_VMEM (3167538380u)
#else
#define HTP_OP_MAX_VMEM (3221225472u)
#endif
enum htp_tensor_flags {
HTP_TENSOR_COMPUTE = (1U << 0), // Tensor buffer temporal compute data (not weights)

View File

@@ -116,9 +116,14 @@ static inline HVX_VectorPred hvx_vec_is_nan_f16(HVX_Vector v) {
}
static inline HVX_Vector hvx_vec_f32_to_f16_shuff(HVX_Vector v0, HVX_Vector v1) {
#if __HVX_ARCH__ >= 81
HVX_Vector q0 = Q6_Vqf32_equals_Vsf(v0);
HVX_Vector q1 = Q6_Vqf32_equals_Vsf(v1);
#else
const HVX_Vector zero = Q6_V_vzero();
HVX_Vector q0 = Q6_Vqf32_vadd_VsfVsf(v0, zero);
HVX_Vector q1 = Q6_Vqf32_vadd_VsfVsf(v1, zero);
#endif
return Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(q1, q0));
}

View File

@@ -18,8 +18,9 @@
#include <remote.h>
#include <string.h>
#include "hex-dma.h"
#include "hex-utils.h"
#include "hex-dma.h"
#include "hmx-queue.h"
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
@@ -324,6 +325,14 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
#ifdef HTP_HAS_HMX
ctx->hmx_enabled = use_hmx;
ctx->hmx_queue = NULL;
if (use_hmx) {
ctx->hmx_queue = hmx_queue_create(16, ctx->vtcm_rctx);
if (!ctx->hmx_queue) {
FARF(ERROR, "hmx-queue-create failed");
ctx->hmx_enabled = false;
}
}
FARF(HIGH, "HMX %s (use_hmx=%d)", ctx->hmx_enabled ? "enabled" : "disabled", use_hmx);
#endif
@@ -389,7 +398,11 @@ AEEResult htp_iface_stop(remote_handle64 handle) {
}
#ifdef HTP_HAS_HMX
ctx->hmx_enabled = 0;
if (ctx->hmx_queue) {
hmx_queue_delete(ctx->hmx_queue);
ctx->hmx_queue = NULL;
}
ctx->hmx_enabled = false;
#endif
vtcm_free(ctx);

View File

@@ -0,0 +1,161 @@
{%- macro render_content(content, num_img_tokens, num_video_frames) -%}
{%- if content is string -%}
{{- content -}}
{%- elif content is sequence -%}
{%- set ns = namespace(out="", prev_was_text=false) -%}
{%- for item in content -%}
{%- set item_type = item.get("type") -%}
{%- if item_type == "text" or item.get("text") is not none -%}
{%- set text = item.get("text", "") -%}
{%- if text -%}
{%- if ns.prev_was_text -%}
{%- set ns.out = ns.out ~ " " -%}
{%- endif -%}
{%- set ns.out = ns.out ~ text -%}
{%- endif -%}
{%- set ns.prev_was_text = text != "" -%}
{%- elif item_type in ["image", "image_url"] or item.get("image") is not none or item.get("image_url") is not none -%}
{%- set ns.out = ns.out ~ "<image>" ~ ("<REKA_IMG_TOKEN>" * num_img_tokens) ~ "</image>" -%}
{%- set ns.prev_was_text = false -%}
{%- elif item_type in ["video", "video_url"] or item.get("video") is not none or item.get("video_url") is not none -%}
{%- set repeat_tokens = num_img_tokens * num_video_frames -%}
{%- set ns.out = ns.out ~ "<video>" ~ ("<REKA_IMG_TOKEN>" * repeat_tokens) ~ "</video>" -%}
{%- set ns.prev_was_text = false -%}
{%- endif -%}
{%- endfor -%}
{{- ns.out -}}
{%- endif -%}
{%- endmacro -%}
{%- set ns = namespace(out="", last_query_index=messages|length - 1) -%}
{%- for msg in messages[::-1] -%}
{%- set idx = messages|length - 1 - loop.index0 -%}
{%- if msg.get("role") == "user" -%}
{%- set content = msg.get("content", "") -%}
{%- if not (content is string and content.startswith("<tool_response>") and content.endswith("</tool_response>")) -%}
{%- set ns.last_query_index = idx -%}
{%- break -%}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
{%- set last_query_index = ns.last_query_index -%}
{%- set num_img_tokens = num_img_tokens | default(64, true) | int -%}
{%- set num_video_frames = num_video_frames | default(6, true) | int -%}
{%- set start_idx = 0 -%}
{%- set system_text = "" -%}
{%- if messages|length > 0 and messages[0].get("role") in ["system", "developer"] -%}
{%- set system_text = render_content(messages[0].get("content", ""), num_img_tokens, num_video_frames) -%}
{%- set start_idx = 1 -%}
{%- endif -%}
{%- if tools or system_text -%}
{%- set preamble_ns = namespace(text="") -%}
{%- if system_text -%}
{%- set preamble_ns.text = "system: " ~ system_text -%}
{%- endif -%}
{%- if tools -%}
{%- if preamble_ns.text -%}
{%- set preamble_ns.text = preamble_ns.text ~ "\n\n" -%}
{%- else -%}
{%- set preamble_ns.text = "system: " -%}
{%- endif -%}
{%- set preamble_ns.text = preamble_ns.text
~ "# Tools\n\n"
~ "You may call one or more functions to assist with the user query.\n\n"
~ "You are provided with function signatures within <tools></tools> XML tags:\n"
~ "<tools>" -%}
{%- for tool in tools -%}
{%- set preamble_ns.text = preamble_ns.text ~ "\n" ~ (tool | tojson(ensure_ascii=True)) -%}
{%- endfor -%}
{%- set preamble_ns.text = preamble_ns.text
~ "\n</tools>\n\n"
~ "For each function call, return a json object with function name and arguments "
~ "within <tool_call></tool_call> XML tags:\n"
~ "<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>" -%}
{%- endif -%}
{%- set ns.out = ns.out ~ preamble_ns.text ~ "\n\n<sep>" -%}
{%- endif -%}
{%- for idx in range(start_idx, messages|length) -%}
{%- set message = messages[idx] -%}
{%- set role = message.get("role") -%}
{%- set content = message.get("content") -%}
{%- if role == "user" -%}
{%- set prefix_ns = namespace(value="human: ") -%}
{%- if content is sequence and content is not string -%}
{%- for item in content -%}
{%- if item.get("type") == "text" or item.get("text") is not none -%}
{%- set text = item.get("text", "") -%}
{%- if text -%}
{%- break -%}
{%- endif -%}
{%- elif item.get("type") in ["image", "image_url", "video", "video_url"] -%}
{%- set prefix_ns.value = "human:" -%}
{%- break -%}
{%- endif -%}
{%- endfor -%}
{%- endif -%}
{%- set ns.out = ns.out ~ prefix_ns.value ~ render_content(content, num_img_tokens, num_video_frames) ~ "<sep>" -%}
{%- elif role == "assistant" -%}
{%- set tool_calls = message.get("tool_calls") -%}
{%- set content_text = render_content(content, num_img_tokens, num_video_frames) -%}
{%- set reasoning_text = "" -%}
{%- if message.get("reasoning_content") is string -%}
{%- set reasoning_text = message.get("reasoning_content") -%}
{%- elif "</think>" in content_text -%}
{%- set reasoning_text = content_text.split("</think>", 1)[0].rstrip("\n").split("<think>")[-1].lstrip("\n") -%}
{%- set content_text = content_text.split("</think>", 1)[1].lstrip("\n") -%}
{%- endif -%}
{%- set ns.out = ns.out ~ "assistant: " -%}
{%- set include_thinking = enable_thinking is true
and idx > last_query_index
and (idx == messages|length - 1 or reasoning_text)
-%}
{%- if include_thinking -%}
{%- set ns.out = ns.out ~ "<think>\n" ~ (reasoning_text.strip() ) ~ "\n</think>\n\n" -%}
{%- endif -%}
{%- set ns.out = ns.out ~ content_text -%}
{%- if tool_calls -%}
{%- if content_text and not ns.out.endswith("\n") -%}
{%- set ns.out = ns.out ~ "\n" -%}
{%- endif -%}
{%- for tool_call in tool_calls -%}
{%- if tool_call.get("function") is not none -%}
{%- set tool_call = tool_call.get("function") -%}
{%- endif -%}
{%- set arguments = tool_call.get("arguments", {}) -%}
{%- if arguments is string -%}
{%- set arguments_json = arguments -%}
{%- elif arguments is mapping -%}
{%- set arguments_json = arguments | tojson(ensure_ascii=True) -%}
{%- else -%}
{%- set arguments_json = arguments | tojson(ensure_ascii=True) -%}
{%- endif -%}
{%- set ns.out = ns.out
~ "<tool_call>\n"
~ "{\"name\": \"" ~ tool_call.get("name", "") ~ "\", \"arguments\": "
~ arguments_json
~ "}\n</tool_call>" -%}
{%- endfor -%}
{%- endif -%}
{%- if not (continue_final_message and idx == messages|length - 1) -%}
{%- set ns.out = ns.out ~ "\n\n<sep>" -%}
{%- endif -%}
{%- elif role == "tool" -%}
{%- if idx == start_idx or messages[idx - 1].get("role") != "tool" -%}
{%- set ns.out = ns.out ~ "human: " -%}
{%- endif -%}
{%- set response_text = render_content(content, num_img_tokens, num_video_frames) -%}
{%- set ns.out = ns.out ~ "<tool_response>\n" ~ response_text ~ "\n</tool_response>" -%}
{%- if idx == messages|length - 1 or messages[idx + 1].get("role") != "tool" -%}
{%- set ns.out = ns.out ~ "<sep>" -%}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt
and (messages|length == 0 or messages[-1].get("role") != "assistant")
-%}
{%- if enable_thinking is true -%}
{%- set ns.out = ns.out ~ "assistant: <think>\n" -%}
{%- else -%}
{%- set ns.out = ns.out ~ "assistant:" -%}
{%- endif -%}
{%- endif -%}
{{- ns.out -}}

View File

@@ -1,9 +1,9 @@
set(TARGET llama-vdot)
add_executable(${TARGET} vdot.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-q8dot)
add_executable(${TARGET} q8dot.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -10,7 +10,7 @@ function(llama_build source)
endif()
add_executable(${TEST_TARGET} ${TEST_SOURCES})
target_link_libraries(${TEST_TARGET} PRIVATE common)
target_link_libraries(${TEST_TARGET} PRIVATE llama llama-common)
if (LLAMA_TESTS_INSTALL)
install(TARGETS ${TEST_TARGET} RUNTIME)
endif()
@@ -105,7 +105,7 @@ function(llama_build_and_test source)
if (LLAMA_TESTS_INSTALL)
install(TARGETS ${TEST_TARGET} RUNTIME)
endif()
target_link_libraries(${TEST_TARGET} PRIVATE common)
target_link_libraries(${TEST_TARGET} PRIVATE llama-common)
add_test(
NAME ${TEST_TARGET}
@@ -269,11 +269,11 @@ if (TARGET cpp-httplib)
get_target_property(_cpp_httplib_defs cpp-httplib INTERFACE_COMPILE_DEFINITIONS)
if (_cpp_httplib_defs MATCHES "CPPHTTPLIB_OPENSSL_SUPPORT")
add_library(gguf-model-data STATIC gguf-model-data.cpp)
target_link_libraries(gguf-model-data PRIVATE common cpp-httplib)
target_link_libraries(gguf-model-data PRIVATE llama-common cpp-httplib)
target_include_directories(gguf-model-data PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
add_executable(test-gguf-model-data test-gguf-model-data.cpp)
target_link_libraries(test-gguf-model-data PRIVATE gguf-model-data common)
target_link_libraries(test-gguf-model-data PRIVATE gguf-model-data llama-common)
llama_test(test-gguf-model-data LABEL "model")
# test-quant-type-selection requires gguf-model-data for remote model metadata

View File

@@ -2164,7 +2164,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
tst.test(
"<tool_call>\n"
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}"
"</tool_call>")
.tools({ special_function_tool })
.expect(message_assist_call)
@@ -2172,7 +2172,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
tst.test(
"Hello, world!\nWhat's up?<tool_call>\n"
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}"
"</tool_call>")
.tools({ special_function_tool })
.expect(message_assist_call_content)
@@ -3329,6 +3329,92 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
.run();
}
// Reka-Edge tests - uses native JSON format with per-call wrapper
{
auto tst = peg_tester("models/templates/Reka-Edge.jinja", detailed_debug);
// Basic content only
tst.test("Hello, world!\nWhat's up?").enable_thinking(false).expect(message_assist).run();
// Single tool call without reasoning
tst.test("<tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}</tool_call>")
.enable_thinking(false)
.tools({ special_function_tool })
.expect(message_assist_call)
.run();
// Tool call with string argument
tst.test("<tool_call>\n{\"name\": \"get_time\", \"arguments\": {\"city\": \"XYZCITY\"}}</tool_call>")
.enable_thinking(false)
.tools({ get_time_tool })
.expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}"))
.run();
// Tool call with reasoning (enable_thinking=true)
tst.test("I'm\nthinking</think><tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}</tool_call>")
.enable_thinking(true)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ special_function_tool })
.expect(message_assist_call_thoughts)
.run();
// Multiple tool calls (parallel)
tst.test(
"<tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}</tool_call>"
"<tool_call>\n{\"name\": \"special_function_with_opt\", \"arguments\": {\"arg1\": 1, \"arg2\": 2}}</tool_call>"
)
.enable_thinking(false)
.parallel_tool_calls(true)
.tools({
special_function_tool, special_function_tool_with_optional_param
})
.expect_tool_calls({
{ "special_function", R"({"arg1": 1})", {} },
{ "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
})
.run();
// Tool call with reasoning and content
tst.test("I need to call a function</think>"
"Let me check the time.<tool_call>\n{\"name\": \"get_time\", \"arguments\": {\"city\": \"XYZCITY\"}}</tool_call>")
.enable_thinking(true)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ get_time_tool })
.expect(message_with_reasoning_content_and_multiple_tool_calls(
"I need to call a function", "Let me check the time.", { { "get_time", "{\"city\":\"XYZCITY\"}" } }
))
.run();
// Partial tool call (streaming)
tst.test("<tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\":")
.tools({ special_function_tool })
.enable_thinking(false)
.is_partial(true)
.expect(simple_assist_msg("", "", "special_function", "{\"arg1\": "))
.run();
// Tool call with empty arguments
tst.test("<tool_call>\n{\"name\": \"empty_args\", \"arguments\": {}}</tool_call>")
.enable_thinking(false)
.tools({ empty_args_tool })
.expect(simple_assist_msg("", "", "empty_args", "{}"))
.run();
// fake tool call marker in reasoning
tst.test(
"Let me think about <tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 2}}</tool_call> hmm</think>"
"<tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}</tool_call>")
.enable_thinking(true)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ special_function_tool })
.expect_reasoning("Let me think about <tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 2}}</tool_call> hmm")
.expect_tool_calls({
{ "special_function", R"({"arg1": 1})", {} },
})
.run();
}
// Apertus-8B-Instruct tests - FUNC_NAME_AS_KEY format
// Format: <|tools_prefix|>[{"function_name": {...arguments...}}]<|tools_suffix|>
{

View File

@@ -1,10 +1,13 @@
#include "ggml.h"
#include "ggml-cpu.h"
#include "llama.h"
#include "build-info.h"
#include "common.h"
#include "../src/llama-model.h"
#include "ggml.h"
#include "ggml-cpu.h"
#include <algorithm>
#include <cassert>
#include <cinttypes>
@@ -298,7 +301,7 @@ int main(int argc, char ** argv) {
return 1;
}
print_build_info();
llama_print_build_info();
// load the model
fprintf(stderr, "Loading model\n");

View File

@@ -1,6 +1,6 @@
set(TARGET llama-batched-bench)
add_executable(${TARGET} batched-bench.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_TOOLS_INSTALL)

View File

@@ -1,6 +1,6 @@
set(TARGET llama-cli)
add_executable(${TARGET} cli.cpp)
target_link_libraries(${TARGET} PRIVATE server-context PUBLIC common ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE server-context PUBLIC llama-common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
include_directories(../server)

View File

@@ -1,6 +1,6 @@
set(TARGET llama-completion)
add_executable(${TARGET} completion.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_TOOLS_INSTALL)

View File

@@ -1,6 +1,6 @@
set(TARGET llama-cvector-generator)
add_executable(${TARGET} cvector-generator.cpp pca.hpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_TOOLS_INSTALL)

View File

@@ -2,6 +2,7 @@
#include "gguf.h"
#include "arg.h"
#include "build-info.h"
#include "common.h"
#include "llama.h"
#include "pca.hpp"
@@ -420,7 +421,7 @@ int main(int argc, char ** argv) {
params.cb_eval_user_data = &cb_data;
params.warmup = false;
print_build_info();
llama_print_build_info();
llama_backend_init();
llama_numa_init(params.numa);

View File

@@ -1,6 +1,6 @@
set(TARGET llama-export-lora)
add_executable(${TARGET} export-lora.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_TOOLS_INSTALL)

View File

@@ -1,6 +1,6 @@
set(TARGET llama-fit-params)
add_executable(${TARGET} fit-params.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_TOOLS_INSTALL)

View File

@@ -1,6 +1,6 @@
set(TARGET llama-gguf-split)
add_executable(${TARGET} gguf-split.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_TOOLS_INSTALL)

View File

@@ -1,7 +1,10 @@
#include "llama.h"
#include "build-info.h"
#include "common.h"
#include "ggml.h"
#include "gguf.h"
#include "llama.h"
#include "common.h"
#include <algorithm>
#include <cinttypes>
@@ -101,8 +104,8 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
split_print_usage(argv[0]);
exit(0);
} else if (arg == "--version") {
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
fprintf(stderr, "version: %d (%s)\n", llama_build_number(), llama_commit());
fprintf(stderr, "built with %s for %s\n", llama_compiler(), llama_build_target());
exit(0);
} else if (arg == "--dry-run") {
arg_found = true;

View File

@@ -1,6 +1,6 @@
set(TARGET llama-imatrix)
add_executable(${TARGET} imatrix.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_TOOLS_INSTALL)

View File

@@ -1,6 +1,6 @@
set(TARGET llama-bench)
add_executable(${TARGET} llama-bench.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_TOOLS_INSTALL)

View File

@@ -19,6 +19,7 @@
#include <vector>
#include <unordered_set>
#include "build-info.h"
#include "common.h"
#include "download.h"
#include "ggml.h"
@@ -1624,8 +1625,8 @@ struct test {
}
};
const std::string test::build_commit = LLAMA_COMMIT;
const int test::build_number = LLAMA_BUILD_NUMBER;
const std::string test::build_commit = llama_commit();
const int test::build_number = llama_build_number();
struct printer {
virtual ~printer() {}

View File

@@ -86,12 +86,12 @@ if (TARGET BUILD_INFO)
add_dependencies(mtmd-helper BUILD_INFO)
endif()
# if mtmd is linked against common, we throw an error
# if mtmd is linked against llama-common, we throw an error
if (TARGET mtmd)
get_target_property(libs mtmd LINK_LIBRARIES)
if (libs AND "common" IN_LIST libs)
if (libs AND "llama-common" IN_LIST libs)
message(FATAL_ERROR "mtmd is designed to be a public library.\n"
"It must not link against common")
"It must not link against llama-common")
endif()
endif()
@@ -106,11 +106,11 @@ set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
if(LLAMA_TOOLS_INSTALL)
install(TARGETS ${TARGET} RUNTIME)
endif()
target_link_libraries (${TARGET} PRIVATE common mtmd Threads::Threads)
target_link_libraries (${TARGET} PRIVATE llama-common mtmd Threads::Threads)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
# mtmd-debug tool
add_executable(llama-mtmd-debug debug/mtmd-debug.cpp)
set_target_properties(llama-mtmd-debug PROPERTIES OUTPUT_NAME llama-mtmd-debug)
target_link_libraries(llama-mtmd-debug PRIVATE common mtmd Threads::Threads)
target_link_libraries(llama-mtmd-debug PRIVATE llama-common mtmd Threads::Threads)
target_compile_features(llama-mtmd-debug PRIVATE cxx_std_17)

View File

@@ -2,7 +2,7 @@ if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
# this tool is disabled on Windows when building with shared libraries because it uses internal functions not exported with LLAMA_API
set(TARGET llama-debug-template-parser)
add_executable(${TARGET} debug-template-parser.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_TOOLS_INSTALL)
@@ -12,7 +12,7 @@ endif()
set(TARGET llama-template-analysis)
add_executable(${TARGET} template-analysis.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_TOOLS_INSTALL)

View File

@@ -1,6 +1,6 @@
set(TARGET llama-perplexity)
add_executable(${TARGET} perplexity.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_TOOLS_INSTALL)

View File

@@ -1,6 +1,6 @@
set(TARGET llama-quantize)
add_executable(${TARGET} quantize.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,5 +1,8 @@
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include "common.h"
#include "gguf.h"
#include <algorithm>
@@ -709,7 +712,7 @@ int main(int argc, char ** argv) {
}
}
print_build_info();
llama_print_build_info();
if (params.dry_run) {
fprintf(stderr, "%s: calculating quantization size for '%s' as %s", __func__, fname_inp.c_str(), ftype_str.c_str());

View File

@@ -1,6 +1,6 @@
set(TARGET llama-results)
add_executable(${TARGET} results.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_TOOLS_INSTALL)

View File

@@ -23,7 +23,7 @@ endif()
target_include_directories(${TARGET} PRIVATE ../mtmd)
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
target_link_libraries(${TARGET} PUBLIC common mtmd ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PUBLIC llama-common mtmd ${CMAKE_THREAD_LIBS_INIT})
# llama-server executable
@@ -68,6 +68,6 @@ install(TARGETS ${TARGET} RUNTIME)
target_include_directories(${TARGET} PRIVATE ../mtmd)
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
target_link_libraries(${TARGET} PRIVATE server-context PUBLIC common cpp-httplib ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE server-context PUBLIC llama-common cpp-httplib ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -4,6 +4,7 @@
#include "server-task.h"
#include "server-queue.h"
#include "build-info.h"
#include "common.h"
#include "llama.h"
#include "log.h"
@@ -3009,7 +3010,7 @@ server_context_meta server_context::get_meta() const {
auto eos_token_str = eos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, eos_id, true) : "";
return server_context_meta {
/* build_info */ build_info,
/* build_info */ std::string(llama_build_info()),
/* model_name */ impl->model_name,
/* model_aliases */ impl->model_aliases,
/* model_tags */ impl->model_tags,

View File

@@ -1,6 +1,7 @@
#include "server-common.h"
#include "server-models.h"
#include "build-info.h"
#include "preset.h"
#include "download.h"
@@ -936,7 +937,7 @@ void server_models_routes::init_routes() {
{"n_ctx", 0},
}},
{"webui_settings", webui_settings},
{"build_info", build_info},
{"build_info", std::string(llama_build_info())},
});
return res;
}

View File

@@ -1,5 +1,6 @@
#include "server-task.h"
#include "build-info.h"
#include "chat.h"
#include "common.h"
#include "json-schema-to-grammar.h"
@@ -791,7 +792,7 @@ json server_task_result_cmpl_final::to_json_oaicompat() {
})},
{"created", t},
{"model", oaicompat_model},
{"system_fingerprint", build_info},
{"system_fingerprint", std::string(llama_build_info())},
{"object", "text_completion"},
{"usage", usage_json_oaicompat()},
{"id", oaicompat_cmpl_id}
@@ -839,7 +840,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat() {
{"choices", json::array({choice})},
{"created", t},
{"model", oaicompat_model},
{"system_fingerprint", build_info},
{"system_fingerprint", std::string(llama_build_info())},
{"object", "chat.completion"},
{"usage", usage_json_oaicompat()},
{"id", oaicompat_cmpl_id}
@@ -876,7 +877,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
{"created", t},
{"id", oaicompat_cmpl_id},
{"model", oaicompat_model},
{"system_fingerprint", build_info},
{"system_fingerprint", std::string(llama_build_info())},
{"object", "chat.completion.chunk"},
});
}
@@ -892,7 +893,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
{"created", t},
{"id", oaicompat_cmpl_id},
{"model", oaicompat_model},
{"system_fingerprint", build_info},
{"system_fingerprint", std::string(llama_build_info())},
{"object", "chat.completion.chunk"},
});
@@ -904,7 +905,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
{"created", t},
{"id", oaicompat_cmpl_id},
{"model", oaicompat_model},
{"system_fingerprint", build_info},
{"system_fingerprint", std::string(llama_build_info())},
{"object", "chat.completion.chunk"},
{"usage", usage_json_oaicompat()},
});
@@ -1469,7 +1470,7 @@ json server_task_result_cmpl_partial::to_json_oaicompat() {
})},
{"created", t},
{"model", oaicompat_model},
{"system_fingerprint", build_info},
{"system_fingerprint", std::string(llama_build_info())},
{"object", "text_completion"},
{"id", oaicompat_cmpl_id}
};
@@ -1506,7 +1507,7 @@ json server_task_result_cmpl_partial::to_json_oaicompat_chat() {
{"created", t},
{"id", oaicompat_cmpl_id},
{"model", oaicompat_model},
{"system_fingerprint", build_info},
{"system_fingerprint", std::string(llama_build_info())},
{"object", "chat.completion.chunk"},
});
};

View File

@@ -5,6 +5,7 @@
#include "server-tools.h"
#include "arg.h"
#include "build-info.h"
#include "common.h"
#include "llama.h"
#include "log.h"
@@ -108,7 +109,7 @@ int main(int argc, char ** argv) {
llama_backend_init();
llama_numa_init(params.numa);
LOG_INF("build_info: %s\n", build_info.c_str());
LOG_INF("build_info: %s\n", llama_build_info());
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
server_http_context ctx_http;

View File

@@ -3,5 +3,5 @@ add_executable(${TARGET} tokenize.cpp)
if(LLAMA_TOOLS_INSTALL)
install(TARGETS ${TARGET} RUNTIME)
endif()
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,6 +1,6 @@
set(TARGET llama-tts)
add_executable(${TARGET} tts.cpp)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama llama-common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_TOOLS_INSTALL)

View File

@@ -5,6 +5,8 @@ find_package(Threads REQUIRED)
llama_add_compile_flags()
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_library(${TARGET} STATIC httplib.cpp httplib.h)
# disable warnings in 3rd party code