mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-16 16:27:32 +03:00
Compare commits
15 Commits
b8789
...
copilot/pl
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4943e3a396 | ||
|
|
51b679a5d6 | ||
|
|
c00ac13fee | ||
|
|
3f3d62ffec | ||
|
|
8dc530b86d | ||
|
|
e1a9a6dcbe | ||
|
|
e39eba26f3 | ||
|
|
5d14e5d19b | ||
|
|
fae3a28070 | ||
|
|
c0de6eda72 | ||
|
|
707c0b7a6e | ||
|
|
1f30ac0cea | ||
|
|
f4b5bf2f32 | ||
|
|
aa0f1897b7 | ||
|
|
be76dd0bb2 |
@@ -7,7 +7,7 @@ RUN apt update && apt install -y git build-essential cmake wget xz-utils
|
||||
|
||||
# Install SSL and Vulkan SDK dependencies
|
||||
RUN apt install -y libssl-dev curl \
|
||||
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc
|
||||
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc spirv-headers
|
||||
|
||||
# Build it
|
||||
WORKDIR /app
|
||||
|
||||
108
.github/workflows/build-self-hosted.yml
vendored
108
.github/workflows/build-self-hosted.yml
vendored
@@ -141,61 +141,59 @@ jobs:
|
||||
# amd-smi static
|
||||
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
# TODO: sandbox Mac runners
|
||||
# ggml-ci-mac-metal:
|
||||
# runs-on: [self-hosted, macOS, ARM64]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
#
|
||||
# ggml-ci-mac-webgpu:
|
||||
# runs-on: [self-hosted, macOS, ARM64]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Dawn Dependency
|
||||
# id: dawn-depends
|
||||
# run: |
|
||||
# DAWN_VERSION="v2.0.0"
|
||||
# DAWN_OWNER="reeselevine"
|
||||
# DAWN_REPO="dawn"
|
||||
# DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
|
||||
# echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
|
||||
# curl -L -o artifact.zip \
|
||||
# "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
|
||||
# mkdir dawn
|
||||
# unzip artifact.zip
|
||||
# tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
|
||||
# bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
#
|
||||
# ggml-ci-mac-vulkan:
|
||||
# runs-on: [self-hosted, macOS, ARM64]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# vulkaninfo --summary
|
||||
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
ggml-ci-mac-metal:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-webgpu:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dawn Dependency
|
||||
id: dawn-depends
|
||||
run: |
|
||||
DAWN_VERSION="v20260317.182325"
|
||||
DAWN_OWNER="google"
|
||||
DAWN_REPO="dawn"
|
||||
DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-macos-latest-Release"
|
||||
echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
curl -L -o artifact.tar.gz \
|
||||
"https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
mkdir dawn
|
||||
tar -xvf artifact.tar.gz -C dawn --strip-components=1
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
|
||||
bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-vulkan:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-linux-intel-vulkan:
|
||||
runs-on: [self-hosted, Linux, Intel]
|
||||
|
||||
3
.github/workflows/build-vulkan.yml
vendored
3
.github/workflows/build-vulkan.yml
vendored
@@ -93,4 +93,5 @@ jobs:
|
||||
export GGML_VK_DISABLE_F16=1
|
||||
export GGML_VK_DISABLE_COOPMAT=1
|
||||
# This is using llvmpipe and runs slower than other backends
|
||||
ctest -L main --verbose --timeout 4800
|
||||
# test-backend-ops is too slow on llvmpipe, skip it
|
||||
ctest -L main -E test-backend-ops --verbose --timeout 900
|
||||
|
||||
2
.github/workflows/build.yml
vendored
2
.github/workflows/build.yml
vendored
@@ -318,7 +318,7 @@ jobs:
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev libssl-dev ninja-build
|
||||
sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev spirv-headers libssl-dev ninja-build
|
||||
echo "CC=gcc-14" >> "$GITHUB_ENV"
|
||||
echo "CXX=g++-14" >> "$GITHUB_ENV"
|
||||
|
||||
|
||||
99
.github/workflows/libllama-abi-check.yml
vendored
Normal file
99
.github/workflows/libllama-abi-check.yml
vendored
Normal file
@@ -0,0 +1,99 @@
|
||||
name: libllama ABI check
|
||||
|
||||
# Checks exported function signatures of libllama against a committed baseline
|
||||
# (scripts/libllama.abi) and determines whether a major (signatures
|
||||
# removed/changed) or minor (signatures added) version bump is required.
|
||||
#
|
||||
# The baseline is generated from include/llama.h by scripts/gen-libllama-abi.py.
|
||||
# To update the baseline after an intentional ABI change:
|
||||
#
|
||||
# python3 scripts/gen-libllama-abi.py include/llama.h > scripts/libllama.abi
|
||||
#
|
||||
# Then increment LLAMA_VERSION_MAJOR (breaking change) or LLAMA_VERSION_MINOR
|
||||
# (backwards-compatible addition) in CMakeLists.txt.
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths:
|
||||
- 'include/llama.h'
|
||||
- 'scripts/libllama.abi'
|
||||
- 'scripts/gen-libllama-abi.py'
|
||||
- 'CMakeLists.txt'
|
||||
- '.github/workflows/libllama-abi-check.yml'
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths:
|
||||
- 'include/llama.h'
|
||||
- 'scripts/libllama.abi'
|
||||
- 'scripts/gen-libllama-abi.py'
|
||||
- 'CMakeLists.txt'
|
||||
- '.github/workflows/libllama-abi-check.yml'
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
abi-check:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Extract current signatures
|
||||
run: |
|
||||
python3 scripts/gen-libllama-abi.py include/llama.h > /tmp/current.abi
|
||||
|
||||
- name: Compare with baseline
|
||||
id: compare
|
||||
run: |
|
||||
baseline=scripts/libllama.abi
|
||||
current=/tmp/current.abi
|
||||
|
||||
removed=$(comm -23 "$baseline" "$current")
|
||||
added=$(comm -13 "$baseline" "$current")
|
||||
|
||||
if [ -n "$removed" ]; then
|
||||
echo "bump=major" >> "$GITHUB_OUTPUT"
|
||||
echo "### :boom: MAJOR version bump required" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "The following exported signatures were **removed or changed** in libllama:" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo '```' >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "$removed" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo '```' >> "$GITHUB_STEP_SUMMARY"
|
||||
elif [ -n "$added" ]; then
|
||||
echo "bump=minor" >> "$GITHUB_OUTPUT"
|
||||
echo "### :sparkles: MINOR version bump required" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "The following new signatures were **added** to libllama:" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo '```' >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "$added" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo '```' >> "$GITHUB_STEP_SUMMARY"
|
||||
else
|
||||
echo "bump=patch" >> "$GITHUB_OUTPUT"
|
||||
echo "### :white_check_mark: No ABI change – PATCH version bump only" >> "$GITHUB_STEP_SUMMARY"
|
||||
fi
|
||||
|
||||
if [ -n "$removed" ] || [ -n "$added" ]; then
|
||||
echo "" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "Regenerate the baseline and bump the version:" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo '```' >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "python3 scripts/gen-libllama-abi.py include/llama.h > scripts/libllama.abi" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo '```' >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "Then increment \`LLAMA_VERSION_MAJOR\` (breaking) or \`LLAMA_VERSION_MINOR\` (additive) in \`CMakeLists.txt\`." >> "$GITHUB_STEP_SUMMARY"
|
||||
fi
|
||||
|
||||
- name: Fail on unacknowledged ABI change
|
||||
if: steps.compare.outputs.bump == 'major' || steps.compare.outputs.bump == 'minor'
|
||||
run: |
|
||||
echo "ABI change detected. Run: python3 scripts/gen-libllama-abi.py include/llama.h > scripts/libllama.abi"
|
||||
echo "Then bump LLAMA_VERSION_MAJOR (breaking) or LLAMA_VERSION_MINOR (additive) in CMakeLists.txt."
|
||||
exit 1
|
||||
|
||||
2
.github/workflows/release.yml
vendored
2
.github/workflows/release.yml
vendored
@@ -202,7 +202,7 @@ jobs:
|
||||
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libssl-dev
|
||||
else
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev libssl-dev ninja-build
|
||||
sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev spirv-headers libssl-dev ninja-build
|
||||
echo "CC=gcc-14" >> "$GITHUB_ENV"
|
||||
echo "CXX=g++-14" >> "$GITHUB_ENV"
|
||||
fi
|
||||
|
||||
77
.github/workflows/server-self-hosted.yml
vendored
77
.github/workflows/server-self-hosted.yml
vendored
@@ -84,41 +84,42 @@ jobs:
|
||||
export ${{ matrix.extra_args }}
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
server-cuda:
|
||||
runs-on: [self-hosted, llama-server, Linux, NVIDIA]
|
||||
|
||||
name: server-cuda (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
matrix:
|
||||
build_type: [Release]
|
||||
wf_name: ["GPUx1"]
|
||||
include:
|
||||
- build_type: Release
|
||||
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
wf_name: "GPUx1, backend-sampling"
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
export ${{ matrix.extra_args }}
|
||||
pytest -v -x -m "not slow"
|
||||
# TODO: provision CUDA runner
|
||||
# server-cuda:
|
||||
# runs-on: [self-hosted, llama-server, Linux, NVIDIA]
|
||||
#
|
||||
# name: server-cuda (${{ matrix.wf_name }})
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build_type: [Release]
|
||||
# wf_name: ["GPUx1"]
|
||||
# include:
|
||||
# - build_type: Release
|
||||
# extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
# wf_name: "GPUx1, backend-sampling"
|
||||
# fail-fast: false
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
# with:
|
||||
# fetch-depth: 0
|
||||
# ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: |
|
||||
# cmake -B build -DGGML_SCHED_NO_REALLOC=ON
|
||||
# cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
|
||||
#
|
||||
# - name: Tests
|
||||
# id: server_integration_tests
|
||||
# if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
# run: |
|
||||
# cd tools/server/tests
|
||||
# python3 -m venv venv
|
||||
# source venv/bin/activate
|
||||
# pip install -r requirements.txt
|
||||
# export ${{ matrix.extra_args }}
|
||||
# pytest -v -x -m "not slow"
|
||||
|
||||
@@ -127,7 +127,13 @@ endif()
|
||||
if (NOT DEFINED LLAMA_BUILD_COMMIT)
|
||||
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
|
||||
endif()
|
||||
set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})
|
||||
if (NOT DEFINED LLAMA_VERSION_MAJOR)
|
||||
set(LLAMA_VERSION_MAJOR 0)
|
||||
endif()
|
||||
if (NOT DEFINED LLAMA_VERSION_MINOR)
|
||||
set(LLAMA_VERSION_MINOR 0)
|
||||
endif()
|
||||
set(LLAMA_INSTALL_VERSION ${LLAMA_VERSION_MAJOR}.${LLAMA_VERSION_MINOR}.${LLAMA_BUILD_NUMBER})
|
||||
|
||||
# override ggml options
|
||||
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
|
||||
|
||||
@@ -198,10 +198,19 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
|
||||
args_field = format.function_field + "." + args_field;
|
||||
}
|
||||
|
||||
auto tools_parser = p.standard_json_tools(
|
||||
format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
|
||||
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
|
||||
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
|
||||
auto tools_parser = p.eps();
|
||||
if (format.section_start.empty() && !format.per_call_start.empty()) {
|
||||
auto single_tool_parser = p.standard_json_tools(
|
||||
format.per_call_start, format.per_call_end, inputs.tools, inputs.parallel_tool_calls,
|
||||
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
|
||||
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
|
||||
tools_parser = p.trigger_rule("tool-calls", p.one_or_more(single_tool_parser + p.space()));
|
||||
} else {
|
||||
tools_parser = p.standard_json_tools(
|
||||
format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
|
||||
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
|
||||
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
|
||||
}
|
||||
|
||||
// Handle content wrappers if present
|
||||
if (ctx.content && ctx.content->is_always_wrapped()) {
|
||||
|
||||
@@ -308,19 +308,23 @@ struct analyze_tools : analyze_base {
|
||||
|
||||
private:
|
||||
// Extract tool calling 'haystack' for further analysis and delegate further analysis based on format
|
||||
void analyze_tool_calls(const analyze_reasoning & reasoning);
|
||||
void analyze_tool_calls(const analyze_reasoning & reasoning, bool supports_parallel_tool_calls);
|
||||
|
||||
// Analyze format based on position of function and argument name in needle
|
||||
void analyze_tool_call_format(const std::string & haystack,
|
||||
const std::string & fun_name_needle,
|
||||
const std::string & arg_name_needle,
|
||||
const analyze_reasoning & reasoning);
|
||||
const analyze_reasoning & reasoning,
|
||||
bool supports_parallel_tool_calls);
|
||||
|
||||
// Analyze specifics of JSON native format (entire tool call is a JSON object)
|
||||
void analyze_tool_call_format_json_native(const std::string & clean_haystack,
|
||||
const std::string & fun_name_needle,
|
||||
const std::string & arg_name_needle);
|
||||
|
||||
// Check if parallel calls in JSON native format array wrapped or tag wrapped
|
||||
void analyze_json_native_parallel_calls();
|
||||
|
||||
// Analyze specifics of non-JSON native format (tags for function name or for function name and arguments)
|
||||
void analyze_tool_call_format_non_json(const std::string & clean_haystack,
|
||||
const std::string & fun_name_needle);
|
||||
|
||||
@@ -558,7 +558,7 @@ analyze_tools::analyze_tools(const common_chat_template & tmpl,
|
||||
: analyze_base(tmpl) {
|
||||
LOG_DBG(ANSI_ORANGE "Phase 3: Tool call analysis\n" ANSI_RESET);
|
||||
|
||||
analyze_tool_calls(reasoning);
|
||||
analyze_tool_calls(reasoning, caps.supports_parallel_tool_calls);
|
||||
|
||||
if (format.mode != tool_format::NONE && format.mode != tool_format::JSON_NATIVE) {
|
||||
if (caps.supports_parallel_tool_calls) {
|
||||
@@ -577,7 +577,7 @@ analyze_tools::analyze_tools(const common_chat_template & tmpl,
|
||||
}
|
||||
}
|
||||
|
||||
void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning) {
|
||||
void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning, bool supports_parallel_tool_calls) {
|
||||
json assistant_no_tools = json{
|
||||
{ "role", "assistant" },
|
||||
{ "content", ASSISTANT_MSG }
|
||||
@@ -611,13 +611,14 @@ void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning) {
|
||||
return;
|
||||
}
|
||||
|
||||
analyze_tool_call_format(tool_section, FUN_FIRST, ARG_FIRST, reasoning);
|
||||
analyze_tool_call_format(tool_section, FUN_FIRST, ARG_FIRST, reasoning, supports_parallel_tool_calls);
|
||||
}
|
||||
|
||||
void analyze_tools::analyze_tool_call_format(const std::string & haystack,
|
||||
const std::string & fun_name_needle,
|
||||
const std::string & arg_name_needle,
|
||||
const analyze_reasoning & reasoning) {
|
||||
const analyze_reasoning & reasoning,
|
||||
bool supports_parallel_tool_calls) {
|
||||
if (fun_name_needle.empty() || arg_name_needle.empty() || haystack.empty()) {
|
||||
return;
|
||||
}
|
||||
@@ -660,6 +661,9 @@ void analyze_tools::analyze_tool_call_format(const std::string & haystack,
|
||||
|
||||
if (format.mode == tool_format::JSON_NATIVE) {
|
||||
analyze_tool_call_format_json_native(clean_haystack, fun_name_needle, arg_name_needle);
|
||||
if (supports_parallel_tool_calls) {
|
||||
analyze_json_native_parallel_calls();
|
||||
}
|
||||
} else {
|
||||
analyze_tool_call_format_non_json(clean_haystack, fun_name_needle);
|
||||
}
|
||||
@@ -668,6 +672,42 @@ void analyze_tools::analyze_tool_call_format(const std::string & haystack,
|
||||
format.per_call_end = trim_whitespace(format.per_call_end);
|
||||
}
|
||||
|
||||
void analyze_tools::analyze_json_native_parallel_calls() {
|
||||
json assistant_one_tool = json{
|
||||
{ "role", "assistant" },
|
||||
{ "content", "" },
|
||||
{ "tool_calls", json::array({ first_tool_call }) }
|
||||
};
|
||||
|
||||
json assistant_two_tools = json{
|
||||
{ "role", "assistant" },
|
||||
{ "content", "" },
|
||||
{ "tool_calls", json::array({ first_tool_call, second_tool_call }) }
|
||||
};
|
||||
|
||||
template_params params;
|
||||
params.messages = json::array({ user_msg, assistant_one_tool });
|
||||
params.tools = tools;
|
||||
params.add_generation_prompt = false;
|
||||
params.enable_thinking = true;
|
||||
|
||||
auto comparison = compare_variants(
|
||||
*tmpl, params, [&](template_params & p) { p.messages = json::array({ user_msg, assistant_two_tools }); });
|
||||
|
||||
if (!comparison) {
|
||||
LOG_DBG(ANSI_ORANGE "%s: Template application failed\n" ANSI_RESET, __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
std::string & second_call = comparison->diff.right;
|
||||
if (!format.section_start.empty() && second_call.find(format.section_start) != std::string::npos) {
|
||||
format.per_call_start = format.section_start;
|
||||
format.per_call_end = format.section_end;
|
||||
format.section_start.clear();
|
||||
format.section_end.clear();
|
||||
}
|
||||
}
|
||||
|
||||
void analyze_tools::analyze_tool_call_format_json_native(const std::string & clean_haystack,
|
||||
const std::string & fun_name_needle,
|
||||
const std::string & arg_name_needle) {
|
||||
|
||||
@@ -676,7 +676,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
|
||||
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
|
||||
|
||||
auto nested_name = literal("\"" + nested_name_field + "\"") + space() + literal(":") + space() +
|
||||
literal("\"") + tool_name(literal(name)) + literal("\"");
|
||||
atomic(literal("\"") + tool_name(literal(name)) + literal("\""));
|
||||
auto nested_args = literal("\"" + nested_args_field + "\"") + space() + literal(":") + space() +
|
||||
tool_args(schema(json(), "tool-" + name + "-schema", params));
|
||||
|
||||
@@ -744,7 +744,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
|
||||
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
|
||||
|
||||
auto tool_name_ = name_key_parser + space() + literal(":") + space() +
|
||||
literal("\"") + tool_name(literal(name)) + literal("\"");
|
||||
atomic(literal("\"") + tool_name(literal(name)) + literal("\""));
|
||||
auto tool_args_ = args_key_parser + space() + literal(":") + space() +
|
||||
tool_args(schema(json(), "tool-" + name + "-schema", params));
|
||||
|
||||
|
||||
@@ -456,7 +456,8 @@ pacman -S git \
|
||||
mingw-w64-ucrt-x86_64-gcc \
|
||||
mingw-w64-ucrt-x86_64-cmake \
|
||||
mingw-w64-ucrt-x86_64-vulkan-devel \
|
||||
mingw-w64-ucrt-x86_64-shaderc
|
||||
mingw-w64-ucrt-x86_64-shaderc \
|
||||
mingw-w64-ucrt-x86_64-spirv-headers
|
||||
```
|
||||
|
||||
Switch into the `llama.cpp` directory and build using CMake.
|
||||
@@ -490,9 +491,11 @@ First, follow the official LunarG instructions for the installation and setup of
|
||||
|
||||
On Debian / Ubuntu, you can install the required dependencies using:
|
||||
```sh
|
||||
sudo apt-get install libvulkan-dev glslc
|
||||
sudo apt-get install libvulkan-dev glslc spirv-headers
|
||||
```
|
||||
|
||||
SPIRV-Headers (`spirv/unified1/spirv.hpp`) are required for the Vulkan backend and are **not** always pulled in by the Vulkan loader dev package alone. Other distros use names such as `spirv-headers` (Ubuntu / Debian / Arch), or `spirv-headers-devel` (Fedora / openSUSE). On Windows, the LunarG Vulkan SDK’s `Include` directory already contains these headers.
|
||||
|
||||
#### Common steps
|
||||
|
||||
Second, after verifying that you have followed all of the SDK installation/setup steps, use this command to make sure before proceeding:
|
||||
|
||||
@@ -602,8 +602,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
int n_input = input_tokens.size();
|
||||
|
||||
if (n_input >= params.n_ctx) {
|
||||
LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, params.n_ctx);
|
||||
if (static_cast<uint32_t>(n_input) >= llama_n_ctx(ctx)) {
|
||||
LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, llama_n_ctx(ctx));
|
||||
llama_free(ctx);
|
||||
llama_model_free(model);
|
||||
return 1;
|
||||
|
||||
@@ -348,6 +348,53 @@ extern "C" {
|
||||
// Set a callback to be called for each resulting node during graph compute
|
||||
GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
|
||||
|
||||
//
|
||||
// Meta backend
|
||||
//
|
||||
|
||||
#define GGML_BACKEND_META_MAX_DEVICES 16
|
||||
|
||||
enum ggml_backend_meta_split_axis {
|
||||
// tensor split by tensor dimensions:
|
||||
GGML_BACKEND_SPLIT_AXIS_0 = 0,
|
||||
GGML_BACKEND_SPLIT_AXIS_1 = 1,
|
||||
GGML_BACKEND_SPLIT_AXIS_2 = 2,
|
||||
GGML_BACKEND_SPLIT_AXIS_3 = 3,
|
||||
|
||||
GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
|
||||
GGML_BACKEND_SPLIT_AXIS_PARTIAL = 11, // each backend has a partial sum
|
||||
|
||||
// for internal bookkeeping only:
|
||||
GGML_BACKEND_SPLIT_AXIS_NONE = 98,
|
||||
GGML_BACKEND_SPLIT_AXIS_UNKNOWN = 99,
|
||||
};
|
||||
GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis);
|
||||
|
||||
struct ggml_backend_meta_split_state {
|
||||
enum ggml_backend_meta_split_axis axis;
|
||||
|
||||
// for tensors with axis >= 0 && axis < GGML_MAX_DIMS:
|
||||
// - each device has a slice of the tensor along the split axis
|
||||
// - most tensors have n_segments == 1 and a contiguous slice of the tensor data
|
||||
// - some tensors have an inhomogenenous data layout along the split axis,
|
||||
// those tensors are divided into segments which are each individually split across devices
|
||||
// - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis,
|
||||
// the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1],
|
||||
// - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
|
||||
// that each need to be split individually across devices so that each device gets a slice of Q, K, and V
|
||||
int64_t ne[16*GGML_BACKEND_META_MAX_DEVICES];
|
||||
uint32_t n_segments;
|
||||
};
|
||||
|
||||
// function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible:
|
||||
typedef struct ggml_backend_meta_split_state(*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);
|
||||
|
||||
// create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
|
||||
// TODO: this looks a bit strange - a backend API creates a device. I think we should try
|
||||
// express this as a backend registry functionality instead
|
||||
GGML_API ggml_backend_dev_t ggml_backend_meta_device(
|
||||
ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
|
||||
|
||||
//
|
||||
// Utils
|
||||
//
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <limits.h>
|
||||
#include <stdarg.h>
|
||||
|
||||
@@ -5,9 +5,6 @@
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
// TODO: tmp
|
||||
#include "ggml-ext.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
|
||||
@@ -1,56 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
// This is a "staging" header for new ggml API
|
||||
// It is not publicly available and it should not be used by 3rd party projects
|
||||
//
|
||||
// When the API matures enough, it will be moved to the official public API
|
||||
|
||||
//
|
||||
// Meta backend
|
||||
//
|
||||
|
||||
#define GGML_BACKEND_META_MAX_DEVICES 16
|
||||
|
||||
enum ggml_backend_meta_split_axis {
|
||||
// tensor split by tensor dimensions:
|
||||
GGML_BACKEND_SPLIT_AXIS_0 = 0,
|
||||
GGML_BACKEND_SPLIT_AXIS_1 = 1,
|
||||
GGML_BACKEND_SPLIT_AXIS_2 = 2,
|
||||
GGML_BACKEND_SPLIT_AXIS_3 = 3,
|
||||
|
||||
GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
|
||||
GGML_BACKEND_SPLIT_AXIS_PARTIAL = 11, // each backend has a partial sum
|
||||
|
||||
// for internal bookkeeping only:
|
||||
GGML_BACKEND_SPLIT_AXIS_NONE = 98,
|
||||
GGML_BACKEND_SPLIT_AXIS_UNKNOWN = 99,
|
||||
};
|
||||
GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis);
|
||||
|
||||
struct ggml_backend_meta_split_state {
|
||||
enum ggml_backend_meta_split_axis axis;
|
||||
|
||||
// for tensors with axis >= 0 && axis < GGML_MAX_DIMS:
|
||||
// - each device has a slice of the tensor along the split axis
|
||||
// - most tensors have n_segments == 1 and a contiguous slice of the tensor data
|
||||
// - some tensors have an inhomogenenous data layout along the split axis,
|
||||
// those tensors are divided into segments which are each individually split across devices
|
||||
// - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis,
|
||||
// the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1],
|
||||
// - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
|
||||
// that each need to be split individually across devices so that each device gets a slice of Q, K, and V
|
||||
int64_t ne[16*GGML_BACKEND_META_MAX_DEVICES];
|
||||
uint32_t n_segments;
|
||||
};
|
||||
|
||||
// function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible:
|
||||
typedef struct ggml_backend_meta_split_state(*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);
|
||||
|
||||
// create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
|
||||
// TODO: this looks a bit strange - a backend API creates a device. I think we should try
|
||||
// express this as a backend registry functionality instead
|
||||
GGML_API ggml_backend_dev_t ggml_backend_meta_device(
|
||||
ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
|
||||
@@ -47,6 +47,7 @@ list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx)
|
||||
|
||||
if (_hmx_idx GREATER_EQUAL 0)
|
||||
target_sources(${HTP_LIB} PRIVATE
|
||||
hmx-queue.c
|
||||
hmx-matmul-ops.c
|
||||
)
|
||||
|
||||
|
||||
@@ -31,6 +31,14 @@ static inline uint64_t hex_get_pktcnt() {
|
||||
return pktcnt;
|
||||
}
|
||||
|
||||
static inline uint32_t hex_ceil_pow2(uint32_t x) {
|
||||
if (x <= 1) { return 1; }
|
||||
int p = 2;
|
||||
x--;
|
||||
while (x >>= 1) { p <<= 1; }
|
||||
return p;
|
||||
}
|
||||
|
||||
static inline size_t hmx_ceil_div(size_t num, size_t den) {
|
||||
return (num + den - 1) / den;
|
||||
}
|
||||
@@ -73,8 +81,7 @@ static inline void hex_l2fetch(const void * p, uint32_t width, uint32_t stride,
|
||||
#define HEX_L2_LINE_SIZE 64
|
||||
#define HEX_L2_FLUSH_SIZE (128 * 1024)
|
||||
|
||||
static inline void hex_l2flush(void * addr, size_t size)
|
||||
{
|
||||
static inline void hex_l2flush(void * addr, size_t size) {
|
||||
if (size > HEX_L2_FLUSH_SIZE) {
|
||||
qurt_mem_cache_clean((qurt_addr_t) 0, 0, QURT_MEM_CACHE_FLUSH_INVALIDATE_ALL, QURT_MEM_DCACHE);
|
||||
} else {
|
||||
@@ -89,4 +96,8 @@ static inline void hex_l2flush(void * addr, size_t size)
|
||||
}
|
||||
}
|
||||
|
||||
static inline void hex_pause() {
|
||||
asm volatile(" pause(#255)\n");
|
||||
}
|
||||
|
||||
#endif /* HEX_UTILS_H */
|
||||
|
||||
@@ -16,14 +16,16 @@
|
||||
#include "ggml-common.h"
|
||||
|
||||
#include "hex-dma.h"
|
||||
#include "worker-pool.h"
|
||||
|
||||
#include "hvx-utils.h"
|
||||
#include "hvx-dump.h"
|
||||
#include "worker-pool.h"
|
||||
#include "htp-ctx.h"
|
||||
#include "htp-ops.h"
|
||||
|
||||
#include "hmx-utils.h"
|
||||
#include "hmx-ops.h"
|
||||
#include "hmx-utils.h"
|
||||
#include "hmx-queue.h"
|
||||
#include "hmx-profile.h"
|
||||
|
||||
static const __fp16 q4_0_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
|
||||
@@ -47,7 +49,8 @@ static const __fp16 iq4_nl_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
|
||||
static const int32_t weight_transpose_scatter_offsets[32] __attribute__((aligned(VLEN))) = {
|
||||
0*128, 1*128, 2*128, 3*128, 4*128, 5*128, 6*128, 7*128,
|
||||
8*128, 9*128, 10*128, 11*128, 12*128, 13*128, 14*128, 15*128,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
16*128, 17*128, 18*128, 19*128, 20*128, 21*128, 22*128, 23*128,
|
||||
24*128, 25*128, 26*128, 27*128, 28*128, 29*128, 30*128, 31*128
|
||||
};
|
||||
|
||||
// Scales per x4x2 logical block: 8 × sizeof(__fp16) = 16 bytes
|
||||
@@ -109,36 +112,45 @@ static inline bool hmx_add_overflow(size_t a, size_t b, size_t *out) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Search for optimal (mc, nc) chunk sizes that maximize mc * nc within VTCM budget.
|
||||
// Search for optimal (mc, nc) chunk sizes within VTCM budget.
|
||||
//
|
||||
// Cost model: total = nc * per_n_cost + mc * per_m_cost + mc * nc * per_mn_cost + overhead
|
||||
// per_n_cost: bytes per nc column (weight + scratch buffers)
|
||||
// per_m_cost: bytes per mc row (activation)
|
||||
// per_mn_cost: bytes per mc*nc element (output)
|
||||
// overhead: fixed bytes (scales 256B, eye_tile 2048B, etc.)
|
||||
// VTCM model: nc * per_n_cost + mc * per_m_cost + mc * nc * per_mn_cost + overhead
|
||||
//
|
||||
// Minimize ceil(m/mc) * m_block_cost + ceil(n/nc) * n_block_cost.
|
||||
// All matmul paths repeat weight processing per M-block and activation loading
|
||||
// per N-block, so discrete block counts drive total overhead.
|
||||
// Tie-break: when cost is equal, prefer larger mc * nc.
|
||||
//
|
||||
// Caller-provided coefficients:
|
||||
// m_block_cost: penalty per extra M-block (weight redundancy, scales with n).
|
||||
// n_block_cost: penalty per extra N-block (activation redundancy, scales with m).
|
||||
//
|
||||
// Algorithm: nc sweeps from n_max down by 32, analytically solving for mc_max.
|
||||
// Returns 0 on success, -1 if VTCM is insufficient.
|
||||
static int hmx_compute_chunks(
|
||||
size_t vtcm_total, size_t overhead,
|
||||
size_t per_n_cost, size_t per_m_cost, size_t per_mn_cost,
|
||||
int m, int n,
|
||||
size_t *m_chunk_out, size_t *n_chunk_out,
|
||||
size_t *total_out)
|
||||
{
|
||||
static int hmx_compute_chunks(size_t vtcm_total,
|
||||
size_t overhead,
|
||||
size_t per_n_cost,
|
||||
size_t per_m_cost,
|
||||
size_t per_mn_cost,
|
||||
int m,
|
||||
int n,
|
||||
size_t m_block_cost,
|
||||
size_t n_block_cost,
|
||||
size_t * m_chunk_out,
|
||||
size_t * n_chunk_out,
|
||||
size_t * total_out) {
|
||||
if (m <= 0 || n <= 0) return -1;
|
||||
if (vtcm_total <= overhead) return -1;
|
||||
if (per_n_cost == 0 || per_m_cost == 0 || per_mn_cost == 0) return -1;
|
||||
|
||||
const size_t usable = vtcm_total - overhead;
|
||||
size_t best_mn = 0, best_m = 0, best_n = 0;
|
||||
|
||||
size_t best_cost = SIZE_MAX;
|
||||
size_t best_mn = 0;
|
||||
size_t best_m = 0, best_n = 0;
|
||||
|
||||
const size_t n_max = hex_align_down((size_t)n, HMX_FP16_TILE_N_COLS);
|
||||
for (size_t nc = n_max; nc >= HMX_FP16_TILE_N_COLS; nc -= HMX_FP16_TILE_N_COLS) {
|
||||
// Early exit: if nc * m_max cannot beat best, smaller nc won't either
|
||||
if (nc * hex_align_down((size_t)m, HMX_FP16_TILE_N_ROWS) <= best_mn)
|
||||
break;
|
||||
|
||||
size_t n_fixed = 0, ncmn = 0, mc_denom = 0;
|
||||
if (hmx_mul_overflow(nc, per_n_cost, &n_fixed)) continue;
|
||||
if (n_fixed >= usable) goto next_nc;
|
||||
@@ -152,10 +164,19 @@ static int hmx_compute_chunks(
|
||||
mc = hex_align_down(mc, HMX_FP16_TILE_N_ROWS);
|
||||
mc = hex_smin(mc, (size_t)m);
|
||||
|
||||
if (mc > 0 && mc * nc > best_mn) {
|
||||
best_mn = mc * nc;
|
||||
best_m = mc;
|
||||
best_n = nc;
|
||||
if (mc == 0) {
|
||||
goto next_nc;
|
||||
}
|
||||
|
||||
size_t mblocks = ((size_t) m + mc - 1) / mc;
|
||||
size_t nblocks = ((size_t) n + nc - 1) / nc;
|
||||
size_t cost = mblocks * m_block_cost + nblocks * n_block_cost;
|
||||
size_t mn = mc * nc;
|
||||
if (cost < best_cost || (cost == best_cost && mn > best_mn)) {
|
||||
best_cost = cost;
|
||||
best_mn = mn;
|
||||
best_m = mc;
|
||||
best_n = nc;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -233,7 +254,7 @@ static inline HVX_Vector dequantize_x4x2_q4_0_group_hvx(
|
||||
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
||||
HVX_Vector v_scales = hvx_vec_splat_f16(*scale);
|
||||
// q4x4x2 stores two int4 values per byte. Keep only the selected nibble.
|
||||
HVX_Vector v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq;
|
||||
HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
|
||||
v_quants = Q6_V_vand_VV(v_quants, mask_h4);
|
||||
// Shuffle before LUT
|
||||
v_quants = Q6_Vb_vshuff_Vb(v_quants);
|
||||
@@ -257,7 +278,7 @@ static inline void dequantize_x4x2_q4_0_x4groups_hvx(
|
||||
// Load all 128 packed bytes (4 contiguous 32-byte groups)
|
||||
HVX_Vector vq = hvx_vmemu(packed_128);
|
||||
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
||||
HVX_Vector v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq;
|
||||
HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
|
||||
v_quants = Q6_V_vand_VV(v_quants, mask_h4);
|
||||
|
||||
// Shuffle before LUT
|
||||
@@ -277,10 +298,8 @@ static inline void dequantize_x4x2_q4_0_x4groups_hvx(
|
||||
v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));
|
||||
|
||||
// Extract individual groups: scatter uses q_mask64 so only first 64 bytes matter
|
||||
out[0] = v_lo; // group0 already in [0:63]
|
||||
out[1] = Q6_V_vror_VR(v_lo, 64); // group1 rotated to [0:63]
|
||||
out[2] = v_hi; // group2 already in [0:63]
|
||||
out[3] = Q6_V_vror_VR(v_hi, 64); // group3 rotated to [0:63]
|
||||
out[0] = v_lo; // group0 already in [0:63]
|
||||
out[1] = v_hi; // group2 already in [0:63]
|
||||
}
|
||||
|
||||
// Dequantize one x4x2 Q8_0 group (32 int8 quants) -> 32 FP16 in first 64 bytes.
|
||||
@@ -384,8 +403,9 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
|
||||
size_t row_stride, int weight_type,
|
||||
int start_tile, int end_tile) {
|
||||
|
||||
const int n_k_tiles = k_block / HMX_FP16_TILE_N_COLS;
|
||||
const int qrow_size = (weight_type == HTP_TYPE_Q8_0) ? k_block : (k_block / 2);
|
||||
const int n_k_tiles = (unsigned)k_block / HMX_FP16_TILE_N_COLS;
|
||||
const bool is_q4 = (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL);
|
||||
const int qrow_size = is_q4 ? ((unsigned)k_block / 2) : k_block;
|
||||
|
||||
const HVX_Vector vlut_cvt = (weight_type == HTP_TYPE_IQ4_NL) ? hvx_vmem(iq4_nl_to_fp16_lut) :
|
||||
(weight_type == HTP_TYPE_MXFP4) ? hvx_vmem(mxfp4_to_fp16_lut) :
|
||||
@@ -398,47 +418,46 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
|
||||
const HVX_Vector v_scat_step = Q6_V_vsplat_R(4); // 4 bytes = 1 column step
|
||||
const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64); // first 16 words (64 bytes)
|
||||
|
||||
for (int t = start_tile; t < end_tile; ) {
|
||||
int ct = t / n_k_tiles; // column tile index
|
||||
int kt = t % n_k_tiles; // K tile index
|
||||
unsigned ct = (unsigned)start_tile / n_k_tiles; // column tile index
|
||||
unsigned kt = (unsigned)start_tile % n_k_tiles; // K tile index
|
||||
for (unsigned t = start_tile; t < end_tile; ) {
|
||||
if (kt >= n_k_tiles) { kt = 0; ct++; }
|
||||
|
||||
// --- Batch-4 fast path for Q4_0/IQ4_NL: process 4 contiguous K-tiles with one vlut16 per row ---
|
||||
if ((weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL) && (kt % 4 == 0) && (t + 4 <= end_tile) &&
|
||||
((t + 3) / n_k_tiles == ct)) {
|
||||
int blk_idx = (kt * 32) / QK_Q4_0x4x2;
|
||||
int sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32; // 0 or 4
|
||||
bool upper = (sub_blk_base >= 4);
|
||||
int packed_off = blk_idx * (QK_Q4_0x4x2 / 2); // 128 contiguous packed bytes
|
||||
int scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE
|
||||
+ sub_blk_base * (int)sizeof(__fp16); // 4 consecutive scales
|
||||
// --- Batch-4 fast path for Q4: process 4 contiguous K-tiles with one vlut16 per row ---
|
||||
if (is_q4 && (kt % 4 == 0) && (t + 4 <= end_tile) && ((t + 3) / n_k_tiles == ct)) {
|
||||
unsigned blk_idx = (kt * 32) / QK_Q4_0x4x2;
|
||||
unsigned sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32; // 0 or 4
|
||||
bool upper = (sub_blk_base >= 4);
|
||||
unsigned packed_off = blk_idx * (QK_Q4_0x4x2 / 2); // 128 contiguous packed bytes
|
||||
unsigned scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE
|
||||
+ sub_blk_base * (int)sizeof(__fp16); // 4 consecutive scales
|
||||
|
||||
__fp16 *tile_bases[4];
|
||||
for (int g = 0; g < 4; g++) { tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS; }
|
||||
for (unsigned g = 0; g < 4; g++) { tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS; }
|
||||
|
||||
HVX_Vector v_off = v_scat_base;
|
||||
for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
|
||||
int row0 = ct * HMX_FP16_TILE_N_COLS + r;
|
||||
int row1 = row0 + 1;
|
||||
const uint8_t *r0 = vtcm_src + row0 * row_stride;
|
||||
const uint8_t *r1 = vtcm_src + row1 * row_stride;
|
||||
|
||||
HVX_Vector v0[4], v1[4];
|
||||
unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride;
|
||||
unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;
|
||||
|
||||
for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
|
||||
HVX_Vector v0[2];
|
||||
const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
|
||||
dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt, v0);
|
||||
if (row1 < n_cols) {
|
||||
dequantize_x4x2_q4_0_x4groups_hvx(r1 + packed_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt, v1);
|
||||
} else {
|
||||
v1[0] = v1[1] = v1[2] = v1[3] = Q6_V_vzero();
|
||||
}
|
||||
|
||||
for (int g = 0; g < 4; g++) { Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v0[g]); }
|
||||
Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[0]);
|
||||
Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[1]);
|
||||
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
|
||||
for (int g = 0; g < 4; g++) { Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v1[g]); }
|
||||
|
||||
|
||||
r0 = vtcm_src + row_offset; row_offset += row_stride;
|
||||
dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt, v0);
|
||||
Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[0]);
|
||||
Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[1]);
|
||||
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
|
||||
}
|
||||
|
||||
for (int g = 0; g < 4; g++) { (void) *(volatile HVX_Vector *)(tile_bases[g]); }
|
||||
|
||||
t += 4;
|
||||
t += 4; kt += 4;
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -495,20 +514,19 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
|
||||
// --- Single-tile fallback ---
|
||||
__fp16 *tile_base = vtcm_dst + t * HMX_FP16_TILE_N_ELMS;
|
||||
|
||||
if (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL) {
|
||||
int blk_idx = (kt * 32) / QK_Q4_0x4x2;
|
||||
int sub_blk = ((kt * 32) % QK_Q4_0x4x2) / 32;
|
||||
bool upper = (sub_blk >= 4);
|
||||
int byte_off = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32;
|
||||
int scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16);
|
||||
if (is_q4) {
|
||||
unsigned blk_idx = (kt * 32) / QK_Q4_0x4x2;
|
||||
unsigned sub_blk = ((kt * 32) % QK_Q4_0x4x2) / 32;
|
||||
bool upper = (sub_blk >= 4);
|
||||
unsigned byte_off = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32;
|
||||
unsigned scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16);
|
||||
|
||||
HVX_Vector v_off = v_scat_base; // reset to column 0
|
||||
for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
|
||||
int row0 = ct * HMX_FP16_TILE_N_COLS + r;
|
||||
int row1 = row0 + 1;
|
||||
|
||||
const uint8_t *r0 = vtcm_src + row0 * row_stride;
|
||||
const uint8_t *r1 = vtcm_src + row1 * row_stride;
|
||||
unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride;
|
||||
unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;
|
||||
for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
|
||||
const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
|
||||
const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;
|
||||
|
||||
HVX_Vector v0 = dequantize_x4x2_q4_0_group_hvx(
|
||||
r0 + byte_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
|
||||
@@ -585,7 +603,7 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
|
||||
}
|
||||
(void) *(volatile HVX_Vector *)(tile_base);
|
||||
}
|
||||
++t;
|
||||
++t; ++kt;
|
||||
}
|
||||
|
||||
// Drain HVX scatter write buffer: a vmem load on the same HW thread retires
|
||||
@@ -653,9 +671,13 @@ static void dequantize_x4x2_weight_chunk_to_fp16_tiles(
|
||||
// --- End x4x2 dequantizers ---
|
||||
|
||||
// requires external HMX lock
|
||||
static void core_dot_chunk_fp16(__fp16 *output, const __fp16 *activation, const __fp16 *weight, const __fp16 *scales,
|
||||
static void core_dot_chunk_fp16(__fp16 *restrict output, const __fp16 *restrict activation, const __fp16 *restrict weight, const __fp16 *restrict scales,
|
||||
int n_row_tiles, int n_col_tiles, int n_dot_tiles) {
|
||||
hmx_set_output_scales(scales);
|
||||
__builtin_assume(n_row_tiles > 0);
|
||||
__builtin_assume(n_col_tiles > 0);
|
||||
__builtin_assume(n_dot_tiles > 0);
|
||||
|
||||
Q6_bias_mxmem2_A((void *)scales);
|
||||
|
||||
for (int r = 0; r < n_row_tiles; ++r) {
|
||||
for (int c = 0; c < n_col_tiles; ++c) {
|
||||
@@ -665,16 +687,55 @@ static void core_dot_chunk_fp16(__fp16 *output, const __fp16 *activation, const
|
||||
const __fp16 *col_tiles = weight + c * n_dot_tiles * HMX_FP16_TILE_N_ELMS;
|
||||
|
||||
for (int k = 0; k < n_dot_tiles; ++k) {
|
||||
int offset = k * HMX_FP16_TILE_N_ELMS;
|
||||
hmx_load_tile_pair_fp16(row_tiles + offset, col_tiles + offset);
|
||||
Q6_activation_hf_mxmem_RR((unsigned int)row_tiles, 2047);
|
||||
Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, 2047);
|
||||
row_tiles += HMX_FP16_TILE_N_ELMS;
|
||||
col_tiles += HMX_FP16_TILE_N_ELMS;
|
||||
}
|
||||
|
||||
__fp16 *out_tile = output + (r * n_col_tiles + c) * HMX_FP16_TILE_N_ELMS;
|
||||
hmx_consume_accumulator_fp16(out_tile);
|
||||
Q6_mxmem_AR_after_hf(out_tile, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// --- Async HMX matmul job (for pipeline overlap) ---
|
||||
|
||||
typedef struct {
|
||||
__fp16 * output;
|
||||
const __fp16 * activation;
|
||||
const __fp16 * weight;
|
||||
const __fp16 * scales;
|
||||
uint32_t n_row_tiles;
|
||||
uint32_t n_col_tiles;
|
||||
uint32_t n_dot_tiles;
|
||||
} hmx_matmul_job_t;
|
||||
|
||||
static void hmx_matmul_worker_fn(void * data) {
|
||||
hmx_matmul_job_t * job = (hmx_matmul_job_t *) data;
|
||||
FARF(HIGH, "hmx-mm-job: n_row_tiles %u n_col_tiles %u n_dot_tiles %u", job->n_row_tiles, job->n_col_tiles, job->n_dot_tiles);
|
||||
core_dot_chunk_fp16(job->output, job->activation, job->weight, job->scales, job->n_row_tiles, job->n_col_tiles, job->n_dot_tiles);
|
||||
}
|
||||
|
||||
static inline void hmx_matmul_job_init(hmx_matmul_job_t * job,
|
||||
__fp16 * output,
|
||||
const __fp16 * activation,
|
||||
const __fp16 * weight,
|
||||
const __fp16 * scales,
|
||||
int n_row_tiles,
|
||||
int n_col_tiles,
|
||||
int n_dot_tiles) {
|
||||
job->output = output;
|
||||
job->activation = activation;
|
||||
job->weight = weight;
|
||||
job->scales = scales;
|
||||
job->n_row_tiles = n_row_tiles;
|
||||
job->n_col_tiles = n_col_tiles;
|
||||
job->n_dot_tiles = n_dot_tiles;
|
||||
}
|
||||
|
||||
// --- End async HMX matmul job ---
|
||||
|
||||
static void transfer_output_chunk_fp16_to_fp32(float *restrict dst, const __fp16 *restrict vtcm_src, int n_rows, int n_cols, int n) {
|
||||
assert(n_cols % HMX_FP16_TILE_N_COLS == 0);
|
||||
const int n_col_tiles = n_cols / HMX_FP16_TILE_N_COLS;
|
||||
@@ -832,12 +893,13 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
|
||||
const size_t f32_scratch_per_m = use_dma_activation ? (size_t) params->k * sizeof(float) : 0;
|
||||
|
||||
size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0;
|
||||
// FP16 weight: interleave and activation load have similar per-element cost.
|
||||
if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256,
|
||||
/*per_n=*/3 * vec_dot_size,
|
||||
/*per_m=*/group_size * vec_dot_size + f32_scratch_per_m,
|
||||
/*per_mn=*/sizeof(__fp16),
|
||||
params->m, params->n,
|
||||
&m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
|
||||
/*per_n=*/3 * vec_dot_size,
|
||||
/*per_m=*/group_size * vec_dot_size + f32_scratch_per_m,
|
||||
/*per_mn=*/sizeof(__fp16), params->m, params->n,
|
||||
/*m_block_cost=*/(size_t) params->n,
|
||||
/*n_block_cost=*/(size_t) params->m, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
|
||||
FARF(HIGH, "%s: grouped path does not fit VTCM, falling back to legacy batched loop", __func__);
|
||||
return hmx_mat_mul_permuted_w16a32_batched_legacy(ctx, params);
|
||||
}
|
||||
@@ -1006,13 +1068,15 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co
|
||||
const size_t f32_scratch_per_m = use_dma_activation ? (size_t) k * sizeof(float) : 0;
|
||||
|
||||
size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0;
|
||||
// FP16 weight: interleave and activation load have similar per-element cost.
|
||||
if (hmx_compute_chunks(vtcm_budget,
|
||||
/*overhead=*/ 256,
|
||||
/*per_n=*/ 3 * vec_dot_size, // W + S0 + S1
|
||||
/*per_m=*/ vec_dot_size + f32_scratch_per_m, // A + optional F32 scratch
|
||||
/*per_mn=*/ sizeof(__fp16), // O
|
||||
m, n,
|
||||
&m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
|
||||
/*overhead=*/256,
|
||||
/*per_n=*/3 * vec_dot_size, // W + S0 + S1
|
||||
/*per_m=*/vec_dot_size + f32_scratch_per_m, // A + optional F32 scratch
|
||||
/*per_mn=*/sizeof(__fp16), // O
|
||||
m, n,
|
||||
/*m_block_cost=*/(size_t) n,
|
||||
/*n_block_cost=*/(size_t) m, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
|
||||
FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget);
|
||||
return -1;
|
||||
}
|
||||
@@ -1157,6 +1221,8 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co
|
||||
int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict out, const float *restrict x, const uint8_t *restrict w, int m,
|
||||
int k, int n, int w_type);
|
||||
|
||||
#define FALLBACK_TO_STANDARD 1
|
||||
|
||||
int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict dst, const float *restrict activation,
|
||||
const uint8_t *restrict permuted_weight, int m, int k, int n,
|
||||
int weight_type) {
|
||||
@@ -1169,9 +1235,12 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
|
||||
|
||||
// for large m, k (e.g. prefill FFN Down), use out-stationary version
|
||||
if (m >= 128 && k > n && n > 1024) {
|
||||
FARF(MEDIUM, "hmx_matmul_qk: OUT-STATIONARY path m=%d k=%d n=%d type=%d (K_BLOCK=512, %d K-iters with fp16 intermediate)",
|
||||
m, k, n, weight_type, (k + 511) / 512);
|
||||
return mat_mul_qk_0_d16a32_out_stationary(ctx, dst, activation, permuted_weight, m, k, n, weight_type);
|
||||
int rc = mat_mul_qk_0_d16a32_out_stationary(ctx, dst, activation, permuted_weight, m, k, n, weight_type);
|
||||
if (rc != FALLBACK_TO_STANDARD) {
|
||||
return rc; // 0 success, -1 error
|
||||
}
|
||||
FARF(MEDIUM, "hmx_matmul_qk: out-stationary fallback to standard m=%d k=%d n=%d", m, k, n);
|
||||
// fall through to standard path
|
||||
}
|
||||
|
||||
size_t row_stride = get_x4x2_row_stride(weight_type, k);
|
||||
@@ -1197,9 +1266,10 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
|
||||
}
|
||||
|
||||
size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0;
|
||||
if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256,
|
||||
per_n_cost, /*per_m=*/vec_dot_size, per_mn_cost,
|
||||
m, n, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
|
||||
// Quantized weight: dequant ~1.5x more expensive per element than activation load.
|
||||
if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, per_n_cost, /*per_m=*/vec_dot_size, per_mn_cost, m, n,
|
||||
/*m_block_cost=*/(size_t) n * 3,
|
||||
/*n_block_cost=*/(size_t) m * 2, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
|
||||
FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d pipe=%d budget=%zu)",
|
||||
__func__, m, k, n, use_pipeline, vtcm_budget);
|
||||
return -1;
|
||||
@@ -1256,9 +1326,8 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
|
||||
use_pipeline ? "PIPELINE" : "SEQUENTIAL", m_chunk_n_rows, n_chunk_n_cols,
|
||||
(size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget);
|
||||
|
||||
HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
|
||||
|
||||
if (!use_pipeline) {
|
||||
HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
|
||||
for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
|
||||
// transfer activation matrix chunk into VTCM
|
||||
size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
|
||||
@@ -1318,20 +1387,22 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
|
||||
TIMER_STOP(output_store);
|
||||
}
|
||||
}
|
||||
HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
|
||||
} else {
|
||||
// 4-stage pipeline: DMA load (A), dequantize (B), HMX matmul (C), store (D)
|
||||
// stage B and D (dequantize and store) are expected to be on the critical path
|
||||
// HMX compute (C) runs on dedicated worker thread, overlapping with HVX stages (B, D).
|
||||
|
||||
// A --> B: vtcm_qweight, 1 buffer
|
||||
// B --> C: vtcm_weight0/vtcm_weight1, 2 buffers
|
||||
// C --> D: vtcm_output0/vtcm_output1, 2 buffers
|
||||
|
||||
//
|
||||
// LD ||A3| | B3 ||
|
||||
// MM || C2 ||
|
||||
// ST || D1 | ||
|
||||
// Async timeline (C overlaps B+D):
|
||||
// main+HVX: [A0][Act][B0][A1][sub C0][B1‖C0][A2][wait,sub C1][D0+B2‖C1][wait,sub C2][D1‖C2][wait][D2]
|
||||
// HMX queue: [████ C0 ████████][████ C1 ████████████][████ C2 ████████]
|
||||
|
||||
int n_chunk_cnt = hmx_ceil_div(n, n_chunk_n_cols);
|
||||
hmx_matmul_job_t job_slots[2]; // persistent double-buffered job descriptors
|
||||
|
||||
for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
|
||||
const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
|
||||
|
||||
@@ -1352,31 +1423,34 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
|
||||
transfer_activation_chunk_threaded(ctx, vtcm_activation, activation_chunk, n_rows, k, k);
|
||||
}
|
||||
|
||||
// prologue: B0, A1, C0, B1
|
||||
// prologue: B0, A1, submit C0 (async), B1 (overlaps C0)
|
||||
{
|
||||
// B0
|
||||
// B0: wait for DMA, dequant weight chunk 0
|
||||
dma_queue_pop(ctx->dma[0]);
|
||||
dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type);
|
||||
|
||||
// A1
|
||||
// A1: issue DMA for weight chunk 1
|
||||
const size_t n_cols_A1 = hex_smin(n - 1 * n_chunk_n_cols, n_chunk_n_cols);
|
||||
if (1 < n_chunk_cnt) {
|
||||
const uint8_t *qweight_chunk_A1 = permuted_weight + n_chunk_n_cols * row_stride;
|
||||
dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A1), row_stride, row_stride, row_stride, n_cols_A1);
|
||||
}
|
||||
|
||||
// C0
|
||||
core_dot_chunk_fp16((__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[0], vtcm_scales,
|
||||
hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
|
||||
// submit C0 (non-blocking — HMX worker executes in parallel)
|
||||
hmx_matmul_job_init(&job_slots[0], (__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation,
|
||||
(__fp16 *) vtcm_weight_bufs[0], vtcm_scales,
|
||||
hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS),
|
||||
hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
|
||||
hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[0]));
|
||||
|
||||
// B1
|
||||
// B1: DMA pop + dequant (runs in parallel with C0 on HMX worker)
|
||||
if (1 < n_chunk_cnt) {
|
||||
dma_queue_pop(ctx->dma[0]);
|
||||
dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type);
|
||||
}
|
||||
}
|
||||
|
||||
// main loop
|
||||
// main loop: wait C_i → submit C_{i+1} → D_i + B_{i+2} (parallel with C_{i+1})
|
||||
for (int i = 0; i < n_chunk_cnt; ++i) {
|
||||
const size_t nc = i * n_chunk_n_cols;
|
||||
const size_t nc_p1 = nc + 1 * n_chunk_n_cols;
|
||||
@@ -1386,36 +1460,41 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
|
||||
const size_t n_cols_p1 = hex_smin(n - nc_p1, n_chunk_n_cols);
|
||||
const size_t n_cols_p2 = hex_smin(n - nc_p2, n_chunk_n_cols);
|
||||
|
||||
// issue A_{i+2}
|
||||
// issue A_{i+2}: DMA push (non-blocking)
|
||||
if (i + 2 < n_chunk_cnt) {
|
||||
const uint8_t *qweight_chunk_p2 = permuted_weight + nc_p2 * row_stride;
|
||||
dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_p2), row_stride, row_stride, row_stride, n_cols_p2);
|
||||
}
|
||||
|
||||
// wait for HMX (C_{i}) -- C_{i} is done
|
||||
// wait C_i: block until prologue/previous C completes
|
||||
hmx_queue_pop(ctx->hmx_queue);
|
||||
|
||||
// result of B_{i+1} (input of C_{i+1}) should be ready now
|
||||
|
||||
// issue C_{i+1}
|
||||
// submit C_{i+1} (non-blocking, overlaps with D_i + B_{i+2} below)
|
||||
// job_slots[(i+1)%2] is safe: C_i just completed, freeing slot i%2's
|
||||
// counterpart — and (i+1)%2 was last used by C_{i-1} which completed
|
||||
// before C_i was submitted.
|
||||
if (i + 1 < n_chunk_cnt) {
|
||||
core_dot_chunk_fp16((__fp16 *) vtcm_output_bufs[(i + 1) % 2], (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[(i + 1) % 2], vtcm_scales,
|
||||
hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
|
||||
hmx_matmul_job_init(&job_slots[(i + 1) % 2], (__fp16 *) vtcm_output_bufs[(i + 1) % 2],
|
||||
(__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[(i + 1) % 2],
|
||||
vtcm_scales, hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS),
|
||||
hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
|
||||
hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[(i + 1) % 2]));
|
||||
}
|
||||
|
||||
// compute D_{i}
|
||||
// D_i: store output (multi-thread HVX, parallel with C_{i+1})
|
||||
float *output_chunk = dst + (mr * n + nc);
|
||||
transfer_output_chunk_threaded(ctx, output_chunk, vtcm_output_bufs[i % 2], n_rows, n_cols, n);
|
||||
|
||||
// wait for DMA (A_{i+2}), compute B_{i+2}
|
||||
// B_{i+2}: DMA pop + dequant (multi-thread HVX, parallel with C_{i+1})
|
||||
if (i + 2 < n_chunk_cnt) {
|
||||
dma_queue_pop(ctx->dma[0]);
|
||||
dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
|
||||
hmx_queue_suspend(ctx->hmx_queue);
|
||||
}
|
||||
|
||||
TIMER_STOP(total);
|
||||
|
||||
@@ -1434,10 +1513,13 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
|
||||
}
|
||||
|
||||
// C += AB
|
||||
void core_mma_chunk_fp16(__fp16 *c, const __fp16 *a, const __fp16 *b, const __fp16 *col_scales, const __fp16 *eye_tile,
|
||||
void core_mma_chunk_fp16(__fp16 *restrict c, const __fp16 *restrict a, const __fp16 *restrict b, const __fp16 *restrict col_scales, const __fp16 *restrict eye_tile,
|
||||
int n_row_tiles, int n_col_tiles, int n_dot_tiles, bool zero_init) {
|
||||
__builtin_assume(n_row_tiles > 0);
|
||||
__builtin_assume(n_col_tiles > 0);
|
||||
__builtin_assume(n_dot_tiles > 0);
|
||||
|
||||
hmx_set_output_scales(col_scales);
|
||||
Q6_bias_mxmem2_A((void *)col_scales);
|
||||
|
||||
for (int i = 0; i < n_row_tiles; ++i) {
|
||||
for (int j = 0; j < n_col_tiles; ++j) {
|
||||
@@ -1448,15 +1530,17 @@ void core_mma_chunk_fp16(__fp16 *c, const __fp16 *a, const __fp16 *b, const __fp
|
||||
|
||||
__fp16 *accum_tile = c + (i * n_col_tiles + j) * HMX_FP16_TILE_N_ELMS;
|
||||
if (!zero_init) {
|
||||
hmx_load_tile_pair_fp16(accum_tile, eye_tile);
|
||||
Q6_activation_hf_mxmem_RR((unsigned int)accum_tile, 2047);
|
||||
Q6_weight_hf_mxmem_RR((unsigned int)eye_tile, 2047);
|
||||
}
|
||||
|
||||
for (int k = 0; k < n_dot_tiles; ++k) {
|
||||
int offset = k * HMX_FP16_TILE_N_ELMS;
|
||||
hmx_load_tile_pair_fp16(row_tiles + offset, col_tiles + offset);
|
||||
Q6_activation_hf_mxmem_RR((unsigned int)row_tiles, 2047);
|
||||
Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, 2047);
|
||||
row_tiles += HMX_FP16_TILE_N_ELMS;
|
||||
col_tiles += HMX_FP16_TILE_N_ELMS;
|
||||
}
|
||||
|
||||
hmx_consume_accumulator_fp16(accum_tile);
|
||||
Q6_mxmem_AR_after_hf(accum_tile, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1540,12 +1624,41 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
|
||||
|
||||
const size_t vtcm_budget = ctx->vtcm_size;
|
||||
|
||||
const size_t M_BLOCK_SIZE = 512;
|
||||
const size_t N_BLOCK_SIZE = 512;
|
||||
const size_t K_BLOCK_SIZE = 512;
|
||||
const size_t K_BLOCK_SIZE = 1024;
|
||||
|
||||
// Compute precise buffer sizes
|
||||
// Fallback: if k doesn't need K-blocking, out-stationary has no advantage
|
||||
const size_t k_iters_check = (k + K_BLOCK_SIZE - 1) / K_BLOCK_SIZE;
|
||||
if (k_iters_check <= 1) {
|
||||
FARF(MEDIUM, "%s: K_BLK=%zu >= k=%d, fallback to standard path", __func__, K_BLOCK_SIZE, k);
|
||||
return FALLBACK_TO_STANDARD;
|
||||
}
|
||||
|
||||
// Dynamic M,N search via hmx_compute_chunks
|
||||
const size_t sub_row_stride_alloc = get_x4x2_row_stride(weight_type, K_BLOCK_SIZE);
|
||||
const size_t per_m = K_BLOCK_SIZE * sizeof(float) // scratch1: M×K×4 (act DMA staging F32)
|
||||
+ K_BLOCK_SIZE * sizeof(__fp16); // activation: M×K×2 (F16 tiles)
|
||||
const size_t per_n = sub_row_stride_alloc // scratch0: N×sub_row(K) (packed quant)
|
||||
+ K_BLOCK_SIZE * sizeof(__fp16); // weight: N×K×2 (F16 tiles)
|
||||
const size_t per_mn = sizeof(__fp16); // output: M×N×2 (out-stationary)
|
||||
// Alignment margin: hex_align_up can add up to 2047 bytes per buffer;
|
||||
// scratch1 (mc×6144) is naturally 2048-aligned, remaining 4 buffers need margin
|
||||
const size_t align_margin = 4 * HMX_FP16_TILE_SIZE;
|
||||
const size_t overhead = HMX_FP16_TILE_SIZE + 256 + align_margin; // eye_tile + scales + alignment
|
||||
|
||||
size_t M_BLOCK_SIZE, N_BLOCK_SIZE, vtcm_used;
|
||||
// Cost-based search: minimize ceil(m/mc)*m_block_cost + ceil(n/nc)*n_block_cost.
|
||||
// From profiling: wt_dequant per element ≈ 1.5× activation load per element.
|
||||
// m_block_cost = n*3: each extra M-block re-dequants all N×K weight (expensive).
|
||||
// n_block_cost = m*2: each extra N-block re-loads all M×K activation (cheaper).
|
||||
const size_t m_block_cost = (size_t) n * 3;
|
||||
const size_t n_block_cost = (size_t) m * 2;
|
||||
if (hmx_compute_chunks(vtcm_budget, overhead, per_n, per_m, per_mn, m, n, m_block_cost, n_block_cost, &M_BLOCK_SIZE,
|
||||
&N_BLOCK_SIZE, &vtcm_used) != 0) {
|
||||
FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Compute precise buffer sizes from searched M,N and fixed K
|
||||
const size_t weight_size = hex_align_up(N_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
|
||||
const size_t act_size = hex_align_up(M_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
|
||||
const size_t out_size = hex_align_up(M_BLOCK_SIZE * N_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
|
||||
@@ -1554,7 +1667,8 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
|
||||
|
||||
const size_t total_vtcm = weight_size + act_size + out_size + scratch0_sz + scratch1_sz + HMX_FP16_TILE_SIZE + 256;
|
||||
if (total_vtcm > vtcm_budget) {
|
||||
FARF(HIGH, "%s: VTCM too small: need %zu have %zu (m=%d k=%d n=%d)", __func__, total_vtcm, vtcm_budget, m, k, n);
|
||||
FARF(HIGH, "%s: VTCM overflow after search: need %zu have %zu (M=%zu N=%zu K=%zu)", __func__, total_vtcm,
|
||||
vtcm_budget, M_BLOCK_SIZE, N_BLOCK_SIZE, K_BLOCK_SIZE);
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -1568,8 +1682,8 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
|
||||
__fp16 *vtcm_scales = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256);
|
||||
assert((size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base) <= vtcm_budget);
|
||||
|
||||
FARF(MEDIUM, "%s: m=%d k=%d n=%d wtype=%d vtcm=%zu/%zu", __func__, m, k, n, weight_type,
|
||||
(size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget);
|
||||
FARF(HIGH, "hmx-mm: m=%d k=%d n=%d wtype=%d block M=%zu N=%zu K=%zu vtcm=%zu/%zu", __func__, m, k, n, weight_type,
|
||||
M_BLOCK_SIZE, N_BLOCK_SIZE, K_BLOCK_SIZE, (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget);
|
||||
|
||||
// initialize eye tile (32x32 identity matrix)
|
||||
{
|
||||
|
||||
158
ggml/src/ggml-hexagon/htp/hmx-queue.c
Normal file
158
ggml/src/ggml-hexagon/htp/hmx-queue.c
Normal file
@@ -0,0 +1,158 @@
|
||||
#pragma clang diagnostic ignored "-Wunused-function"
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <qurt_thread.h>
|
||||
#include <qurt_futex.h>
|
||||
|
||||
#include <HAP_compute_res.h>
|
||||
|
||||
#include "hmx-queue.h"
|
||||
|
||||
#define QURT_LOWEST_PRIO (254)
|
||||
|
||||
static inline void hmx_lock(struct hmx_queue *q)
|
||||
{
|
||||
if (!q->hmx_locked) {
|
||||
HAP_compute_res_hmx_lock(q->hap_rctx);
|
||||
q->hmx_locked = true;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void hmx_unlock(struct hmx_queue *q)
|
||||
{
|
||||
if (q->hmx_locked) {
|
||||
HAP_compute_res_hmx_unlock(q->hap_rctx);
|
||||
q->hmx_locked = false;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void hmx_queue_process(struct hmx_queue *q, bool* killed) {
|
||||
unsigned int ir = atomic_load(&q->idx_read);
|
||||
|
||||
while (ir != atomic_load(&q->idx_write)) {
|
||||
struct hmx_queue_desc *d = &q->desc[ir];
|
||||
if (!d->done) {
|
||||
FARF(HIGH, "hmx-queue-process: ir %u func %p data %p", ir, d->func, d->data);
|
||||
|
||||
enum hmx_queue_signal sig = (enum hmx_queue_signal) (unsigned int) d->func;
|
||||
switch (sig) {
|
||||
case HMX_QUEUE_NOOP: /* noop */; break;
|
||||
case HMX_QUEUE_KILL: *killed = true; break;
|
||||
case HMX_QUEUE_SUSPEND: hmx_unlock(q); break;
|
||||
default:
|
||||
hmx_lock(q);
|
||||
d->func(d->data);
|
||||
break;
|
||||
}
|
||||
|
||||
atomic_fetch_add(&d->done, 1);
|
||||
}
|
||||
|
||||
ir = (ir + 1) & q->idx_mask;
|
||||
atomic_store(&q->idx_read, ir);
|
||||
}
|
||||
}
|
||||
|
||||
static void hmx_queue_thread(void * arg) {
|
||||
struct hmx_queue * q = (struct hmx_queue *) arg;
|
||||
|
||||
FARF(HIGH, "hmx-queue-thread: started");
|
||||
|
||||
bool killed = false;
|
||||
|
||||
unsigned int poll_cnt = HMX_QUEUE_POLL_COUNT;
|
||||
unsigned int prev_seqn = 0;
|
||||
while (!killed) {
|
||||
unsigned int seqn = atomic_load(&q->seqn);
|
||||
if (seqn == prev_seqn) {
|
||||
if (--poll_cnt) { hex_pause(); continue; }
|
||||
FARF(HIGH, "hmx-queue-thread: sleeping");
|
||||
qurt_futex_wait(&q->seqn, prev_seqn);
|
||||
continue;
|
||||
}
|
||||
prev_seqn = seqn;
|
||||
poll_cnt = HMX_QUEUE_POLL_COUNT;
|
||||
|
||||
FARF(HIGH, "hmx-queue-thread: new work");
|
||||
|
||||
hmx_queue_process(q, &killed);
|
||||
}
|
||||
|
||||
FARF(HIGH, "hmx-queue-thread: stopped");
|
||||
}
|
||||
|
||||
struct hmx_queue * hmx_queue_create(size_t capacity, uint32_t hap_rctx) {
|
||||
capacity = hex_ceil_pow2(capacity);
|
||||
|
||||
struct hmx_queue * q = (struct hmx_queue *) memalign(32, sizeof(struct hmx_queue));
|
||||
if (q == NULL) {
|
||||
FARF(ERROR, "%s: failed to allocate DMA queue\n", __FUNCTION__);
|
||||
return NULL;
|
||||
}
|
||||
memset(q, 0, sizeof(struct hmx_queue));
|
||||
q->capacity = capacity;
|
||||
q->idx_mask = capacity - 1;
|
||||
q->hap_rctx = hap_rctx;
|
||||
|
||||
q->desc = (struct hmx_queue_desc *) memalign(64, capacity * sizeof(struct hmx_queue_desc));
|
||||
if (!q->desc) {
|
||||
FARF(ERROR, "hmx-queue: failed to allocate HMX queue descriptors\n");
|
||||
return NULL;
|
||||
}
|
||||
memset(q->desc, 0, capacity * sizeof(struct hmx_queue_desc));
|
||||
|
||||
const size_t stack_size = HMX_QUEUE_THREAD_STACK_SIZE;
|
||||
q->stack = (unsigned char *) memalign(64, stack_size);
|
||||
if (!q->stack) {
|
||||
FARF(ERROR, "hmx-queue: thread stack allocation failed (%zu bytes)", stack_size);
|
||||
return NULL;
|
||||
}
|
||||
memset(q->stack, 0, stack_size);
|
||||
|
||||
// Match caller thread priority (same pattern as worker-pool.c).
|
||||
int prio = qurt_thread_get_priority(qurt_thread_get_id());
|
||||
if (prio < 1) {
|
||||
prio = 1;
|
||||
}
|
||||
if (prio > QURT_LOWEST_PRIO) {
|
||||
prio = QURT_LOWEST_PRIO;
|
||||
}
|
||||
|
||||
qurt_thread_attr_t attr;
|
||||
qurt_thread_attr_init(&attr);
|
||||
qurt_thread_attr_set_stack_addr(&attr, q->stack);
|
||||
qurt_thread_attr_set_stack_size(&attr, stack_size);
|
||||
qurt_thread_attr_set_priority(&attr, prio);
|
||||
qurt_thread_attr_set_name(&attr, "hmx-queue");
|
||||
|
||||
int err = qurt_thread_create(&q->thread, &attr, hmx_queue_thread, q);
|
||||
if (err) {
|
||||
FARF(ERROR, "hmx-worker: thread create failed (%d)", err);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
FARF(HIGH, "hmx-queue: capacity %u\n", capacity);
|
||||
|
||||
return q;
|
||||
}
|
||||
|
||||
void hmx_queue_delete(struct hmx_queue * q) {
|
||||
if (!q) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Tell the worker to exit.
|
||||
hmx_queue_flush(q);
|
||||
hmx_queue_signal(q, HMX_QUEUE_KILL);
|
||||
hmx_queue_flush(q);
|
||||
|
||||
int status;
|
||||
qurt_thread_join(q->thread, &status);
|
||||
|
||||
free(q->desc);
|
||||
free(q->stack);
|
||||
free(q);
|
||||
}
|
||||
134
ggml/src/ggml-hexagon/htp/hmx-queue.h
Normal file
134
ggml/src/ggml-hexagon/htp/hmx-queue.h
Normal file
@@ -0,0 +1,134 @@
|
||||
#ifndef HMX_QUEUE_H
|
||||
#define HMX_QUEUE_H
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <stdatomic.h>
|
||||
|
||||
#include <hexagon_types.h>
|
||||
#include <qurt_thread.h>
|
||||
#include <qurt_futex.h>
|
||||
#include <HAP_farf.h>
|
||||
|
||||
#include "hex-utils.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define HMX_QUEUE_THREAD_STACK_SIZE (16 * 1024)
|
||||
#define HMX_QUEUE_POLL_COUNT 2000
|
||||
|
||||
typedef void (*hmx_queue_func)(void *);
|
||||
|
||||
// Dummy funcs used as signals
|
||||
enum hmx_queue_signal {
|
||||
HMX_QUEUE_NOOP = 0, // aka NULL
|
||||
HMX_QUEUE_SUSPEND,
|
||||
HMX_QUEUE_KILL
|
||||
};
|
||||
|
||||
// One unit of queued work: the worker thread executes func(data).
// Small hmx_queue_signal values are smuggled through the func pointer as
// control signals (see hmx_queue_signal()).
struct hmx_queue_desc {
    hmx_queue_func func;  // callback to run, or a casted hmx_queue_signal value
    void * data;          // opaque argument passed to func
    atomic_uint done;     // 0 while pending; incremented by the worker when processed
};
|
||||
|
||||
// Ring buffer of work descriptors serviced by a dedicated QuRT worker thread
// (hmx_queue_thread in hmx-queue.c). Single producer, single consumer.
struct hmx_queue {
    struct hmx_queue_desc * desc;  // ring storage, 'capacity' entries
    atomic_uint idx_write; // updated by producer (push)
    atomic_uint idx_read;  // updated by consumer (process)
    unsigned int idx_pop;  // updated by producer (pop)
    uint32_t idx_mask;     // capacity - 1 (capacity is a power of two)
    uint32_t capacity;     // number of ring slots

    atomic_uint seqn; // incremented for all pushes, used with futex
    qurt_thread_t thread;  // worker thread servicing this queue
    void * stack;          // heap-allocated stack for the worker thread
    uint32_t hap_rctx;     // HAP compute-resource context for HMX lock/unlock
    bool hmx_locked;       // true while the worker holds the HMX hardware lock
};
|
||||
|
||||
struct hmx_queue * hmx_queue_create(size_t capacity, uint32_t hap_rctx);
|
||||
void hmx_queue_delete(struct hmx_queue * q);
|
||||
|
||||
// Build a work descriptor; 'done' starts at zero (pending) via value init.
static inline struct hmx_queue_desc hmx_queue_make_desc(hmx_queue_func func, void * data) {
    struct hmx_queue_desc d = { .func = func, .data = data };
    return d;
}
|
||||
|
||||
static inline bool hmx_queue_push(struct hmx_queue * q, struct hmx_queue_desc d) {
|
||||
unsigned int ir = atomic_load(&q->idx_read);
|
||||
unsigned int iw = q->idx_write;
|
||||
|
||||
if (((iw + 1) & q->idx_mask) == ir) {
|
||||
FARF(HIGH, "hmx-queue-push: queue is full\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
atomic_store(&d.done, 0);
|
||||
|
||||
FARF(HIGH, "hmx-queue-push: iw %u func %p data %p\n", iw, d.func, d.data);
|
||||
|
||||
q->desc[iw] = d;
|
||||
atomic_store(&q->idx_write, (iw + 1) & q->idx_mask);
|
||||
// wake up our thread
|
||||
atomic_fetch_add(&q->seqn, 1);
|
||||
qurt_futex_wake(&q->seqn, 1);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline bool hmx_queue_signal(struct hmx_queue *q, enum hmx_queue_signal sig) {
|
||||
return hmx_queue_push(q, hmx_queue_make_desc((hmx_queue_func) sig, NULL));
|
||||
}
|
||||
|
||||
static inline bool hmx_queue_empty(struct hmx_queue * q) {
|
||||
return q->idx_pop == q->idx_write;
|
||||
}
|
||||
|
||||
static inline uint32_t hmx_queue_depth(struct hmx_queue * q) {
|
||||
return (q->idx_read - q->idx_read) & q->idx_mask;
|
||||
}
|
||||
|
||||
static inline uint32_t hmx_queue_capacity(struct hmx_queue * q) {
|
||||
return q->capacity;
|
||||
}
|
||||
|
||||
static inline struct hmx_queue_desc hmx_queue_pop(struct hmx_queue * q) {
|
||||
unsigned int ip = q->idx_pop;
|
||||
unsigned int iw = q->idx_write;
|
||||
|
||||
struct hmx_queue_desc rd = { NULL, NULL };
|
||||
if (ip == iw) {
|
||||
return rd;
|
||||
}
|
||||
|
||||
// Wait for desc to complete
|
||||
struct hmx_queue_desc * d = &q->desc[ip];
|
||||
while (!atomic_load(&d->done)) {
|
||||
FARF(HIGH, "hmx-queue-pop: waiting for HMX queue : %u\n", ip);
|
||||
hex_pause();
|
||||
}
|
||||
|
||||
rd = *d;
|
||||
q->idx_pop = (ip + 1) & q->idx_mask;
|
||||
|
||||
FARF(HIGH, "hmx-queue-pop: ip %u func %p data %p\n", ip, rd.func, rd.data);
|
||||
return rd;
|
||||
}
|
||||
|
||||
static inline void hmx_queue_flush(struct hmx_queue * q) {
|
||||
while (hmx_queue_pop(q).func != NULL) ;
|
||||
}
|
||||
|
||||
// Ask the worker thread to release the HMX hardware lock (so another client
// can use the unit), then wait until the suspend signal has been processed.
static inline void hmx_queue_suspend(struct hmx_queue *q) {
    hmx_queue_signal(q, HMX_QUEUE_SUSPEND);
    hmx_queue_flush(q);
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif /* HMX_QUEUE_H */
|
||||
@@ -14,10 +14,6 @@
|
||||
|
||||
#define HMX_INLINE_ALWAYS inline __attribute__((unused, always_inline))
|
||||
|
||||
// Load the HMX output-scale (bias) register from the 256-byte scale area.
// NOTE(review): the asm has no "memory" clobber or memory input operand for
// *scales — it assumes the caller's stores to the scale area are already
// visible to the hardware at this point; confirm against the Hexagon HMX
// ISA reference before reordering surrounding stores.
static HMX_INLINE_ALWAYS void hmx_set_output_scales(const void *scales) {
    asm volatile("bias = mxmem2(%0)" :: "r"(scales));
}
|
||||
|
||||
// Initialise aligned 256-byte area with scale vector + zero padding.
|
||||
static HMX_INLINE_ALWAYS void hmx_init_column_scales(void *out_scales, HVX_Vector v_scale) {
|
||||
HVX_Vector *pv = (HVX_Vector *)out_scales;
|
||||
@@ -25,58 +21,6 @@ static HMX_INLINE_ALWAYS void hmx_init_column_scales(void *out_scales, HVX_Vecto
|
||||
*pv = Q6_V_vzero();
|
||||
}
|
||||
|
||||
// Load multiple contiguous tiles with :deep streaming.
// row_tiles feeds the HMX activation register file, col_tiles the weight
// register file; both point at n_tiles consecutive HMX_FP16_TILE_SIZE-byte
// tiles in VTCM.
// Rt = total region size - 1; the hardware streams through [Rs, Rs + Rt].
// IMPORTANT: the tile region [Rs, Rs + Rt] must NOT cross a VTCM 4 MB bank
// boundary, otherwise the mxmem instruction will raise a precise bus error.
// Callers must ensure their VTCM layout satisfies this constraint.
static HMX_INLINE_ALWAYS void hmx_load_tiles_fp16(const __fp16 *row_tiles,
                                                  const __fp16 *col_tiles,
                                                  size_t n_tiles) {
    // Rt covers the whole n_tiles region (inclusive upper bound, hence -1).
    size_t limit = n_tiles * HMX_FP16_TILE_SIZE - 1;
    asm volatile(
        "{ activation.hf = mxmem(%0, %1):deep\n"
        "weight.hf = mxmem(%2, %3) }\n"
        :: "r"(row_tiles), "r"(limit), "r"(col_tiles), "r"(limit)
        : "memory");
}
|
||||
|
||||
// Load a single activation+weight tile pair (no :deep streaming).
// act_tile / wt_tile each point at one HMX_FP16_TILE_SIZE-byte tile in VTCM.
// Rt defines the accessible region [Rs, Rs+Rt]. Following the reference formula
// (limit = n_tiles * HMX_FP16_TILE_SIZE - 1), for a single tile Rt = 2047.
// The original code used Rt=0x7FFF (32 KB region); when dynamic VTCM allocation
// places a tile near a 4 MB bank boundary, the oversized region crosses it and
// triggers a precise bus error (0x2601). Rt=2047 confines accesses to exactly
// one 2048-byte tile while covering all 16 HVX vectors (offsets 0..2047).
static HMX_INLINE_ALWAYS void hmx_load_tile_pair_fp16(const __fp16 *act_tile,
                                                      const __fp16 *wt_tile) {
    asm volatile(
        "{ activation.hf = mxmem(%0, %1)\n"
        "weight.hf = mxmem(%2, %3) }\n"
        :: "r"(act_tile), "r"(2047),
           "r"(wt_tile), "r"(2047)
        : "memory");
}
|
||||
|
||||
// Convert the HMX accumulator to fp16 and store one output tile at 'out'.
static HMX_INLINE_ALWAYS void hmx_consume_accumulator_fp16(__fp16 *out) {
    // Use the combined convert-and-store instruction (matches the reference
    // Q6_mxmem_AR_after_hf intrinsic). The previous two-instruction sequence
    // "cvt.hf = acc(2); mxmem = cvt" used an undocumented Rs=2 parameter.
    asm volatile(
        "mxmem(%0, %1):after.hf = acc\n"
        :: "r"(out), "r"(0)
        : "memory");
}
|
||||
|
||||
// Compute inner product of two vectors of tiles and store result.
// Streams n_tiles activation/weight tile pairs through the HMX unit
// (accumulating in hardware), then converts the accumulator to fp16 and
// writes one output tile to 'out'.
static HMX_INLINE_ALWAYS void hmx_dot_fp16(__fp16 *out,
                                           const __fp16 *row_tiles,
                                           const __fp16 *col_tiles,
                                           size_t n_tiles) {
    hmx_load_tiles_fp16(row_tiles, col_tiles, n_tiles);
    hmx_consume_accumulator_fp16(out);
}
|
||||
|
||||
// --- VTCM sequential allocator (from htp-ops-lib/include/dsp/vtcm_mgr.h) ---
|
||||
|
||||
static inline uint8_t *vtcm_seq_alloc(uint8_t **vtcm_ptr, size_t size) {
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
#define HTP_CTX_H
|
||||
|
||||
#include "hex-dma.h"
|
||||
#include "hmx-queue.h"
|
||||
#include "htp-ops.h"
|
||||
#include "worker-pool.h"
|
||||
|
||||
@@ -30,6 +31,8 @@ struct htp_spad {
|
||||
uint32_t size_per_thread; // size per thread
|
||||
};
|
||||
|
||||
struct htp_context;
|
||||
|
||||
// Context while processing an Op
|
||||
// TODO: fold this into the main context
|
||||
struct htp_ops_context {
|
||||
@@ -72,6 +75,10 @@ struct htp_context {
|
||||
atomic_bool vtcm_needs_release;
|
||||
|
||||
struct htp_ops_context octx;
|
||||
|
||||
#ifdef HTP_HAS_HMX
|
||||
struct hmx_queue * hmx_queue; // Async HMX queue for pipeline overlap
|
||||
#endif
|
||||
};
|
||||
|
||||
int op_matmul(struct htp_ops_context * octx);
|
||||
|
||||
@@ -91,7 +91,12 @@ enum htp_op_code {
|
||||
#define HTP_OP_MAX_BUFS 8
|
||||
#define HTP_OP_MAX_REQS 256
|
||||
#define HTP_OP_MAX_TENSORS (HTP_OP_MAX_REQS * HTP_OP_MAX_INPUTS + HTP_OP_MAX_REQS)
|
||||
|
||||
#if __HVX_ARCH__ < 75
|
||||
#define HTP_OP_MAX_VMEM (3167538380u)
|
||||
#else
|
||||
#define HTP_OP_MAX_VMEM (3221225472u)
|
||||
#endif
|
||||
|
||||
enum htp_tensor_flags {
|
||||
HTP_TENSOR_COMPUTE = (1U << 0), // Tensor buffer temporal compute data (not weights)
|
||||
|
||||
@@ -116,9 +116,14 @@ static inline HVX_VectorPred hvx_vec_is_nan_f16(HVX_Vector v) {
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_vec_f32_to_f16_shuff(HVX_Vector v0, HVX_Vector v1) {
|
||||
#if __HVX_ARCH__ >= 81
|
||||
HVX_Vector q0 = Q6_Vqf32_equals_Vsf(v0);
|
||||
HVX_Vector q1 = Q6_Vqf32_equals_Vsf(v1);
|
||||
#else
|
||||
const HVX_Vector zero = Q6_V_vzero();
|
||||
HVX_Vector q0 = Q6_Vqf32_vadd_VsfVsf(v0, zero);
|
||||
HVX_Vector q1 = Q6_Vqf32_vadd_VsfVsf(v1, zero);
|
||||
#endif
|
||||
return Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(q1, q0));
|
||||
}
|
||||
|
||||
|
||||
@@ -18,8 +18,9 @@
|
||||
#include <remote.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "hex-dma.h"
|
||||
#include "hex-utils.h"
|
||||
#include "hex-dma.h"
|
||||
#include "hmx-queue.h"
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
@@ -324,6 +325,14 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
|
||||
|
||||
#ifdef HTP_HAS_HMX
|
||||
ctx->hmx_enabled = use_hmx;
|
||||
ctx->hmx_queue = NULL;
|
||||
if (use_hmx) {
|
||||
ctx->hmx_queue = hmx_queue_create(16, ctx->vtcm_rctx);
|
||||
if (!ctx->hmx_queue) {
|
||||
FARF(ERROR, "hmx-queue-create failed");
|
||||
ctx->hmx_enabled = false;
|
||||
}
|
||||
}
|
||||
FARF(HIGH, "HMX %s (use_hmx=%d)", ctx->hmx_enabled ? "enabled" : "disabled", use_hmx);
|
||||
#endif
|
||||
|
||||
@@ -389,7 +398,11 @@ AEEResult htp_iface_stop(remote_handle64 handle) {
|
||||
}
|
||||
|
||||
#ifdef HTP_HAS_HMX
|
||||
ctx->hmx_enabled = 0;
|
||||
if (ctx->hmx_queue) {
|
||||
hmx_queue_delete(ctx->hmx_queue);
|
||||
ctx->hmx_queue = NULL;
|
||||
}
|
||||
ctx->hmx_enabled = false;
|
||||
#endif
|
||||
|
||||
vtcm_free(ctx);
|
||||
|
||||
@@ -250,6 +250,7 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_unary(ggml_metal
|
||||
case GGML_UNARY_OP_CEIL: op_num = OP_UNARY_NUM_CEIL; break;
|
||||
case GGML_UNARY_OP_ROUND: op_num = OP_UNARY_NUM_ROUND; break;
|
||||
case GGML_UNARY_OP_TRUNC: op_num = OP_UNARY_NUM_TRUNC; break;
|
||||
case GGML_UNARY_OP_XIELU: op_num = OP_UNARY_NUM_XIELU; break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
} break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
|
||||
@@ -1043,6 +1043,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||
case GGML_UNARY_OP_CEIL:
|
||||
case GGML_UNARY_OP_ROUND:
|
||||
case GGML_UNARY_OP_TRUNC:
|
||||
case GGML_UNARY_OP_XIELU:
|
||||
return ggml_is_contiguous_rows(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
|
||||
default:
|
||||
return false;
|
||||
@@ -1159,6 +1160,23 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||
if (op->src[1]->type != op->src[2]->type) {
|
||||
return false;
|
||||
}
|
||||
switch (op->src[1]->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
break;
|
||||
case GGML_TYPE_BF16:
|
||||
if (!has_bfloat) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
return has_simdgroup_mm; // TODO: over-restricted for vec-kernels
|
||||
case GGML_OP_SSM_CONV:
|
||||
case GGML_OP_SSM_SCAN:
|
||||
|
||||
@@ -127,6 +127,7 @@
|
||||
#define OP_UNARY_NUM_CEIL 118
|
||||
#define OP_UNARY_NUM_ROUND 119
|
||||
#define OP_UNARY_NUM_TRUNC 120
|
||||
#define OP_UNARY_NUM_XIELU 121
|
||||
|
||||
#define OP_SUM_ROWS_NUM_SUM_ROWS 10
|
||||
#define OP_SUM_ROWS_NUM_MEAN 11
|
||||
|
||||
@@ -787,6 +787,13 @@ int ggml_metal_op_unary(ggml_metal_op_t ctx, int idx) {
|
||||
args.max = ggml_get_op_params_f32(op, 1);
|
||||
}
|
||||
|
||||
if (op->op == GGML_OP_UNARY && ggml_get_unary_op(op) == GGML_UNARY_OP_XIELU) {
|
||||
args.slope = ggml_get_op_params_f32(op, 1); // alpha_n
|
||||
args.scale = ggml_get_op_params_f32(op, 2); // alpha_p
|
||||
args.bias = ggml_get_op_params_f32(op, 3); // beta
|
||||
args.val = ggml_get_op_params_f32(op, 4); // eps
|
||||
}
|
||||
|
||||
auto pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
|
||||
|
||||
if (pipeline.c4) {
|
||||
|
||||
@@ -1177,6 +1177,15 @@ kernel void kernel_unary_impl(
|
||||
if (FC_OP == OP_UNARY_NUM_TRUNC) {
|
||||
dst_ptr[i0] = (T) trunc(x);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_XIELU) {
|
||||
const TC xi = x;
|
||||
const TC gate = TC(xi > TC(0.0f));
|
||||
const TC clamped = fmin(xi, TC(args.val));
|
||||
const TC y_pos = TC(args.scale) * xi * xi + TC(args.bias) * xi;
|
||||
const TC y_neg = (exp(clamped) - TC(1.0f) - xi) * TC(args.slope) + TC(args.bias) * xi;
|
||||
dst_ptr[i0] = (T) (gate * y_pos + (TC(1.0f) - gate) * y_neg);
|
||||
}
|
||||
}
|
||||
|
||||
#undef FC_OP
|
||||
|
||||
@@ -20,6 +20,13 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
|
||||
#define VULKAN_HPP_DEFAULT_DISPATCHER ggml_vk_default_dispatcher()
|
||||
|
||||
#include <vulkan/vulkan.hpp>
|
||||
// SPIRV-Headers: LunarG Windows SDK uses Include/spirv-headers/spirv.hpp (not spirv/unified1/). MinGW/MSYS2 and
|
||||
// Linux packages use Khronos layout spirv/unified1/spirv.hpp. See docs/build.md#vulkan.
|
||||
#if defined(_WIN32) && !defined(__MINGW32__)
|
||||
#include <spirv-headers/spirv.hpp>
|
||||
#else
|
||||
#include <spirv/unified1/spirv.hpp>
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
@@ -2131,6 +2138,66 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
||||
GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
|
||||
|
||||
vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast<const uint32_t *>(spv_data));
|
||||
|
||||
// Patch SPIR-V to enable RTE rounding for FP16, avoiding the need for
|
||||
// separate shader variants compiled with -DRTE16.
|
||||
std::vector<uint32_t> spv;
|
||||
if (device->float_controls_rte_fp16) {
|
||||
const uint32_t* spv_words = reinterpret_cast<const uint32_t *>(spv_data);
|
||||
size_t word_count = spv_size / sizeof(uint32_t);
|
||||
spv.assign(spv_words, spv_words + word_count);
|
||||
|
||||
// Find insertion points respecting SPIR-V layout order:
|
||||
// Header(5) -> OpCapability -> OpExtension -> ... -> OpEntryPoint -> OpExecutionMode -> ...
|
||||
size_t pos = 5; // skip header
|
||||
size_t cap_insert_pos = pos;
|
||||
size_t ext_insert_pos = pos;
|
||||
size_t exec_insert_pos = pos;
|
||||
uint32_t entry_point_id = 0;
|
||||
|
||||
while (pos < spv.size()) {
|
||||
uint32_t opcode = spv[pos] & spv::OpCodeMask;
|
||||
uint32_t len = spv[pos] >> spv::WordCountShift;
|
||||
if (len == 0) break;
|
||||
|
||||
if (opcode == spv::OpCapability) {
|
||||
cap_insert_pos = pos + len;
|
||||
ext_insert_pos = pos + len;
|
||||
} else if (opcode == spv::OpExtension) {
|
||||
ext_insert_pos = pos + len;
|
||||
} else if (opcode == spv::OpEntryPoint) {
|
||||
entry_point_id = spv[pos + 2];
|
||||
exec_insert_pos = pos + len;
|
||||
} else if (opcode == spv::OpExecutionMode || opcode == spv::OpExecutionModeId) {
|
||||
exec_insert_pos = pos + len;
|
||||
} else if (entry_point_id != 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
pos += len;
|
||||
}
|
||||
|
||||
// Insert from latest position first so earlier indices stay valid.
|
||||
|
||||
// OpExecutionMode %entrypoint RoundingModeRTE 16
|
||||
uint32_t exec_mode[] = { (4u << spv::WordCountShift) | spv::OpExecutionMode, entry_point_id, spv::ExecutionModeRoundingModeRTE, 16 };
|
||||
spv.insert(spv.begin() + exec_insert_pos, std::begin(exec_mode), std::end(exec_mode));
|
||||
|
||||
// OpExtension "SPV_KHR_float_controls"
|
||||
const char ext_str[] = "SPV_KHR_float_controls";
|
||||
size_t ext_str_words = CEIL_DIV(sizeof(ext_str), sizeof(uint32_t));
|
||||
std::vector<uint32_t> extension(1 + ext_str_words, 0);
|
||||
extension[0] = (uint32_t)((1 + ext_str_words) << spv::WordCountShift) | spv::OpExtension;
|
||||
memcpy(&extension[1], ext_str, sizeof(ext_str));
|
||||
spv.insert(spv.begin() + ext_insert_pos, extension.begin(), extension.end());
|
||||
|
||||
// OpCapability RoundingModeRTE
|
||||
uint32_t capability[] = { (2u << spv::WordCountShift) | spv::OpCapability, spv::CapabilityRoundingModeRTE };
|
||||
spv.insert(spv.begin() + cap_insert_pos, std::begin(capability), std::end(capability));
|
||||
|
||||
shader_module_create_info = vk::ShaderModuleCreateInfo({}, spv.size() * sizeof(uint32_t), spv.data());
|
||||
}
|
||||
|
||||
pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);
|
||||
|
||||
vk::PushConstantRange pcr(
|
||||
@@ -4344,10 +4411,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rms_norm_partials_f32, "rms_norm_partials_f32", rms_norm_partials_f32_len, rms_norm_partials_f32_data, "main", 4, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 0}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_partials_f32, "rms_norm_mul_partials_f32", rms_norm_partials_f32_len, rms_norm_partials_f32_data, "main", 4, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 1}, 1, true);
|
||||
|
||||
if (device->float_controls_rte_fp16 &&
|
||||
sizeof(vk_op_rms_norm_mul_rope_push_constants) <= device->properties.limits.maxPushConstantsSize) {
|
||||
if (sizeof(vk_op_rms_norm_mul_rope_push_constants) <= device->properties.limits.maxPushConstantsSize) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_rope_f32_f32, "rms_norm_mul_rope_f32_f32", rms_norm_mul_rope_f32_f32_len, rms_norm_mul_rope_f32_f32_data, "main", 7, sizeof(vk_op_rms_norm_mul_rope_push_constants), {1, 1, 1}, {0, 1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_rope_f32_f16, "rms_norm_mul_rope_f32_f16", rms_norm_mul_rope_f32_f16_rte_len, rms_norm_mul_rope_f32_f16_rte_data, "main", 7, sizeof(vk_op_rms_norm_mul_rope_push_constants), {1, 1, 1}, {0, 1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_rope_f32_f16, "rms_norm_mul_rope_f32_f16", rms_norm_mul_rope_f32_f16_len, rms_norm_mul_rope_f32_f16_data, "main", 7, sizeof(vk_op_rms_norm_mul_rope_push_constants), {1, 1, 1}, {0, 1}, 1, true);
|
||||
}
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rms_norm_back_f32, "rms_norm_back_f32", rms_norm_back_f32_len, rms_norm_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
|
||||
@@ -4372,43 +4438,28 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_transpose_32, "cpy_transpose_32", cpy_transpose_32_len, cpy_transpose_32_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_transpose_16, "cpy_transpose_16", cpy_transpose_16_len, cpy_transpose_16_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1);
|
||||
|
||||
if (device->float_controls_rte_fp16) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q1_0], "cpy_f32_q1_0", cpy_f32_q1_0_rte_len, cpy_f32_q1_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_rte_len, cpy_f32_q4_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_rte_len, cpy_f32_q4_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_rte_len, cpy_f32_q5_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_rte_len, cpy_f32_q5_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_rte_len, cpy_f32_q8_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_rte_len, cpy_f32_iq4_nl_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
} else {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q1_0], "cpy_f32_q1_0", cpy_f32_q1_0_len, cpy_f32_q1_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
}
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q1_0], "cpy_f32_q1_0", cpy_f32_q1_0_len, cpy_f32_q1_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
|
||||
#define SET_ROWS(itype, rte) \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_F32], "set_rows_f32" #itype, set_rows_f32 ## itype ## rte ## _len, set_rows_f32 ## itype ## rte ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_F16], "set_rows_f16" #itype, set_rows_f16 ## itype ## rte ## _len, set_rows_f16 ## itype ## rte ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_BF16], "set_rows_bf16" #itype, set_rows_bf16 ## itype ## rte ## _len, set_rows_bf16 ## itype ## rte ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q1_0], "set_rows_q1_0" #itype, set_rows_q1_0 ## itype ## rte ## _len, set_rows_q1_0 ## itype ## rte ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q4_0], "set_rows_q4_0" #itype, set_rows_q4_0 ## itype ## rte ## _len, set_rows_q4_0 ## itype ## rte ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q4_1], "set_rows_q4_1" #itype, set_rows_q4_1 ## itype ## rte ## _len, set_rows_q4_1 ## itype ## rte ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q5_0], "set_rows_q5_0" #itype, set_rows_q5_0 ## itype ## rte ## _len, set_rows_q5_0 ## itype ## rte ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q5_1], "set_rows_q5_1" #itype, set_rows_q5_1 ## itype ## rte ## _len, set_rows_q5_1 ## itype ## rte ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q8_0], "set_rows_q8_0" #itype, set_rows_q8_0 ## itype ## rte ## _len, set_rows_q8_0 ## itype ## rte ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_IQ4_NL], "set_rows_iq4_nl" #itype, set_rows_iq4_nl ## itype ## rte ## _len, set_rows_iq4_nl ## itype ## rte ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
|
||||
#define SET_ROWS(itype) \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_F32], "set_rows_f32" #itype, set_rows_f32 ## itype ## _len, set_rows_f32 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_F16], "set_rows_f16" #itype, set_rows_f16 ## itype ## _len, set_rows_f16 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_BF16], "set_rows_bf16" #itype, set_rows_bf16 ## itype ## _len, set_rows_bf16 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q1_0], "set_rows_q1_0" #itype, set_rows_q1_0 ## itype ## _len, set_rows_q1_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q4_0], "set_rows_q4_0" #itype, set_rows_q4_0 ## itype ## _len, set_rows_q4_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q4_1], "set_rows_q4_1" #itype, set_rows_q4_1 ## itype ## _len, set_rows_q4_1 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q5_0], "set_rows_q5_0" #itype, set_rows_q5_0 ## itype ## _len, set_rows_q5_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q5_1], "set_rows_q5_1" #itype, set_rows_q5_1 ## itype ## _len, set_rows_q5_1 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q8_0], "set_rows_q8_0" #itype, set_rows_q8_0 ## itype ## _len, set_rows_q8_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_IQ4_NL], "set_rows_iq4_nl" #itype, set_rows_iq4_nl ## itype ## _len, set_rows_iq4_nl ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
|
||||
|
||||
if (device->float_controls_rte_fp16) {
|
||||
SET_ROWS(_i32, _rte)
|
||||
SET_ROWS(_i64, _rte)
|
||||
} else {
|
||||
SET_ROWS(_i32, )
|
||||
SET_ROWS(_i64, )
|
||||
}
|
||||
SET_ROWS(_i32)
|
||||
SET_ROWS(_i64)
|
||||
#undef SET_ROWS
|
||||
|
||||
|
||||
@@ -4428,11 +4479,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
return s;
|
||||
};
|
||||
|
||||
bool rte = device->float_controls_rte_fp16;
|
||||
#define CREATE_BINARY(name, namemod, spec, bindings) \
|
||||
for (int s0 : {0,1}) for (int s1 : {0,1}) for (int d : {0,1}) \
|
||||
ggml_vk_create_pipeline2(device, device->pipeline_ ## name ## namemod[s0][s1][d], \
|
||||
#name + get_suffix(s0, s1, d) + #namemod, name ## _len[s0][s1][d][rte], name ## _data[s0][s1][d][rte], \
|
||||
#name + get_suffix(s0, s1, d) + #namemod, name ## _len[s0][s1][d], name ## _data[s0][s1][d], \
|
||||
"main", (bindings), sizeof(vk_op_binary_push_constants), {512, 1, 1}, spec, 1);
|
||||
|
||||
CREATE_BINARY(add, , {0}, 4)
|
||||
@@ -4475,13 +4525,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_sin_f32, "sin_f32", sin_f32_len, sin_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cos_f32, "cos_f32", cos_f32_len, cos_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
if (device->float_controls_rte_fp16) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_log[0], "log_f32_rte", log_f32_rte_len, log_f32_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_log[1], "log_f16_rte", log_f16_rte_len, log_f16_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
} else {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_log[0], "log_f32", log_f32_len, log_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_log[1], "log_f16", log_f16_len, log_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
}
|
||||
ggml_vk_create_pipeline(device, device->pipeline_log[0], "log_f32", log_f32_len, log_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_log[1], "log_f16", log_f16_len, log_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_tri[0], "tri_f32", tri_f32_len, tri_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_tri[1], "tri_f16", tri_f16_len, tri_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
@@ -4522,19 +4567,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
CREATE_UNARY(floor)
|
||||
CREATE_UNARY(trunc)
|
||||
CREATE_UNARY(sgn)
|
||||
CREATE_UNARY(exp)
|
||||
#undef CREATE_UNARY
|
||||
|
||||
#define CREATE_UNARY_RTE(name) \
|
||||
if (device->float_controls_rte_fp16) { \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32_rte", name ## _f32_rte_len, name ## _f32_rte_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16_rte", name ## _f16_rte_len, name ## _f16_rte_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \
|
||||
} else { \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \
|
||||
}
|
||||
CREATE_UNARY_RTE(exp)
|
||||
#undef CREATE_UNARY_RTE
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_add1_f16_f16, "add1_f16_f16", add1_f16_f16_len, add1_f16_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_add1_f16_f32, "add1_f16_f32", add1_f16_f32_len, add1_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_add1_f32_f32, "add1_f32_f32", add1_f32_f32_len, add1_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
@@ -4544,13 +4579,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_fill_f32, "fill_f32", fill_f32_len, fill_f32_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
#define CREATE_GLU(name) \
|
||||
if (device->float_controls_rte_fp16) { \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32_rte", name ## _f32_rte_len, name ## _f32_rte_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16_rte", name ## _f16_rte_len, name ## _f16_rte_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \
|
||||
} else { \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \
|
||||
}
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true);
|
||||
|
||||
CREATE_GLU(geglu)
|
||||
CREATE_GLU(reglu)
|
||||
@@ -4583,25 +4613,14 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32, "rope_multi_f32", rope_multi_f32_len, rope_multi_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_vision_f32, "rope_vision_f32", rope_vision_f32_len, rope_vision_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
|
||||
if (device->float_controls_rte_fp16) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_rte_len, rope_norm_f16_rte_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_rte_len, rope_neox_f16_rte_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f16, "rope_multi_f16", rope_multi_f16_rte_len, rope_multi_f16_rte_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_vision_f16, "rope_vision_f16", rope_vision_f16_rte_len, rope_vision_f16_rte_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f16, "rope_multi_f16", rope_multi_f16_len, rope_multi_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_vision_f16, "rope_vision_f16", rope_vision_f16_len, rope_vision_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32_f16, "rope_norm_f32_f16", rope_norm_f32_f16_rte_len, rope_norm_f32_f16_rte_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32_f16, "rope_neox_f32_f16", rope_neox_f32_f16_rte_len, rope_neox_f32_f16_rte_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32_f16, "rope_multi_f32_f16", rope_multi_f32_f16_rte_len, rope_multi_f32_f16_rte_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
} else {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f16, "rope_multi_f16", rope_multi_f16_len, rope_multi_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_vision_f16, "rope_vision_f16", rope_vision_f16_len, rope_vision_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32_f16, "rope_norm_f32_f16", rope_norm_f32_f16_len, rope_norm_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32_f16, "rope_neox_f32_f16", rope_neox_f32_f16_len, rope_neox_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32_f16, "rope_multi_f32_f16", rope_multi_f32_f16_len, rope_multi_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
}
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32_f16, "rope_norm_f32_f16", rope_norm_f32_f16_len, rope_norm_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32_f16, "rope_neox_f32_f16", rope_neox_f32_f16_len, rope_neox_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32_f16, "rope_multi_f32_f16", rope_multi_f32_f16_len, rope_multi_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
|
||||
for (uint32_t i = 0; i < num_argsort_pipelines; ++i) {
|
||||
uint32_t BLOCK_SIZE = 1u << std::min(i, device->max_workgroup_size_log2);
|
||||
@@ -4663,13 +4682,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
#define IM2COL(bda) \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32 ## bda ## _len, im2col_f32 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_3d_f32, "im2col_3d_f32", im2col_3d_f32 ## bda ## _len, im2col_3d_f32 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_3d_push_constants), {512, 1, 1}, { 512 }, 1, true); \
|
||||
if (device->float_controls_rte_fp16) { \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte ## bda ## _len, im2col_f32_f16_rte ## bda ## _data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_3d_f32_f16, "im2col_3d_f32_f16", im2col_3d_f32_f16_rte ## bda ## _len, im2col_3d_f32_f16_rte ## bda ## _data, "main", 2, sizeof(vk_op_im2col_3d_push_constants), {512, 1, 1}, { 512 }, 1, true); \
|
||||
} else { \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16 ## bda ## _len, im2col_f32_f16 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_3d_f32_f16, "im2col_3d_f32_f16", im2col_3d_f32_f16 ## bda ## _len, im2col_3d_f32_f16 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_3d_push_constants), {512, 1, 1}, { 512 }, 1, true); \
|
||||
}
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16 ## bda ## _len, im2col_f32_f16 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_3d_f32_f16, "im2col_3d_f32_f16", im2col_3d_f32_f16 ## bda ## _len, im2col_3d_f32_f16 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_3d_push_constants), {512, 1, 1}, { 512 }, 1, true);
|
||||
if (device->shader_int64 && device->buffer_device_address) {
|
||||
IM2COL(_bda)
|
||||
} else {
|
||||
@@ -14343,8 +14357,7 @@ static bool ggml_vk_can_fuse_rms_norm_mul_rope(ggml_backend_vk_context * ctx, co
|
||||
}
|
||||
|
||||
// conditions for pipeline creation
|
||||
if (!(ctx->device->float_controls_rte_fp16 &&
|
||||
sizeof(vk_op_rms_norm_mul_rope_push_constants) <= ctx->device->properties.limits.maxPushConstantsSize)) {
|
||||
if (sizeof(vk_op_rms_norm_mul_rope_push_constants) > ctx->device->properties.limits.maxPushConstantsSize) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
#version 450
|
||||
|
||||
#include "rte.glsl"
|
||||
#include "types.glsl"
|
||||
|
||||
#if defined(SET_ROWS) && QUANT_K == 1
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
#version 450
|
||||
|
||||
#include "rte.glsl"
|
||||
#include "types.glsl"
|
||||
#include "generic_unary_head.glsl"
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
#version 450
|
||||
|
||||
#include "rte.glsl"
|
||||
#include "generic_head.glsl"
|
||||
#include "types.glsl"
|
||||
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
|
||||
#include "rte.glsl"
|
||||
#include "utils.glsl"
|
||||
#if RMS_NORM_ROPE_FUSION
|
||||
#include "rope_params.glsl"
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
|
||||
#include "rte.glsl"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
|
||||
#include "rte.glsl"
|
||||
#include "types.glsl"
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
|
||||
|
||||
#include "rte.glsl"
|
||||
#include "types.glsl"
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
#version 450
|
||||
|
||||
#include "rte.glsl"
|
||||
#include "types.glsl"
|
||||
#include "generic_unary_head.glsl"
|
||||
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
#extension GL_KHR_shader_subgroup_basic : enable
|
||||
#endif
|
||||
|
||||
#include "rte.glsl"
|
||||
#include "types.glsl"
|
||||
#include "utils.glsl"
|
||||
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
|
||||
#include "rte.glsl"
|
||||
#include "rope_params.glsl"
|
||||
|
||||
layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in;
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
#if !defined(GGML_ROPE_PARAMS)
|
||||
#define GGML_ROPE_PARAMS
|
||||
|
||||
#include "rte.glsl"
|
||||
|
||||
struct rope_params {
|
||||
uint rope_mode;
|
||||
uint nrows;
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
|
||||
#if RTE16
|
||||
#extension GL_EXT_spirv_intrinsics : enable
|
||||
spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits
|
||||
#endif // RTE16
|
||||
@@ -1,6 +1,5 @@
|
||||
#version 450
|
||||
|
||||
#include "rte.glsl"
|
||||
#include "types.glsl"
|
||||
#include "generic_unary_head.glsl"
|
||||
|
||||
|
||||
@@ -745,7 +745,7 @@ void process_shaders() {
|
||||
string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("rms_norm_partials_f32", "rms_norm_partials.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("rms_norm_mul_rope_f32_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"ROPE_D_TYPE", "float"}, {"RMS_NORM_ROPE_FUSION", "1"}}));
|
||||
string_to_spv("rms_norm_mul_rope_f32_f16_rte", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"ROPE_D_TYPE", "float16_t"}, {"RMS_NORM_ROPE_FUSION", "1"}, {"RTE16", "1"}}));
|
||||
string_to_spv("rms_norm_mul_rope_f32_f16", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"ROPE_D_TYPE", "float16_t"}, {"RMS_NORM_ROPE_FUSION", "1"}}));
|
||||
string_to_spv("rms_norm_back_f32", "rms_norm_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("l2_norm_f32", "l2_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
|
||||
@@ -769,15 +769,12 @@ void process_shaders() {
|
||||
|
||||
for (std::string t : {"q1_0", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
|
||||
string_to_spv("cpy_f32_" + t, "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
string_to_spv("cpy_f32_" + t + "_rte", "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}});
|
||||
string_to_spv("cpy_" + t + "_f32", "copy_from_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}
|
||||
|
||||
for (std::string t : {"f32", "f16", "bf16", "q1_0", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
|
||||
string_to_spv("set_rows_" + t + "_i32", "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uint"}, {"B_SIZE", "32"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
string_to_spv("set_rows_" + t + "_i32_rte", "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uint"}, {"B_SIZE", "32"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}});
|
||||
string_to_spv("set_rows_" + t + "_i64", "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uvec2"}, {"B_SIZE", "64"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
string_to_spv("set_rows_" + t + "_i64_rte", "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uvec2"}, {"B_SIZE", "64"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}});
|
||||
string_to_spv("set_rows_" + t + "_i32", "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uint"}, {"B_SIZE", "32"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
string_to_spv("set_rows_" + t + "_i64", "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uvec2"}, {"B_SIZE", "64"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}
|
||||
|
||||
auto get_type_str = [](bool f16) {
|
||||
@@ -794,12 +791,10 @@ void process_shaders() {
|
||||
for (auto src0_f16 : {false, true}) {
|
||||
for (auto src1_f16 : {false, true}) {
|
||||
for (auto dst_f16 : {false, true}) {
|
||||
for (auto rte : {false, true}) {
|
||||
auto source = op == "add_rms" ? std::string("add") : op;
|
||||
auto name = op + get_suffix(src0_f16, src1_f16, dst_f16) + (rte ? "_rte" : "");
|
||||
auto name = op + get_suffix(src0_f16, src1_f16, dst_f16);
|
||||
auto add_rms = op == "add_rms" ? "1" : "0";
|
||||
string_to_spv(name.c_str(), source + ".comp", {{"A_TYPE", get_type_str(src0_f16)}, {"B_TYPE", get_type_str(src1_f16)}, {"D_TYPE", get_type_str(dst_f16)}, {"FLOAT_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}, {"ADD_RMS" , add_rms}});
|
||||
}
|
||||
string_to_spv(name.c_str(), source + ".comp", {{"A_TYPE", get_type_str(src0_f16)}, {"B_TYPE", get_type_str(src1_f16)}, {"D_TYPE", get_type_str(dst_f16)}, {"FLOAT_TYPE", "float"}, {"ADD_RMS" , add_rms}});
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -847,14 +842,11 @@ void process_shaders() {
|
||||
|
||||
string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
for (auto rte : {false, true}) {
|
||||
std::string suffix = rte ? "_rte" : "";
|
||||
string_to_spv("exp_f16" + suffix, "exp.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}});
|
||||
string_to_spv("exp_f32" + suffix, "exp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"} , {"RTE16", rte ? "1" : "0"}});
|
||||
string_to_spv("exp_f16", "exp.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("exp_f32", "exp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
string_to_spv("log_f16" + suffix, "log.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}});
|
||||
string_to_spv("log_f32" + suffix, "log.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}});
|
||||
}
|
||||
string_to_spv("log_f16", "log.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("log_f32", "log.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("gelu_f16", "gelu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("gelu_erf_f16", "gelu_erf.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
@@ -908,21 +900,18 @@ void process_shaders() {
|
||||
string_to_spv("trunc_f16", "trunc.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("trunc_f32", "trunc.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
for (auto rte : {false, true}) {
|
||||
std::string suffix = rte ? "_rte" : "";
|
||||
string_to_spv("geglu_f16" + suffix, "geglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}});
|
||||
string_to_spv("geglu_f32" + suffix, "geglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}});
|
||||
string_to_spv("reglu_f16" + suffix, "reglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}});
|
||||
string_to_spv("reglu_f32" + suffix, "reglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}});
|
||||
string_to_spv("swiglu_f16" + suffix, "swiglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}});
|
||||
string_to_spv("swiglu_f32" + suffix, "swiglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}});
|
||||
string_to_spv("swiglu_oai_f16" + suffix, "swiglu_oai.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}});
|
||||
string_to_spv("swiglu_oai_f32" + suffix, "swiglu_oai.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}});
|
||||
string_to_spv("geglu_erf_f16" + suffix, "geglu_erf.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}});
|
||||
string_to_spv("geglu_erf_f32" + suffix, "geglu_erf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}});
|
||||
string_to_spv("geglu_quick_f16" + suffix,"geglu_quick.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}});
|
||||
string_to_spv("geglu_quick_f32" + suffix,"geglu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}});
|
||||
}
|
||||
string_to_spv("geglu_f16", "geglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("geglu_f32", "geglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("reglu_f16", "reglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("reglu_f32", "reglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("swiglu_f16", "swiglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("swiglu_f32", "swiglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("swiglu_oai_f16", "swiglu_oai.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("swiglu_oai_f32", "swiglu_oai.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("geglu_erf_f16", "geglu_erf.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("geglu_erf_f32", "geglu_erf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("geglu_quick_f16","geglu_quick.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("geglu_quick_f32","geglu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("silu_back_f32", "silu_back.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
@@ -942,25 +931,18 @@ void process_shaders() {
|
||||
|
||||
string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}});
|
||||
string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}});
|
||||
string_to_spv("rope_norm_f16_rte", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}});
|
||||
string_to_spv("rope_norm_f32_f16", "rope_norm.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float16_t"}});
|
||||
string_to_spv("rope_norm_f32_f16_rte", "rope_norm.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}});
|
||||
|
||||
string_to_spv("rope_neox_f32", "rope_neox.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}});
|
||||
string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}});
|
||||
string_to_spv("rope_neox_f16_rte", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}});
|
||||
string_to_spv("rope_neox_f32_f16", "rope_neox.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float16_t"}});
|
||||
string_to_spv("rope_neox_f32_f16_rte", "rope_neox.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}});
|
||||
|
||||
string_to_spv("rope_multi_f32", "rope_multi.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}});
|
||||
string_to_spv("rope_multi_f16", "rope_multi.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}});
|
||||
string_to_spv("rope_multi_f16_rte", "rope_multi.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}});
|
||||
string_to_spv("rope_multi_f32_f16", "rope_multi.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float16_t"}});
|
||||
string_to_spv("rope_multi_f32_f16_rte", "rope_multi.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}});
|
||||
|
||||
string_to_spv("rope_vision_f32", "rope_vision.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}});
|
||||
string_to_spv("rope_vision_f16", "rope_vision.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}});
|
||||
string_to_spv("rope_vision_f16_rte", "rope_vision.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}});
|
||||
|
||||
string_to_spv("argsort_f32", "argsort.comp", {{"A_TYPE", "float"}});
|
||||
string_to_spv("argsort_large_f32", "argsort_large.comp", {{"A_TYPE", "float"}});
|
||||
@@ -983,7 +965,6 @@ void process_shaders() {
|
||||
std::string bda_def = bda ? "1" : "0";
|
||||
string_to_spv("im2col" + dim_str + "_f32" + bda_str, "im2col" + dim_str + ".comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"D_SIZE", "4"}, {"BDA", bda_def}}));
|
||||
string_to_spv("im2col" + dim_str + "_f32_f16" + bda_str, "im2col" + dim_str + ".comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"D_SIZE", "2"}, {"BDA", bda_def}}));
|
||||
string_to_spv("im2col" + dim_str + "_f32_f16_rte" + bda_str, "im2col" + dim_str + ".comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"D_SIZE", "2"}, {"RTE16", "1"}, {"BDA", bda_def}}));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1036,8 +1017,8 @@ void process_shaders() {
|
||||
|
||||
string_to_spv("add_id_f32", "add_id.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("multi_add_f32", "multi_add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}, {"ADD_RMS" , "0"}});
|
||||
string_to_spv("multi_add_rms_f32", "multi_add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}, {"ADD_RMS" , "1"}});
|
||||
string_to_spv("multi_add_f32", "multi_add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"ADD_RMS" , "0"}});
|
||||
string_to_spv("multi_add_rms_f32", "multi_add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"ADD_RMS" , "1"}});
|
||||
|
||||
string_to_spv("ssm_scan_f32", "ssm_scan.comp", {{"A_TYPE", "float"}});
|
||||
string_to_spv("ssm_scan_subgroup_f32", "ssm_scan.comp", {{"A_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}});
|
||||
@@ -1090,8 +1071,8 @@ void write_output_files() {
|
||||
|
||||
std::string suffixes[2] = {"_f32", "_f16"};
|
||||
for (std::string op : {"add", "sub", "mul", "div", "add_rms"}) {
|
||||
hdr << "extern const void * " << op << "_data[2][2][2][2];\n";
|
||||
hdr << "extern const uint64_t " << op << "_len[2][2][2][2];\n";
|
||||
hdr << "extern const void * " << op << "_data[2][2][2];\n";
|
||||
hdr << "extern const uint64_t " << op << "_len[2][2][2];\n";
|
||||
|
||||
std::string op_file = op == "add_rms" ? "add.comp" : std::string(op) + ".comp";
|
||||
if (basename(input_filepath) != op_file) {
|
||||
@@ -1099,8 +1080,8 @@ void write_output_files() {
|
||||
}
|
||||
std::stringstream data = make_generic_stringstream();
|
||||
std::stringstream len = make_generic_stringstream();
|
||||
data << "const void * " << op << "_data[2][2][2][2] = ";
|
||||
len << "const uint64_t " << op << "_len[2][2][2][2] = ";
|
||||
data << "const void * " << op << "_data[2][2][2] = ";
|
||||
len << "const uint64_t " << op << "_len[2][2][2] = ";
|
||||
for (uint32_t t0 = 0; t0 < 2; ++t0) {
|
||||
if (t0 == 0) {
|
||||
data << "{";
|
||||
@@ -1116,20 +1097,10 @@ void write_output_files() {
|
||||
data << "{";
|
||||
len << "{";
|
||||
}
|
||||
for (uint32_t rte = 0; rte < 2; ++rte) {
|
||||
if (rte == 0) {
|
||||
data << "{";
|
||||
len << "{";
|
||||
}
|
||||
data << op << suffixes[t0] << suffixes[t1] << suffixes[t2] << ((rte != 0) ? "_rte" : "");
|
||||
len << op << suffixes[t0] << suffixes[t1] << suffixes[t2] << ((rte != 0) ? "_rte" : "");
|
||||
data << "_data,";
|
||||
len << "_len,";
|
||||
if (rte == 1) {
|
||||
data << "}, ";
|
||||
len << "}, ";
|
||||
}
|
||||
}
|
||||
data << op << suffixes[t0] << suffixes[t1] << suffixes[t2];
|
||||
len << op << suffixes[t0] << suffixes[t1] << suffixes[t2];
|
||||
data << "_data,";
|
||||
len << "_len,";
|
||||
if (t2 == 1) {
|
||||
data << "}, ";
|
||||
len << "}, ";
|
||||
|
||||
@@ -3485,7 +3485,7 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
|
||||
dev_desc.requiredFeatureCount = required_features.size();
|
||||
dev_desc.SetDeviceLostCallback(
|
||||
wgpu::CallbackMode::AllowSpontaneous,
|
||||
[ctx](const wgpu::Device & device, wgpu::DeviceLostReason reason, wgpu::StringView message) {
|
||||
[](const wgpu::Device & device, wgpu::DeviceLostReason reason, wgpu::StringView message) {
|
||||
if (reason == wgpu::DeviceLostReason::Destroyed) {
|
||||
return;
|
||||
}
|
||||
|
||||
161
models/templates/Reka-Edge.jinja
Normal file
161
models/templates/Reka-Edge.jinja
Normal file
@@ -0,0 +1,161 @@
|
||||
{%- macro render_content(content, num_img_tokens, num_video_frames) -%}
|
||||
{%- if content is string -%}
|
||||
{{- content -}}
|
||||
{%- elif content is sequence -%}
|
||||
{%- set ns = namespace(out="", prev_was_text=false) -%}
|
||||
{%- for item in content -%}
|
||||
{%- set item_type = item.get("type") -%}
|
||||
{%- if item_type == "text" or item.get("text") is not none -%}
|
||||
{%- set text = item.get("text", "") -%}
|
||||
{%- if text -%}
|
||||
{%- if ns.prev_was_text -%}
|
||||
{%- set ns.out = ns.out ~ " " -%}
|
||||
{%- endif -%}
|
||||
{%- set ns.out = ns.out ~ text -%}
|
||||
{%- endif -%}
|
||||
{%- set ns.prev_was_text = text != "" -%}
|
||||
{%- elif item_type in ["image", "image_url"] or item.get("image") is not none or item.get("image_url") is not none -%}
|
||||
{%- set ns.out = ns.out ~ "<image>" ~ ("<REKA_IMG_TOKEN>" * num_img_tokens) ~ "</image>" -%}
|
||||
{%- set ns.prev_was_text = false -%}
|
||||
{%- elif item_type in ["video", "video_url"] or item.get("video") is not none or item.get("video_url") is not none -%}
|
||||
{%- set repeat_tokens = num_img_tokens * num_video_frames -%}
|
||||
{%- set ns.out = ns.out ~ "<video>" ~ ("<REKA_IMG_TOKEN>" * repeat_tokens) ~ "</video>" -%}
|
||||
{%- set ns.prev_was_text = false -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{{- ns.out -}}
|
||||
{%- endif -%}
|
||||
{%- endmacro -%}
|
||||
{%- set ns = namespace(out="", last_query_index=messages|length - 1) -%}
|
||||
{%- for msg in messages[::-1] -%}
|
||||
{%- set idx = messages|length - 1 - loop.index0 -%}
|
||||
{%- if msg.get("role") == "user" -%}
|
||||
{%- set content = msg.get("content", "") -%}
|
||||
{%- if not (content is string and content.startswith("<tool_response>") and content.endswith("</tool_response>")) -%}
|
||||
{%- set ns.last_query_index = idx -%}
|
||||
{%- break -%}
|
||||
{%- endif -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{%- set last_query_index = ns.last_query_index -%}
|
||||
{%- set num_img_tokens = num_img_tokens | default(64, true) | int -%}
|
||||
{%- set num_video_frames = num_video_frames | default(6, true) | int -%}
|
||||
{%- set start_idx = 0 -%}
|
||||
{%- set system_text = "" -%}
|
||||
{%- if messages|length > 0 and messages[0].get("role") in ["system", "developer"] -%}
|
||||
{%- set system_text = render_content(messages[0].get("content", ""), num_img_tokens, num_video_frames) -%}
|
||||
{%- set start_idx = 1 -%}
|
||||
{%- endif -%}
|
||||
{%- if tools or system_text -%}
|
||||
{%- set preamble_ns = namespace(text="") -%}
|
||||
{%- if system_text -%}
|
||||
{%- set preamble_ns.text = "system: " ~ system_text -%}
|
||||
{%- endif -%}
|
||||
{%- if tools -%}
|
||||
{%- if preamble_ns.text -%}
|
||||
{%- set preamble_ns.text = preamble_ns.text ~ "\n\n" -%}
|
||||
{%- else -%}
|
||||
{%- set preamble_ns.text = "system: " -%}
|
||||
{%- endif -%}
|
||||
{%- set preamble_ns.text = preamble_ns.text
|
||||
~ "# Tools\n\n"
|
||||
~ "You may call one or more functions to assist with the user query.\n\n"
|
||||
~ "You are provided with function signatures within <tools></tools> XML tags:\n"
|
||||
~ "<tools>" -%}
|
||||
{%- for tool in tools -%}
|
||||
{%- set preamble_ns.text = preamble_ns.text ~ "\n" ~ (tool | tojson(ensure_ascii=True)) -%}
|
||||
{%- endfor -%}
|
||||
{%- set preamble_ns.text = preamble_ns.text
|
||||
~ "\n</tools>\n\n"
|
||||
~ "For each function call, return a json object with function name and arguments "
|
||||
~ "within <tool_call></tool_call> XML tags:\n"
|
||||
~ "<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>" -%}
|
||||
{%- endif -%}
|
||||
{%- set ns.out = ns.out ~ preamble_ns.text ~ "\n\n<sep>" -%}
|
||||
{%- endif -%}
|
||||
{%- for idx in range(start_idx, messages|length) -%}
|
||||
{%- set message = messages[idx] -%}
|
||||
{%- set role = message.get("role") -%}
|
||||
{%- set content = message.get("content") -%}
|
||||
{%- if role == "user" -%}
|
||||
{%- set prefix_ns = namespace(value="human: ") -%}
|
||||
{%- if content is sequence and content is not string -%}
|
||||
{%- for item in content -%}
|
||||
{%- if item.get("type") == "text" or item.get("text") is not none -%}
|
||||
{%- set text = item.get("text", "") -%}
|
||||
{%- if text -%}
|
||||
{%- break -%}
|
||||
{%- endif -%}
|
||||
{%- elif item.get("type") in ["image", "image_url", "video", "video_url"] -%}
|
||||
{%- set prefix_ns.value = "human:" -%}
|
||||
{%- break -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{%- endif -%}
|
||||
{%- set ns.out = ns.out ~ prefix_ns.value ~ render_content(content, num_img_tokens, num_video_frames) ~ "<sep>" -%}
|
||||
{%- elif role == "assistant" -%}
|
||||
{%- set tool_calls = message.get("tool_calls") -%}
|
||||
{%- set content_text = render_content(content, num_img_tokens, num_video_frames) -%}
|
||||
{%- set reasoning_text = "" -%}
|
||||
{%- if message.get("reasoning_content") is string -%}
|
||||
{%- set reasoning_text = message.get("reasoning_content") -%}
|
||||
{%- elif "</think>" in content_text -%}
|
||||
{%- set reasoning_text = content_text.split("</think>", 1)[0].rstrip("\n").split("<think>")[-1].lstrip("\n") -%}
|
||||
{%- set content_text = content_text.split("</think>", 1)[1].lstrip("\n") -%}
|
||||
{%- endif -%}
|
||||
{%- set ns.out = ns.out ~ "assistant: " -%}
|
||||
{%- set include_thinking = enable_thinking is true
|
||||
and idx > last_query_index
|
||||
and (idx == messages|length - 1 or reasoning_text)
|
||||
-%}
|
||||
{%- if include_thinking -%}
|
||||
{%- set ns.out = ns.out ~ "<think>\n" ~ (reasoning_text.strip() ) ~ "\n</think>\n\n" -%}
|
||||
{%- endif -%}
|
||||
{%- set ns.out = ns.out ~ content_text -%}
|
||||
{%- if tool_calls -%}
|
||||
{%- if content_text and not ns.out.endswith("\n") -%}
|
||||
{%- set ns.out = ns.out ~ "\n" -%}
|
||||
{%- endif -%}
|
||||
{%- for tool_call in tool_calls -%}
|
||||
{%- if tool_call.get("function") is not none -%}
|
||||
{%- set tool_call = tool_call.get("function") -%}
|
||||
{%- endif -%}
|
||||
{%- set arguments = tool_call.get("arguments", {}) -%}
|
||||
{%- if arguments is string -%}
|
||||
{%- set arguments_json = arguments -%}
|
||||
{%- elif arguments is mapping -%}
|
||||
{%- set arguments_json = arguments | tojson(ensure_ascii=True) -%}
|
||||
{%- else -%}
|
||||
{%- set arguments_json = arguments | tojson(ensure_ascii=True) -%}
|
||||
{%- endif -%}
|
||||
{%- set ns.out = ns.out
|
||||
~ "<tool_call>\n"
|
||||
~ "{\"name\": \"" ~ tool_call.get("name", "") ~ "\", \"arguments\": "
|
||||
~ arguments_json
|
||||
~ "}\n</tool_call>" -%}
|
||||
{%- endfor -%}
|
||||
{%- endif -%}
|
||||
{%- if not (continue_final_message and idx == messages|length - 1) -%}
|
||||
{%- set ns.out = ns.out ~ "\n\n<sep>" -%}
|
||||
{%- endif -%}
|
||||
{%- elif role == "tool" -%}
|
||||
{%- if idx == start_idx or messages[idx - 1].get("role") != "tool" -%}
|
||||
{%- set ns.out = ns.out ~ "human: " -%}
|
||||
{%- endif -%}
|
||||
{%- set response_text = render_content(content, num_img_tokens, num_video_frames) -%}
|
||||
{%- set ns.out = ns.out ~ "<tool_response>\n" ~ response_text ~ "\n</tool_response>" -%}
|
||||
{%- if idx == messages|length - 1 or messages[idx + 1].get("role") != "tool" -%}
|
||||
{%- set ns.out = ns.out ~ "<sep>" -%}
|
||||
{%- endif -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{%- if add_generation_prompt
|
||||
and (messages|length == 0 or messages[-1].get("role") != "assistant")
|
||||
-%}
|
||||
{%- if enable_thinking is true -%}
|
||||
{%- set ns.out = ns.out ~ "assistant: <think>\n" -%}
|
||||
{%- else -%}
|
||||
{%- set ns.out = ns.out ~ "assistant:" -%}
|
||||
{%- endif -%}
|
||||
{%- endif -%}
|
||||
{{- ns.out -}}
|
||||
174
scripts/gen-libllama-abi.py
Normal file
174
scripts/gen-libllama-abi.py
Normal file
@@ -0,0 +1,174 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Extract LLAMA_API function signatures from include/llama.h.
|
||||
|
||||
Outputs one normalized signature per line, sorted alphabetically by function
|
||||
name. The result is suitable for committing as scripts/libllama.abi and for
|
||||
diffing in CI to detect ABI changes.
|
||||
|
||||
Usage:
|
||||
python3 scripts/gen-libllama-abi.py [path/to/llama.h]
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
|
||||
|
||||
def preprocess(text: str) -> str:
|
||||
"""Strip comments and preprocessor directives, keeping newlines for
|
||||
accurate error reporting (we don't use line numbers here but it keeps
|
||||
the character offsets meaningful for debugging)."""
|
||||
|
||||
# Remove /* ... */ block comments (may span lines).
|
||||
text = re.sub(r'/\*.*?\*/', lambda m: '\n' * m.group().count('\n'), text, flags=re.DOTALL)
|
||||
|
||||
# Remove // ... line comments (keep the newline).
|
||||
text = re.sub(r'//[^\n]*', '', text)
|
||||
|
||||
# Remove preprocessor directive lines (lines where the first non-space
|
||||
# char is '#'). Replace with blank lines to preserve offsets.
|
||||
lines = text.splitlines(keepends=True)
|
||||
result = []
|
||||
for line in lines:
|
||||
if line.lstrip().startswith('#'):
|
||||
result.append('\n' * line.count('\n'))
|
||||
else:
|
||||
result.append(line)
|
||||
return ''.join(result)
|
||||
|
||||
|
||||
def normalize(s: str) -> str:
|
||||
"""Collapse all whitespace runs to a single space and strip edges."""
|
||||
return re.sub(r'\s+', ' ', s).strip()
|
||||
|
||||
|
||||
def extract_signatures(header_text: str) -> list[str]:
|
||||
"""Return a sorted list of normalized LLAMA_API function signatures."""
|
||||
|
||||
text = preprocess(header_text)
|
||||
sigs: list[str] = []
|
||||
|
||||
i = 0
|
||||
n = len(text)
|
||||
|
||||
while i < n:
|
||||
# Find the next LLAMA_API token.
|
||||
pos = text.find('LLAMA_API', i)
|
||||
if pos == -1:
|
||||
break
|
||||
i = pos + len('LLAMA_API')
|
||||
|
||||
# Skip leading whitespace after LLAMA_API.
|
||||
while i < n and text[i] in ' \t\r\n':
|
||||
i += 1
|
||||
|
||||
# Determine whether we are inside DEPRECATED(...).
|
||||
#
|
||||
# Case A: DEPRECATED(LLAMA_API ..., "hint");
|
||||
# – look back before the LLAMA_API token for 'DEPRECATED('
|
||||
# Case B: LLAMA_API DEPRECATED(return_type func(...), "hint");
|
||||
# – look forward for 'DEPRECATED('
|
||||
|
||||
# Case A: look back (skip whitespace) for 'DEPRECATED('
|
||||
before = text[:pos].rstrip()
|
||||
in_deprecated_wrap = before.endswith('DEPRECATED(')
|
||||
|
||||
if in_deprecated_wrap:
|
||||
# We are the argument list of DEPRECATED(LLAMA_API ..., "hint");
|
||||
# Collect everything until the matching ')' that closes DEPRECATED,
|
||||
# then strip the trailing , "hint" part.
|
||||
depth = 1 # we just entered DEPRECATED(
|
||||
start = i # start of "return_type func_name(..."
|
||||
j = i
|
||||
while j < n and depth > 0:
|
||||
if text[j] == '(':
|
||||
depth += 1
|
||||
elif text[j] == ')':
|
||||
depth -= 1
|
||||
j += 1
|
||||
# text[start:j-1] is everything inside DEPRECATED(...).
|
||||
# We need the function signature part, which ends at the last
|
||||
# top-level comma (separating the function from the "hint" string).
|
||||
inner = text[start:j - 1]
|
||||
# Find the last top-level comma.
|
||||
depth2 = 0
|
||||
last_comma = -1
|
||||
for k, ch in enumerate(inner):
|
||||
if ch == '(':
|
||||
depth2 += 1
|
||||
elif ch == ')':
|
||||
depth2 -= 1
|
||||
elif ch == ',' and depth2 == 0:
|
||||
last_comma = k
|
||||
sig_raw = inner[:last_comma] if last_comma != -1 else inner
|
||||
i = j
|
||||
elif text[i:i + len('DEPRECATED(')] == 'DEPRECATED(':
|
||||
# Case B: LLAMA_API DEPRECATED(return_type func(...), "hint");
|
||||
i += len('DEPRECATED(')
|
||||
depth = 1
|
||||
start = i
|
||||
j = i
|
||||
while j < n and depth > 0:
|
||||
if text[j] == '(':
|
||||
depth += 1
|
||||
elif text[j] == ')':
|
||||
depth -= 1
|
||||
j += 1
|
||||
inner = text[start:j - 1]
|
||||
depth2 = 0
|
||||
last_comma = -1
|
||||
for k, ch in enumerate(inner):
|
||||
if ch == '(':
|
||||
depth2 += 1
|
||||
elif ch == ')':
|
||||
depth2 -= 1
|
||||
elif ch == ',' and depth2 == 0:
|
||||
last_comma = k
|
||||
sig_raw = inner[:last_comma] if last_comma != -1 else inner
|
||||
i = j
|
||||
else:
|
||||
# Plain: LLAMA_API return_type func_name(...);
|
||||
# Collect until the ';' at parenthesis depth 0.
|
||||
depth = 0
|
||||
start = i
|
||||
j = i
|
||||
while j < n:
|
||||
if text[j] == '(':
|
||||
depth += 1
|
||||
elif text[j] == ')':
|
||||
depth -= 1
|
||||
if depth == 0:
|
||||
j += 1
|
||||
break
|
||||
elif text[j] == ';' and depth == 0:
|
||||
break
|
||||
j += 1
|
||||
sig_raw = text[start:j]
|
||||
# Advance past the ';'
|
||||
i = j
|
||||
while i < n and text[i] in ' \t\r\n;':
|
||||
i += 1
|
||||
|
||||
sig = normalize(sig_raw)
|
||||
if sig and '(' in sig:
|
||||
sigs.append(sig)
|
||||
|
||||
_name_re = re.compile(r'\b(llama_\w+)\s*\(')
|
||||
|
||||
def _sort_key(s: str) -> str:
|
||||
m = _name_re.search(s)
|
||||
return m.group(1) if m else s
|
||||
|
||||
sigs.sort(key=_sort_key)
|
||||
return sigs
|
||||
|
||||
|
||||
def main() -> None:
|
||||
header_path = sys.argv[1] if len(sys.argv) > 1 else 'include/llama.h'
|
||||
with open(header_path, encoding='utf-8') as f:
|
||||
text = f.read()
|
||||
for sig in extract_signatures(text):
|
||||
print(sig)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
233
scripts/libllama.abi
Normal file
233
scripts/libllama.abi
Normal file
@@ -0,0 +1,233 @@
|
||||
const llama_token * llama_adapter_get_alora_invocation_tokens (const struct llama_adapter_lora * adapter)
|
||||
uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter)
|
||||
void llama_adapter_lora_free(struct llama_adapter_lora * adapter)
|
||||
struct llama_adapter_lora * llama_adapter_lora_init( struct llama_model * model, const char * path_lora)
|
||||
int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter)
|
||||
int32_t llama_adapter_meta_key_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size)
|
||||
int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size)
|
||||
int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size)
|
||||
bool llama_add_bos_token(const struct llama_vocab * vocab)
|
||||
bool llama_add_eos_token(const struct llama_vocab * vocab)
|
||||
void llama_attach_threadpool( struct llama_context * ctx, ggml_threadpool_t threadpool, ggml_threadpool_t threadpool_batch)
|
||||
void llama_backend_free(void)
|
||||
void llama_backend_init(void)
|
||||
void llama_batch_free(struct llama_batch batch)
|
||||
struct llama_batch llama_batch_get_one( llama_token * tokens, int32_t n_tokens)
|
||||
struct llama_batch llama_batch_init( int32_t n_tokens, int32_t embd, int32_t n_seq_max)
|
||||
int32_t llama_chat_apply_template( const char * tmpl, const struct llama_chat_message * chat, size_t n_msg, bool add_ass, char * buf, int32_t length)
|
||||
int32_t llama_chat_builtin_templates(const char ** output, size_t len)
|
||||
struct llama_context_params llama_context_default_params(void)
|
||||
size_t llama_copy_state_data( struct llama_context * ctx, uint8_t * dst)
|
||||
int32_t llama_decode( struct llama_context * ctx, struct llama_batch batch)
|
||||
void llama_detach_threadpool(struct llama_context * ctx)
|
||||
int32_t llama_detokenize( const struct llama_vocab * vocab, const llama_token * tokens, int32_t n_tokens, char * text, int32_t text_len_max, bool remove_special, bool unparse_special)
|
||||
int32_t llama_encode( struct llama_context * ctx, struct llama_batch batch)
|
||||
const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type)
|
||||
void llama_free(struct llama_context * ctx)
|
||||
void llama_free_model(struct llama_model * model)
|
||||
float * llama_get_embeddings(struct llama_context * ctx)
|
||||
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i)
|
||||
float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id)
|
||||
float * llama_get_logits(struct llama_context * ctx)
|
||||
float * llama_get_logits_ith(struct llama_context * ctx, int32_t i)
|
||||
llama_memory_t llama_get_memory (const struct llama_context * ctx)
|
||||
const struct llama_model * llama_get_model (const struct llama_context * ctx)
|
||||
uint32_t llama_get_sampled_candidates_count_ith(struct llama_context * ctx, int32_t i)
|
||||
llama_token * llama_get_sampled_candidates_ith (struct llama_context * ctx, int32_t i)
|
||||
uint32_t llama_get_sampled_logits_count_ith(struct llama_context * ctx, int32_t i)
|
||||
float * llama_get_sampled_logits_ith (struct llama_context * ctx, int32_t i)
|
||||
uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i)
|
||||
float * llama_get_sampled_probs_ith (struct llama_context * ctx, int32_t i)
|
||||
llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i)
|
||||
size_t llama_get_state_size(struct llama_context * ctx)
|
||||
struct llama_context * llama_init_from_model( struct llama_model * model, struct llama_context_params params)
|
||||
struct llama_model * llama_load_model_from_file( const char * path_model, struct llama_model_params params)
|
||||
bool llama_load_session_file( struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out)
|
||||
void llama_log_get(ggml_log_callback * log_callback, void ** user_data)
|
||||
void llama_log_set(ggml_log_callback log_callback, void * user_data)
|
||||
size_t llama_max_devices(void)
|
||||
size_t llama_max_parallel_sequences(void)
|
||||
size_t llama_max_tensor_buft_overrides(void)
|
||||
void llama_memory_breakdown_print(const struct llama_context * ctx)
|
||||
bool llama_memory_can_shift(llama_memory_t mem)
|
||||
void llama_memory_clear( llama_memory_t mem, bool data)
|
||||
void llama_memory_seq_add( llama_memory_t mem, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta)
|
||||
void llama_memory_seq_cp( llama_memory_t mem, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1)
|
||||
void llama_memory_seq_div( llama_memory_t mem, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d)
|
||||
void llama_memory_seq_keep( llama_memory_t mem, llama_seq_id seq_id)
|
||||
llama_pos llama_memory_seq_pos_max( llama_memory_t mem, llama_seq_id seq_id)
|
||||
llama_pos llama_memory_seq_pos_min( llama_memory_t mem, llama_seq_id seq_id)
|
||||
bool llama_memory_seq_rm( llama_memory_t mem, llama_seq_id seq_id, llama_pos p0, llama_pos p1)
|
||||
const char * llama_model_chat_template(const struct llama_model * model, const char * name)
|
||||
const char * llama_model_cls_label(const struct llama_model * model, uint32_t i)
|
||||
llama_token llama_model_decoder_start_token(const struct llama_model * model)
|
||||
struct llama_model_params llama_model_default_params(void)
|
||||
int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size)
|
||||
void llama_model_free(struct llama_model * model)
|
||||
const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model)
|
||||
bool llama_model_has_decoder(const struct llama_model * model)
|
||||
bool llama_model_has_encoder(const struct llama_model * model)
|
||||
struct llama_model * llama_model_init_from_user( struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud, struct llama_model_params params)
|
||||
bool llama_model_is_diffusion(const struct llama_model * model)
|
||||
bool llama_model_is_hybrid(const struct llama_model * model)
|
||||
bool llama_model_is_recurrent(const struct llama_model * model)
|
||||
struct llama_model * llama_model_load_from_file( const char * path_model, struct llama_model_params params)
|
||||
struct llama_model * llama_model_load_from_file_ptr( FILE * file, struct llama_model_params params)
|
||||
struct llama_model * llama_model_load_from_splits( const char ** paths, size_t n_paths, struct llama_model_params params)
|
||||
int32_t llama_model_meta_count(const struct llama_model * model)
|
||||
int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size)
|
||||
const char * llama_model_meta_key_str(enum llama_model_meta_key key)
|
||||
int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size)
|
||||
int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size)
|
||||
uint32_t llama_model_n_cls_out(const struct llama_model * model)
|
||||
int32_t llama_model_n_ctx_train(const struct llama_model * model)
|
||||
int32_t llama_model_n_embd (const struct llama_model * model)
|
||||
int32_t llama_model_n_embd_inp (const struct llama_model * model)
|
||||
int32_t llama_model_n_embd_out (const struct llama_model * model)
|
||||
int32_t llama_model_n_head (const struct llama_model * model)
|
||||
int32_t llama_model_n_head_kv (const struct llama_model * model)
|
||||
int32_t llama_model_n_layer (const struct llama_model * model)
|
||||
uint64_t llama_model_n_params(const struct llama_model * model)
|
||||
int32_t llama_model_n_swa (const struct llama_model * model)
|
||||
uint32_t llama_model_quantize( const char * fname_inp, const char * fname_out, const llama_model_quantize_params * params)
|
||||
struct llama_model_quantize_params llama_model_quantize_default_params(void)
|
||||
float llama_model_rope_freq_scale_train(const struct llama_model * model)
|
||||
enum llama_rope_type llama_model_rope_type(const struct llama_model * model)
|
||||
void llama_model_save_to_file( const struct llama_model * model, const char * path_model)
|
||||
uint64_t llama_model_size(const struct llama_model * model)
|
||||
uint32_t llama_n_batch (const struct llama_context * ctx)
|
||||
uint32_t llama_n_ctx (const struct llama_context * ctx)
|
||||
uint32_t llama_n_ctx_seq (const struct llama_context * ctx)
|
||||
int32_t llama_n_ctx_train(const struct llama_model * model)
|
||||
int32_t llama_n_embd (const struct llama_model * model)
|
||||
int32_t llama_n_head (const struct llama_model * model)
|
||||
int32_t llama_n_layer (const struct llama_model * model)
|
||||
uint32_t llama_n_seq_max (const struct llama_context * ctx)
|
||||
int32_t llama_n_threads(struct llama_context * ctx)
|
||||
int32_t llama_n_threads_batch(struct llama_context * ctx)
|
||||
uint32_t llama_n_ubatch (const struct llama_context * ctx)
|
||||
int32_t llama_n_vocab (const struct llama_vocab * vocab)
|
||||
struct llama_context * llama_new_context_with_model( struct llama_model * model, struct llama_context_params params)
|
||||
void llama_numa_init(enum ggml_numa_strategy numa)
|
||||
void llama_opt_epoch( struct llama_context * lctx, ggml_opt_dataset_t dataset, ggml_opt_result_t result_train, ggml_opt_result_t result_eval, int64_t idata_split, ggml_opt_epoch_callback callback_train, ggml_opt_epoch_callback callback_eval)
|
||||
void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params)
|
||||
bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata)
|
||||
enum llama_params_fit_status llama_params_fit( const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams, float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides, size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level)
|
||||
struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx)
|
||||
void llama_perf_context_print(const struct llama_context * ctx)
|
||||
void llama_perf_context_reset( struct llama_context * ctx)
|
||||
struct llama_perf_sampler_data llama_perf_sampler (const struct llama_sampler * chain)
|
||||
void llama_perf_sampler_print(const struct llama_sampler * chain)
|
||||
void llama_perf_sampler_reset( struct llama_sampler * chain)
|
||||
enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx)
|
||||
const char * llama_print_system_info(void)
|
||||
void llama_sampler_accept( struct llama_sampler * smpl, llama_token token)
|
||||
void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p)
|
||||
void llama_sampler_chain_add( struct llama_sampler * chain, struct llama_sampler * smpl)
|
||||
struct llama_sampler_chain_params llama_sampler_chain_default_params(void)
|
||||
struct llama_sampler * llama_sampler_chain_get( struct llama_sampler * chain, int32_t i)
|
||||
struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params)
|
||||
int llama_sampler_chain_n (const struct llama_sampler * chain)
|
||||
struct llama_sampler * llama_sampler_chain_remove( struct llama_sampler * chain, int32_t i)
|
||||
struct llama_sampler * llama_sampler_clone (const struct llama_sampler * smpl)
|
||||
void llama_sampler_free ( struct llama_sampler * smpl)
|
||||
uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl)
|
||||
struct llama_sampler * llama_sampler_init ( struct llama_sampler_i * iface, llama_sampler_context_t ctx)
|
||||
struct llama_sampler * llama_sampler_init_adaptive_p( float target, float decay, uint32_t seed)
|
||||
struct llama_sampler * llama_sampler_init_dist(uint32_t seed)
|
||||
struct llama_sampler * llama_sampler_init_dry( const struct llama_vocab * vocab, int32_t n_ctx_train, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char ** seq_breakers, size_t num_breakers)
|
||||
struct llama_sampler * llama_sampler_init_grammar( const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root)
|
||||
struct llama_sampler * llama_sampler_init_grammar_lazy( const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root, const char ** trigger_words, size_t num_trigger_words, const llama_token * trigger_tokens, size_t num_trigger_tokens)
|
||||
struct llama_sampler * llama_sampler_init_grammar_lazy_patterns( const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root, const char ** trigger_patterns, size_t num_trigger_patterns, const llama_token * trigger_tokens, size_t num_trigger_tokens)
|
||||
struct llama_sampler * llama_sampler_init_greedy(void)
|
||||
struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab)
|
||||
struct llama_sampler * llama_sampler_init_logit_bias( int32_t n_vocab, int32_t n_logit_bias, const llama_logit_bias * logit_bias)
|
||||
struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep)
|
||||
struct llama_sampler * llama_sampler_init_mirostat( int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m)
|
||||
struct llama_sampler * llama_sampler_init_mirostat_v2( uint32_t seed, float tau, float eta)
|
||||
struct llama_sampler * llama_sampler_init_penalties( int32_t penalty_last_n, float penalty_repeat, float penalty_freq, float penalty_present)
|
||||
struct llama_sampler * llama_sampler_init_temp (float t)
|
||||
struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent)
|
||||
struct llama_sampler * llama_sampler_init_top_k (int32_t k)
|
||||
struct llama_sampler * llama_sampler_init_top_n_sigma(float n)
|
||||
struct llama_sampler * llama_sampler_init_top_p (float p, size_t min_keep)
|
||||
struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep)
|
||||
struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed)
|
||||
const char * llama_sampler_name (const struct llama_sampler * smpl)
|
||||
void llama_sampler_reset ( struct llama_sampler * smpl)
|
||||
llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx)
|
||||
bool llama_save_session_file( struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count)
|
||||
void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data)
|
||||
int32_t llama_set_adapter_cvec( struct llama_context * ctx, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end)
|
||||
int32_t llama_set_adapters_lora( struct llama_context * ctx, struct llama_adapter_lora ** adapters, size_t n_adapters, float * scales)
|
||||
void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn)
|
||||
void llama_set_embeddings(struct llama_context * ctx, bool embeddings)
|
||||
void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch)
|
||||
bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl)
|
||||
size_t llama_set_state_data( struct llama_context * ctx, const uint8_t * src)
|
||||
void llama_set_warmup(struct llama_context * ctx, bool warmup)
|
||||
int32_t llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int32_t split_no, int32_t split_count)
|
||||
int32_t llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int32_t split_no, int32_t split_count)
|
||||
size_t llama_state_get_data( struct llama_context * ctx, uint8_t * dst, size_t size)
|
||||
size_t llama_state_get_size(struct llama_context * ctx)
|
||||
bool llama_state_load_file( struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out)
|
||||
bool llama_state_save_file( struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count)
|
||||
size_t llama_state_seq_get_data( struct llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id)
|
||||
size_t llama_state_seq_get_data_ext( struct llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags)
|
||||
size_t llama_state_seq_get_size( struct llama_context * ctx, llama_seq_id seq_id)
|
||||
size_t llama_state_seq_get_size_ext( struct llama_context * ctx, llama_seq_id seq_id, llama_state_seq_flags flags)
|
||||
size_t llama_state_seq_load_file( struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out)
|
||||
size_t llama_state_seq_save_file( struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count)
|
||||
size_t llama_state_seq_set_data( struct llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id dest_seq_id)
|
||||
size_t llama_state_seq_set_data_ext( struct llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id dest_seq_id, llama_state_seq_flags flags)
|
||||
size_t llama_state_set_data( struct llama_context * ctx, const uint8_t * src, size_t size)
|
||||
bool llama_supports_gpu_offload(void)
|
||||
bool llama_supports_mlock (void)
|
||||
bool llama_supports_mmap (void)
|
||||
bool llama_supports_rpc (void)
|
||||
void llama_synchronize(struct llama_context * ctx)
|
||||
int64_t llama_time_us(void)
|
||||
llama_token llama_token_bos(const struct llama_vocab * vocab)
|
||||
llama_token llama_token_cls(const struct llama_vocab * vocab)
|
||||
llama_token llama_token_eos(const struct llama_vocab * vocab)
|
||||
llama_token llama_token_eot(const struct llama_vocab * vocab)
|
||||
llama_token llama_token_fim_mid(const struct llama_vocab * vocab)
|
||||
llama_token llama_token_fim_pad(const struct llama_vocab * vocab)
|
||||
llama_token llama_token_fim_pre(const struct llama_vocab * vocab)
|
||||
llama_token llama_token_fim_rep(const struct llama_vocab * vocab)
|
||||
llama_token llama_token_fim_sep(const struct llama_vocab * vocab)
|
||||
llama_token llama_token_fim_suf(const struct llama_vocab * vocab)
|
||||
enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token)
|
||||
float llama_token_get_score(const struct llama_vocab * vocab, llama_token token)
|
||||
const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token)
|
||||
bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token)
|
||||
bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token)
|
||||
llama_token llama_token_nl (const struct llama_vocab * vocab)
|
||||
llama_token llama_token_pad(const struct llama_vocab * vocab)
|
||||
llama_token llama_token_sep(const struct llama_vocab * vocab)
|
||||
int32_t llama_token_to_piece( const struct llama_vocab * vocab, llama_token token, char * buf, int32_t length, int32_t lstrip, bool special)
|
||||
int32_t llama_tokenize( const struct llama_vocab * vocab, const char * text, int32_t text_len, llama_token * tokens, int32_t n_tokens_max, bool add_special, bool parse_special)
|
||||
llama_token llama_vocab_bos(const struct llama_vocab * vocab)
|
||||
llama_token llama_vocab_cls(const struct llama_vocab * vocab)
|
||||
llama_token llama_vocab_eos(const struct llama_vocab * vocab)
|
||||
llama_token llama_vocab_eot(const struct llama_vocab * vocab)
|
||||
llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab)
|
||||
llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab)
|
||||
llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab)
|
||||
llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab)
|
||||
llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab)
|
||||
llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab)
|
||||
bool llama_vocab_get_add_bos(const struct llama_vocab * vocab)
|
||||
bool llama_vocab_get_add_eos(const struct llama_vocab * vocab)
|
||||
bool llama_vocab_get_add_sep(const struct llama_vocab * vocab)
|
||||
enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token)
|
||||
float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token)
|
||||
const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token)
|
||||
bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token)
|
||||
bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token)
|
||||
llama_token llama_vocab_mask(const struct llama_vocab * vocab)
|
||||
int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab)
|
||||
llama_token llama_vocab_nl (const struct llama_vocab * vocab)
|
||||
llama_token llama_vocab_pad(const struct llama_vocab * vocab)
|
||||
llama_token llama_vocab_sep(const struct llama_vocab * vocab)
|
||||
enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab)
|
||||
@@ -153,7 +153,7 @@ add_library(llama
|
||||
|
||||
set_target_properties(llama PROPERTIES
|
||||
VERSION ${LLAMA_INSTALL_VERSION}
|
||||
SOVERSION 0
|
||||
SOVERSION ${LLAMA_VERSION_MAJOR}
|
||||
MACHO_CURRENT_VERSION 0 # keep macOS linker from seeing oversized version number
|
||||
)
|
||||
|
||||
|
||||
@@ -18,9 +18,6 @@
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
// TODO: tmp until the ggml meta backend matures and becomes public
|
||||
#include "../src/ggml-ext.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cfloat>
|
||||
|
||||
@@ -15,9 +15,6 @@
|
||||
#include "ggml-backend.h"
|
||||
#include "gguf.h"
|
||||
|
||||
// TODO: tmp until the ggml meta backend matures and becomes public
|
||||
#include "../src/ggml-ext.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
|
||||
@@ -8506,6 +8506,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, { 20481, 4, 1, 1 }));
|
||||
|
||||
test_cases.emplace_back(new test_xielu());
|
||||
test_cases.emplace_back(new test_xielu(GGML_TYPE_F16));
|
||||
test_cases.emplace_back(new test_xielu(GGML_TYPE_F32, { 512, 16, 1, 1 }));
|
||||
test_cases.emplace_back(new test_xielu(GGML_TYPE_F16, { 512, 16, 1, 1 }));
|
||||
|
||||
test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER));
|
||||
test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER_DIAG));
|
||||
|
||||
@@ -2164,7 +2164,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
|
||||
tst.test(
|
||||
"<tool_call>\n"
|
||||
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
|
||||
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}"
|
||||
"</tool_call>")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
@@ -2172,7 +2172,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
|
||||
tst.test(
|
||||
"Hello, world!\nWhat's up?<tool_call>\n"
|
||||
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
|
||||
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}"
|
||||
"</tool_call>")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call_content)
|
||||
@@ -3329,6 +3329,92 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.run();
|
||||
}
|
||||
|
||||
// Reka-Edge tests - uses native JSON format with per-call wrapper
|
||||
{
|
||||
auto tst = peg_tester("models/templates/Reka-Edge.jinja", detailed_debug);
|
||||
|
||||
// Basic content only
|
||||
tst.test("Hello, world!\nWhat's up?").enable_thinking(false).expect(message_assist).run();
|
||||
|
||||
// Single tool call without reasoning
|
||||
tst.test("<tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}</tool_call>")
|
||||
.enable_thinking(false)
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.run();
|
||||
|
||||
// Tool call with string argument
|
||||
tst.test("<tool_call>\n{\"name\": \"get_time\", \"arguments\": {\"city\": \"XYZCITY\"}}</tool_call>")
|
||||
.enable_thinking(false)
|
||||
.tools({ get_time_tool })
|
||||
.expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}"))
|
||||
.run();
|
||||
|
||||
// Tool call with reasoning (enable_thinking=true)
|
||||
tst.test("I'm\nthinking</think><tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}</tool_call>")
|
||||
.enable_thinking(true)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call_thoughts)
|
||||
.run();
|
||||
|
||||
// Multiple tool calls (parallel)
|
||||
tst.test(
|
||||
"<tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}</tool_call>"
|
||||
"<tool_call>\n{\"name\": \"special_function_with_opt\", \"arguments\": {\"arg1\": 1, \"arg2\": 2}}</tool_call>"
|
||||
)
|
||||
.enable_thinking(false)
|
||||
.parallel_tool_calls(true)
|
||||
.tools({
|
||||
special_function_tool, special_function_tool_with_optional_param
|
||||
})
|
||||
.expect_tool_calls({
|
||||
{ "special_function", R"({"arg1": 1})", {} },
|
||||
{ "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
|
||||
})
|
||||
.run();
|
||||
|
||||
// Tool call with reasoning and content
|
||||
tst.test("I need to call a function</think>"
|
||||
"Let me check the time.<tool_call>\n{\"name\": \"get_time\", \"arguments\": {\"city\": \"XYZCITY\"}}</tool_call>")
|
||||
.enable_thinking(true)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.tools({ get_time_tool })
|
||||
.expect(message_with_reasoning_content_and_multiple_tool_calls(
|
||||
"I need to call a function", "Let me check the time.", { { "get_time", "{\"city\":\"XYZCITY\"}" } }
|
||||
))
|
||||
.run();
|
||||
|
||||
// Partial tool call (streaming)
|
||||
tst.test("<tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\":")
|
||||
.tools({ special_function_tool })
|
||||
.enable_thinking(false)
|
||||
.is_partial(true)
|
||||
.expect(simple_assist_msg("", "", "special_function", "{\"arg1\": "))
|
||||
.run();
|
||||
|
||||
// Tool call with empty arguments
|
||||
tst.test("<tool_call>\n{\"name\": \"empty_args\", \"arguments\": {}}</tool_call>")
|
||||
.enable_thinking(false)
|
||||
.tools({ empty_args_tool })
|
||||
.expect(simple_assist_msg("", "", "empty_args", "{}"))
|
||||
.run();
|
||||
|
||||
// fake tool call marker in reasoning
|
||||
tst.test(
|
||||
"Let me think about <tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 2}}</tool_call> hmm</think>"
|
||||
"<tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}</tool_call>")
|
||||
.enable_thinking(true)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.tools({ special_function_tool })
|
||||
.expect_reasoning("Let me think about <tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 2}}</tool_call> hmm")
|
||||
.expect_tool_calls({
|
||||
{ "special_function", R"({"arg1": 1})", {} },
|
||||
})
|
||||
.run();
|
||||
}
|
||||
|
||||
|
||||
// Apertus-8B-Instruct tests - FUNC_NAME_AS_KEY format
|
||||
// Format: <|tools_prefix|>[{"function_name": {...arguments...}}]<|tools_suffix|>
|
||||
{
|
||||
|
||||
@@ -41,8 +41,10 @@ int main(void) {
|
||||
} else if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
const mtmd_image_tokens * image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
|
||||
size_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
|
||||
size_t nx = mtmd_image_tokens_get_nx(image_tokens);
|
||||
size_t ny = mtmd_image_tokens_get_ny(image_tokens);
|
||||
// get position of the last token, which should be (nx - 1, ny - 1)
|
||||
struct mtmd_decoder_pos pos = mtmd_image_tokens_get_decoder_pos(image_tokens, n_tokens - 1);
|
||||
size_t nx = pos.x + 1;
|
||||
size_t ny = pos.y + 1;
|
||||
const char * id = mtmd_image_tokens_get_id(image_tokens);
|
||||
assert(n_tokens > 0);
|
||||
assert(nx > 0);
|
||||
|
||||
@@ -114,6 +114,13 @@ llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
|
||||
return n_pos;
|
||||
}
|
||||
|
||||
void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * chunks, mtmd_decoder_pos * out_pos) {
|
||||
size_t n_tokens = mtmd_image_tokens_get_n_tokens(chunks);
|
||||
for (size_t i = 0; i < n_tokens; i++) {
|
||||
out_pos[i] = mtmd_image_tokens_get_decoder_pos(chunks, i);
|
||||
}
|
||||
}
|
||||
|
||||
// helper struct to make working with embd batch easier
|
||||
// note: this will be removed after llama_batch_ext refactoring
|
||||
struct decode_embd_batch {
|
||||
@@ -156,18 +163,15 @@ struct decode_embd_batch {
|
||||
}
|
||||
|
||||
// M-RoPE for image
|
||||
void set_position_mrope_2d(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
|
||||
void set_position_mrope_2d(llama_pos pos_0, const std::vector<mtmd_decoder_pos> & rel_pos, llama_seq_id seq_id) {
|
||||
GGML_ASSERT(n_pos_per_embd == 4);
|
||||
GGML_ASSERT(nx > 0 && ny > 0 && nx * ny == batch.n_tokens);
|
||||
GGML_ASSERT(!rel_pos.empty() && (int32_t)rel_pos.size() == batch.n_tokens);
|
||||
seq_id_0[0] = seq_id;
|
||||
for (int y = 0; y < ny; y++) {
|
||||
for (int x = 0; x < nx; x++) {
|
||||
int i = y * nx + x;
|
||||
pos[i ] = pos_0;
|
||||
pos[i + batch.n_tokens ] = pos_0 + y;
|
||||
pos[i + batch.n_tokens * 2] = pos_0 + x;
|
||||
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
|
||||
}
|
||||
for (int32_t i = 0; i < batch.n_tokens; i++) {
|
||||
pos[i ] = pos_0 + rel_pos[i].t;
|
||||
pos[i + batch.n_tokens ] = pos_0 + rel_pos[i].y;
|
||||
pos[i + batch.n_tokens * 2] = pos_0 + rel_pos[i].x;
|
||||
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
|
||||
}
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
batch.n_seq_id[i] = 1;
|
||||
@@ -262,9 +266,10 @@ int32_t mtmd_helper_decode_image_chunk(
|
||||
LOG_ERR("failed to decode chunk: image tokens are null\n");
|
||||
return -1;
|
||||
}
|
||||
const int nx = mtmd_image_tokens_get_nx(image_tokens);
|
||||
const int ny = mtmd_image_tokens_get_ny(image_tokens);
|
||||
batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id);
|
||||
const auto n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
|
||||
std::vector<mtmd_decoder_pos> rel_pos(n_tokens);
|
||||
mtmd_helper_image_get_decoder_pos(image_tokens, rel_pos.data());
|
||||
batch_embd.set_position_mrope_2d(n_past, rel_pos, seq_id);
|
||||
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
||||
batch_embd.set_position_mrope_1d(n_past, seq_id);
|
||||
} else {
|
||||
|
||||
@@ -47,6 +47,10 @@ MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
|
||||
// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
|
||||
MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
|
||||
|
||||
// helper to get the list of relative positions corresponding to the embedding tokens, to be used by M-RoPE
|
||||
// out_pos must have length == mtmd_helper_get_n_tokens(image)
|
||||
MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image, mtmd_decoder_pos * out_pos);
|
||||
|
||||
// helper function that automatically:
|
||||
// 1. run llama_decode() on text chunks
|
||||
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
|
||||
|
||||
@@ -1249,6 +1249,14 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
|
||||
return image_tokens->ny;
|
||||
}
|
||||
|
||||
mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, size_t i) {
|
||||
mtmd_decoder_pos pos;
|
||||
pos.t = 0;
|
||||
pos.x = i % image_tokens->nx;
|
||||
pos.y = i / image_tokens->nx;
|
||||
return pos;
|
||||
}
|
||||
|
||||
const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
|
||||
return image_tokens->id.c_str();
|
||||
}
|
||||
|
||||
@@ -186,12 +186,25 @@ MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk);
|
||||
// the instance will be constructed via mtmd_tokenize()
|
||||
// it will be freed along with mtmd_input_chunk
|
||||
MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); // TODO: deprecate
|
||||
MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens);
|
||||
MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens);
|
||||
MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); // TODO: deprecate
|
||||
// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
|
||||
MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate
|
||||
|
||||
DEPRECATED(MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens),
|
||||
"use mtmd_image_tokens_get_decoder_pos() instead");
|
||||
DEPRECATED(MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens),
|
||||
"use mtmd_image_tokens_get_decoder_pos() instead");
|
||||
|
||||
struct mtmd_decoder_pos {
|
||||
uint32_t t;
|
||||
uint32_t x;
|
||||
uint32_t y;
|
||||
};
|
||||
// get position for decoder attention, to be used by M-RoPE models
|
||||
// i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1
|
||||
// return relative position (for example, embedding 0 will have position (0, 0, 0); remember to adjust it to the current absolute position)
|
||||
MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, size_t i);
|
||||
|
||||
// tokenize an input text prompt and a list of bitmaps (images/audio)
|
||||
// the prompt must have the input image marker (default: "<__media__>") in it
|
||||
// the default marker is defined by mtmd_default_marker()
|
||||
|
||||
2
vendor/cpp-httplib/CMakeLists.txt
vendored
2
vendor/cpp-httplib/CMakeLists.txt
vendored
@@ -39,7 +39,7 @@ if (LLAMA_BUILD_BORINGSSL)
|
||||
set(FIPS OFF CACHE BOOL "Enable FIPS (BoringSSL)")
|
||||
|
||||
set(BORINGSSL_GIT "https://boringssl.googlesource.com/boringssl" CACHE STRING "BoringSSL git repository")
|
||||
set(BORINGSSL_VERSION "0.20260327.0" CACHE STRING "BoringSSL version")
|
||||
set(BORINGSSL_VERSION "0.20260413.0" CACHE STRING "BoringSSL version")
|
||||
|
||||
message(STATUS "Fetching BoringSSL version ${BORINGSSL_VERSION}")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user