mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-05-21 17:17:24 +03:00
Compare commits
70 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
81b0d882ae | ||
|
|
0f45f1a35c | ||
|
|
42532afff4 | ||
|
|
dbe7901ca6 | ||
|
|
320a6a44a5 | ||
|
|
9ed6e19b9d | ||
|
|
4c1c3ac09d | ||
|
|
7f3f843c31 | ||
|
|
ec562eb673 | ||
|
|
95d469a915 | ||
|
|
1e4579fbb8 | ||
|
|
527045bfb0 | ||
|
|
2dfeca31cc | ||
|
|
46be24d121 | ||
|
|
7e16646015 | ||
|
|
ad96bb8c0c | ||
|
|
e75cd5efb5 | ||
|
|
5d44db6008 | ||
|
|
3796c94bad | ||
|
|
634275fbbb | ||
|
|
bcfe63fc53 | ||
|
|
61af07c22d | ||
|
|
856c3adac1 | ||
|
|
a9883db8ee | ||
|
|
cce09f0b2b | ||
|
|
dded58b450 | ||
|
|
7bfe120c21 | ||
|
|
927dada6c9 | ||
|
|
239a497e5f | ||
|
|
89730c8d26 | ||
|
|
fde69a3607 | ||
|
|
ef93e98d01 | ||
|
|
706fbd8ab6 | ||
|
|
fa62042af9 | ||
|
|
4178259130 | ||
|
|
78fbbc2c07 | ||
|
|
da44953329 | ||
|
|
1ec7ba0c14 | ||
|
|
8e1f9d0834 | ||
|
|
e936660760 | ||
|
|
ef22b3e4ac | ||
|
|
68e7ea3eab | ||
|
|
928b486b0c | ||
|
|
7dbb0e998a | ||
|
|
dd9280a664 | ||
|
|
8cef8201a1 | ||
|
|
f5636f8fc7 | ||
|
|
838374375c | ||
|
|
7d442abf5c | ||
|
|
389ff61d77 | ||
|
|
2e97c5f96f | ||
|
|
5d5d2e15d2 | ||
|
|
2b2babd124 | ||
|
|
0b047287fe | ||
|
|
efbada936f | ||
|
|
f3c3e0e9a0 | ||
|
|
5755a100cd | ||
|
|
1e5ad35d56 | ||
|
|
65d7a8bbf0 | ||
|
|
00d56b11c3 | ||
|
|
5757c4dcb1 | ||
|
|
e20b83930c | ||
|
|
fd89556567 | ||
|
|
60489932ec | ||
|
|
4a4f819cb6 | ||
|
|
046e284437 | ||
|
|
66001722aa | ||
|
|
c5703e03a5 | ||
|
|
b46812de78 | ||
|
|
49956041ee |
@@ -5,8 +5,15 @@ ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04
|
||||
FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build
|
||||
|
||||
ARG GGML_SYCL_F16=OFF
|
||||
ARG LEVEL_ZERO_VERSION=1.28.2
|
||||
ARG LEVEL_ZERO_UBUNTU_VERSION=u24.04
|
||||
RUN apt-get update && \
|
||||
apt-get install -y git libssl-dev
|
||||
apt-get install -y git libssl-dev wget ca-certificates && \
|
||||
cd /tmp && \
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb && \
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb && \
|
||||
apt-get -o Dpkg::Options::="--force-overwrite" install -y ./level-zero.deb ./level-zero-devel.deb && \
|
||||
rm -f /tmp/level-zero.deb /tmp/level-zero-devel.deb
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -33,11 +40,11 @@ RUN mkdir -p /app/full \
|
||||
|
||||
FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base
|
||||
|
||||
ARG IGC_VERSION=v2.30.1
|
||||
ARG IGC_VERSION_FULL=2_2.30.1+20950
|
||||
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
|
||||
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
|
||||
ARG IGDGMM_VERSION=22.9.0
|
||||
ARG IGC_VERSION=v2.20.5
|
||||
ARG IGC_VERSION_FULL=2_2.20.5+19972
|
||||
ARG COMPUTE_RUNTIME_VERSION=25.40.35563.10
|
||||
ARG COMPUTE_RUNTIME_VERSION_FULL=25.40.35563.10-0
|
||||
ARG IGDGMM_VERSION=22.8.2
|
||||
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
|
||||
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
|
||||
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
|
||||
@@ -109,4 +116,3 @@ WORKDIR /app
|
||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
ENTRYPOINT [ "/app/llama-server" ]
|
||||
|
||||
|
||||
@@ -103,6 +103,7 @@ let
|
||||
vulkan-headers
|
||||
vulkan-loader
|
||||
shaderc
|
||||
spirv-headers
|
||||
];
|
||||
in
|
||||
|
||||
@@ -146,7 +147,6 @@ effectiveStdenv.mkDerivation (finalAttrs: {
|
||||
ninja
|
||||
pkg-config
|
||||
git
|
||||
spirv-headers
|
||||
]
|
||||
++ optionals useCuda [
|
||||
cudaPackages.cuda_nvcc
|
||||
|
||||
5
.github/workflows/build-cross.yml
vendored
5
.github/workflows/build-cross.yml
vendored
@@ -301,16 +301,17 @@ jobs:
|
||||
export RISCV_ROOT_PATH=${PWD}/spacemit_toolchain
|
||||
cmake -B build -DLLAMA_OPENSSL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DGGML_CPU_REPACK=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DGGML_CPU_RISCV64_SPACEMIT=ON \
|
||||
-DGGML_RVV=ON \
|
||||
-DGGML_RV_ZVFH=ON \
|
||||
-DGGML_RV_ZFH=ON \
|
||||
-DGGML_RV_ZICBOP=ON \
|
||||
-DGGML_RV_ZIHINTPAUSE=ON \
|
||||
-DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
|
||||
-DGGML_RV_ZBA=ON \
|
||||
-DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
18
.github/workflows/build-sycl.yml
vendored
18
.github/workflows/build-sycl.yml
vendored
@@ -50,6 +50,8 @@ jobs:
|
||||
env:
|
||||
ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
LEVEL_ZERO_VERSION: "1.28.2"
|
||||
LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
@@ -71,6 +73,14 @@ jobs:
|
||||
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
@@ -107,6 +117,7 @@ jobs:
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
steps:
|
||||
@@ -127,6 +138,13 @@ jobs:
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: pwsh
|
||||
run: |
|
||||
Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
"LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
|
||||
50
.github/workflows/build-virtgpu.yml
vendored
Normal file
50
.github/workflows/build-virtgpu.yml
vendored
Normal file
@@ -0,0 +1,50 @@
|
||||
name: CI (virtgpu)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-virtgpu.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-virtgpu.yml',
|
||||
'ggml/src/ggml-virtgpu/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
ubuntu-24-virtgpu:
|
||||
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y build-essential libdrm-dev pkg-config libssl-dev
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DGGML_VIRTGPU=ON \
|
||||
-DGGML_VIRTGPU_BACKEND=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
3
.github/workflows/build.yml
vendored
3
.github/workflows/build.yml
vendored
@@ -456,7 +456,8 @@ jobs:
|
||||
run: |
|
||||
cd build
|
||||
# This is using llvmpipe and runs slower than other backends
|
||||
ctest -L main --verbose --timeout 900
|
||||
# test-backend-ops is too slow on llvmpipe, skip it
|
||||
ctest -L main -E test-backend-ops --verbose --timeout 900
|
||||
|
||||
ubuntu-24-webgpu-wasm:
|
||||
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
|
||||
51
.github/workflows/code-style.yml
vendored
Normal file
51
.github/workflows/code-style.yml
vendored
Normal file
@@ -0,0 +1,51 @@
|
||||
name: Code Style Checker
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
pull_request:
|
||||
branches:
|
||||
- master
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
model-naming:
|
||||
runs-on: ubuntu-slim
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- name: Check model naming conventions
|
||||
run: |
|
||||
python3 - << 'EOF'
|
||||
import re, os, sys
|
||||
|
||||
pairs = re.findall(
|
||||
r'case\s+(LLM_ARCH_\w+)\s*:\s*\n\s+return new (llama_model_\w+)\s*\(',
|
||||
open("src/llama-model.cpp").read())
|
||||
|
||||
errors = []
|
||||
for arch, cls in pairs:
|
||||
suffix = arch[len("LLM_ARCH_"):]
|
||||
csuffix = cls[len("llama_model_"):]
|
||||
fname = csuffix.replace("_", "-") + ".cpp"
|
||||
|
||||
if not re.fullmatch(r'[A-Z][A-Z0-9_]*', suffix):
|
||||
errors.append(f"{arch}: suffix not upper snake case, example: LLM_ARCH_MY_MODEL")
|
||||
|
||||
if not re.fullmatch(r'[a-z][a-z0-9_]*', csuffix):
|
||||
errors.append(f"{arch}: class suffix not lower snake case, example: llama_model_my_model")
|
||||
|
||||
elif suffix.lower() != csuffix:
|
||||
errors.append(f"{arch}: arch/class name mismatch, expected class 'llama_model_{suffix.lower()}' but got '{cls}'")
|
||||
|
||||
elif not os.path.isfile(f"src/models/{fname}"):
|
||||
errors.append(f"{arch}: expects model file name to be src/models/{fname}, but not found")
|
||||
|
||||
if errors:
|
||||
print('\n'.join(f" - {e}" for e in errors)); sys.exit(1)
|
||||
print(f"OK: {len(pairs)} mappings validated.")
|
||||
EOF
|
||||
5
.github/workflows/editorconfig.yml
vendored
5
.github/workflows/editorconfig.yml
vendored
@@ -2,11 +2,6 @@ name: EditorConfig Checker
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
inputs:
|
||||
create_release:
|
||||
description: 'Create new release'
|
||||
required: true
|
||||
type: boolean
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
|
||||
2
.github/workflows/python-type-check.yml
vendored
2
.github/workflows/python-type-check.yml
vendored
@@ -31,7 +31,7 @@ jobs:
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: "3.11"
|
||||
pip-install: -r requirements/requirements-all.txt ty==0.0.33
|
||||
pip-install: -r requirements/requirements-all.txt ty==0.0.35
|
||||
# - name: Type-check with Pyright
|
||||
# uses: jakebailey/pyright-action@v2
|
||||
# with:
|
||||
|
||||
25
.github/workflows/release.yml
vendored
25
.github/workflows/release.yml
vendored
@@ -600,6 +600,7 @@ jobs:
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
|
||||
@@ -621,6 +622,13 @@ jobs:
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: pwsh
|
||||
run: |
|
||||
Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
"LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
@@ -655,6 +663,13 @@ jobs:
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
|
||||
ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
|
||||
if [ -n "$ZE_LOADER_DLL" ]; then
|
||||
echo "Using Level Zero loader: $ZE_LOADER_DLL"
|
||||
cp "$ZE_LOADER_DLL" ./build/bin
|
||||
else
|
||||
echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
|
||||
fi
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
|
||||
@@ -695,6 +710,8 @@ jobs:
|
||||
env:
|
||||
ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
LEVEL_ZERO_VERSION: "1.28.2"
|
||||
LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -718,6 +735,14 @@ jobs:
|
||||
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -110,6 +110,7 @@ uv.lock
|
||||
|
||||
# Nix
|
||||
|
||||
flake.lock
|
||||
/result
|
||||
|
||||
# Test binaries
|
||||
|
||||
@@ -24,6 +24,6 @@ set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
|
||||
set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CMAKE_C_FLAGS}")
|
||||
set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CXX_FLAGS}")
|
||||
set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zvfh_zba_zicbop -mabi=lp64d -fno-tree-vectorize -fno-tree-loop-vectorize ${CMAKE_C_FLAGS}")
|
||||
set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zvfh_zba_zicbop -mabi=lp64d -fno-tree-vectorize -fno-tree-loop-vectorize ${CMAKE_CXX_FLAGS}")
|
||||
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -latomic")
|
||||
|
||||
@@ -357,8 +357,7 @@ static handle_model_result common_params_handle_model(struct common_params_model
|
||||
auto download_result = common_download_model(model, opts, true);
|
||||
|
||||
if (download_result.model_path.empty()) {
|
||||
LOG_ERR("error: failed to download model from Hugging Face\n");
|
||||
exit(1);
|
||||
throw std::runtime_error("failed to download model from Hugging Face");
|
||||
}
|
||||
|
||||
model.name = model.hf_repo;
|
||||
@@ -380,8 +379,7 @@ static handle_model_result common_params_handle_model(struct common_params_model
|
||||
opts.offline = offline;
|
||||
auto download_result = common_download_model(model, opts);
|
||||
if (download_result.model_path.empty()) {
|
||||
LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
|
||||
exit(1);
|
||||
throw std::runtime_error("failed to download model from " + model.url);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -435,6 +433,25 @@ static bool parse_bool_value(const std::string & value) {
|
||||
// CLI argument parsing functions
|
||||
//
|
||||
|
||||
void common_params_handle_models(common_params & params, llama_example curr_ex) {
|
||||
auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
|
||||
if (params.no_mmproj) {
|
||||
params.mmproj = {};
|
||||
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
|
||||
// optionally, handle mmproj model when -hf is specified
|
||||
params.mmproj = res.mmproj;
|
||||
}
|
||||
// only download mmproj if the current example is using it
|
||||
for (const auto & ex : mmproj_examples) {
|
||||
if (curr_ex == ex) {
|
||||
common_params_handle_model(params.mmproj, params.hf_token, params.offline);
|
||||
break;
|
||||
}
|
||||
}
|
||||
common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline);
|
||||
common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
|
||||
}
|
||||
|
||||
static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
|
||||
common_params & params = ctx_arg.params;
|
||||
|
||||
@@ -588,22 +605,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
|
||||
// handle model and download
|
||||
if (!skip_model_download) {
|
||||
auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
|
||||
if (params.no_mmproj) {
|
||||
params.mmproj = {};
|
||||
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
|
||||
// optionally, handle mmproj model when -hf is specified
|
||||
params.mmproj = res.mmproj;
|
||||
}
|
||||
// only download mmproj if the current example is using it
|
||||
for (const auto & ex : mmproj_examples) {
|
||||
if (ctx_arg.ex == ex) {
|
||||
common_params_handle_model(params.mmproj, params.hf_token, params.offline);
|
||||
break;
|
||||
}
|
||||
}
|
||||
common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline);
|
||||
common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
|
||||
common_params_handle_models(params, ctx_arg.ex);
|
||||
}
|
||||
|
||||
// model is required (except for server)
|
||||
@@ -622,10 +624,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
|
||||
string_process_escapes(seq_breaker);
|
||||
}
|
||||
for (auto & pair : params.speculative.draft.replacements) {
|
||||
string_process_escapes(pair.first);
|
||||
string_process_escapes(pair.second);
|
||||
}
|
||||
}
|
||||
|
||||
if (!params.kv_overrides.empty()) {
|
||||
@@ -2223,7 +2221,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
if (llama_supports_rpc()) {
|
||||
add_opt(common_arg(
|
||||
{"--rpc"}, "SERVERS",
|
||||
"comma separated list of RPC servers (host:port)",
|
||||
"comma-separated list of RPC servers (host:port)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
add_rpc_devices(value);
|
||||
GGML_UNUSED(params);
|
||||
@@ -3518,13 +3516,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.speculative.draft.p_min = std::stof(value);
|
||||
}
|
||||
).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_P_MIN"));
|
||||
add_opt(common_arg(
|
||||
{"--spec-draft-ctx-size", "-cd", "--ctx-size-draft"}, "N",
|
||||
string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.draft.n_ctx),
|
||||
[](common_params & params, int value) {
|
||||
params.speculative.draft.n_ctx = value;
|
||||
}
|
||||
).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_CTX_SIZE"));
|
||||
add_opt(common_arg(
|
||||
{"--spec-draft-device", "-devd", "--device-draft"}, "<dev1,dev2,..>",
|
||||
"comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
|
||||
@@ -3561,32 +3552,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_MODEL"));
|
||||
add_opt(common_arg(
|
||||
{"--spec-draft-replace", "--spec-replace"}, "TARGET", "DRAFT",
|
||||
"translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
|
||||
[](common_params & params, const std::string & tgt, const std::string & dft) {
|
||||
params.speculative.draft.replacements.push_back({ tgt, dft });
|
||||
}
|
||||
).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
add_opt(common_arg(
|
||||
{"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]",
|
||||
string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n",
|
||||
common_speculative_type_to_str(params.speculative.type).c_str()),
|
||||
{"--spec-type"}, common_speculative_all_types_str(),
|
||||
string_format("comma-separated list of types of speculative decoding to use (default: %s)\n",
|
||||
common_speculative_type_name_str(params.speculative.types).c_str()),
|
||||
[](common_params & params, const std::string & value) {
|
||||
if (value == "none") {
|
||||
params.speculative.type = COMMON_SPECULATIVE_TYPE_NONE;
|
||||
} else if (value == "ngram-cache") {
|
||||
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_CACHE;
|
||||
} else if (value == "ngram-simple") {
|
||||
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE;
|
||||
} else if (value == "ngram-map-k") {
|
||||
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K;
|
||||
} else if (value == "ngram-map-k4v") {
|
||||
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V;
|
||||
} else if (value == "ngram-mod") {
|
||||
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
|
||||
} else {
|
||||
throw std::invalid_argument("unknown speculative decoding type without draft model");
|
||||
}
|
||||
const auto enabled_types = string_split<std::string>(value, ',');
|
||||
params.speculative.types = common_speculative_types_from_names(enabled_types);
|
||||
}
|
||||
).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_TYPE"));
|
||||
add_opt(common_arg(
|
||||
@@ -4075,7 +4046,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--spec-default"},
|
||||
string_format("enable default speculative decoding config"),
|
||||
[](common_params & params) {
|
||||
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
|
||||
params.speculative.types = { COMMON_SPECULATIVE_TYPE_NGRAM_MOD };
|
||||
params.speculative.ngram_mod.n_match = 24;
|
||||
params.speculative.ngram_mod.n_min = 48;
|
||||
params.speculative.ngram_mod.n_max = 64;
|
||||
|
||||
@@ -129,5 +129,8 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
|
||||
// see: https://github.com/ggml-org/llama.cpp/issues/18163
|
||||
void common_params_add_preset_options(std::vector<common_arg> & args);
|
||||
|
||||
// Populate model paths (main model, mmproj, etc) from -hf if necessary
|
||||
void common_params_handle_models(common_params & params, llama_example curr_ex);
|
||||
|
||||
// initialize argument parser context - used by test-arg-parser and preset
|
||||
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
||||
|
||||
@@ -369,9 +369,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
|
||||
arguments.name_suffix) +
|
||||
arguments.value_prefix +
|
||||
(schema_info.resolves_to_string(param_schema) ?
|
||||
p.tool_arg_string_value(p.schema(until_suffix,
|
||||
"tool-" + name + "-arg-" + param_name + "-schema",
|
||||
param_schema, true)) :
|
||||
p.tool_arg_string_value(until_suffix) :
|
||||
p.tool_arg_json_value(p.schema(
|
||||
p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
|
||||
p.space()) +
|
||||
|
||||
@@ -1422,7 +1422,7 @@ common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {
|
||||
|
||||
// try to remove the last tokens
|
||||
if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
|
||||
LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
|
||||
LOG_WRN("%s: the context does not support partial sequence removal\n", __func__);
|
||||
res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
|
||||
goto done;
|
||||
}
|
||||
@@ -1960,3 +1960,102 @@ bool common_prompt_batch_decode(
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t common_prompt_checkpoint::size() const {
|
||||
return data_tgt.size() + data_dft.size();
|
||||
}
|
||||
|
||||
bool common_prompt_checkpoint::empty() const {
|
||||
return data_tgt.empty();
|
||||
}
|
||||
|
||||
void common_prompt_checkpoint::clear() {
|
||||
n_tokens = 0;
|
||||
|
||||
pos_min = 0;
|
||||
pos_max = 0;
|
||||
|
||||
data_tgt.clear();
|
||||
data_dft.clear();
|
||||
}
|
||||
|
||||
void common_prompt_checkpoint::update_pos(
|
||||
int64_t n_tokens,
|
||||
llama_pos pos_min,
|
||||
llama_pos pos_max) {
|
||||
this->n_tokens = n_tokens;
|
||||
this->pos_min = pos_min;
|
||||
this->pos_max = pos_max;
|
||||
}
|
||||
|
||||
void common_prompt_checkpoint::update_tgt(
|
||||
llama_context * ctx,
|
||||
llama_seq_id seq_id,
|
||||
llama_state_seq_flags flags) {
|
||||
if (ctx == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t ckpt_size = llama_state_seq_get_size_ext(ctx, seq_id, flags);
|
||||
|
||||
data_tgt.resize(ckpt_size);
|
||||
|
||||
const size_t n = llama_state_seq_get_data_ext(ctx, data_tgt.data(), ckpt_size, seq_id, flags);
|
||||
if (n != ckpt_size) {
|
||||
GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", ckpt_size, n);
|
||||
}
|
||||
}
|
||||
|
||||
void common_prompt_checkpoint::update_dft(
|
||||
llama_context * ctx,
|
||||
llama_seq_id seq_id,
|
||||
llama_state_seq_flags flags) {
|
||||
if (ctx == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t ckpt_size = llama_state_seq_get_size_ext(ctx, seq_id, flags);
|
||||
|
||||
data_dft.resize(ckpt_size);
|
||||
|
||||
const size_t n = llama_state_seq_get_data_ext(ctx, data_dft.data(), ckpt_size, seq_id, flags);
|
||||
if (n != ckpt_size) {
|
||||
GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", ckpt_size, n);
|
||||
}
|
||||
}
|
||||
|
||||
void common_prompt_checkpoint::load_tgt(
|
||||
llama_context * ctx,
|
||||
llama_seq_id seq_id,
|
||||
llama_state_seq_flags flags) const {
|
||||
if (ctx == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (data_tgt.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t n = llama_state_seq_set_data_ext(ctx, data_tgt.data(), data_tgt.size(), seq_id, flags);
|
||||
if (n != data_tgt.size()) {
|
||||
GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", data_tgt.size(), n);
|
||||
}
|
||||
}
|
||||
|
||||
void common_prompt_checkpoint::load_dft(
|
||||
llama_context * ctx,
|
||||
llama_seq_id seq_id,
|
||||
llama_state_seq_flags flags) const {
|
||||
if (ctx == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (data_dft.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t n = llama_state_seq_set_data_ext(ctx, data_dft.data(), data_dft.size(), seq_id, flags);
|
||||
if (n != data_dft.size()) {
|
||||
GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", data_dft.size(), n);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -157,9 +157,9 @@ enum common_params_sampling_config : uint64_t {
|
||||
|
||||
enum common_speculative_type {
|
||||
COMMON_SPECULATIVE_TYPE_NONE, // no speculative decoding
|
||||
COMMON_SPECULATIVE_TYPE_DRAFT, // draft model
|
||||
COMMON_SPECULATIVE_TYPE_EAGLE3, // eagle draft model
|
||||
COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding
|
||||
COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE, // standalone draft model speculative decoding
|
||||
COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, // Eagle3 speculative decoding
|
||||
COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding based on n-grams
|
||||
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only
|
||||
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
|
||||
COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
|
||||
@@ -295,8 +295,6 @@ struct common_params_model {
|
||||
std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
|
||||
};
|
||||
|
||||
struct common_ngram_mod;
|
||||
|
||||
// draft-model-based speculative decoding parameters
|
||||
struct common_params_speculative_draft {
|
||||
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
|
||||
@@ -307,11 +305,9 @@ struct common_params_speculative_draft {
|
||||
|
||||
common_params_model mparams;
|
||||
|
||||
llama_model * model = nullptr; // a llama_model that can be shared by multiple speculative contexts
|
||||
llama_context * ctx_tgt = nullptr;
|
||||
llama_context * ctx_dft = nullptr;
|
||||
|
||||
llama_context_params cparams; // these are the parameters for the draft llama_context
|
||||
|
||||
int32_t n_ctx = 0; // draft context size
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||
|
||||
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
|
||||
@@ -322,7 +318,6 @@ struct common_params_speculative_draft {
|
||||
|
||||
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
|
||||
|
||||
std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
|
||||
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
||||
};
|
||||
|
||||
@@ -331,9 +326,6 @@ struct common_params_speculative_ngram_mod {
|
||||
|
||||
int32_t n_max = 64;
|
||||
int32_t n_min = 48;
|
||||
|
||||
// shared instance of the ngram container for all speculative decoding contexts
|
||||
std::shared_ptr<common_ngram_mod> obj;
|
||||
};
|
||||
|
||||
struct common_params_speculative_ngram_map {
|
||||
@@ -348,9 +340,9 @@ struct common_params_speculative_ngram_cache {
|
||||
};
|
||||
|
||||
struct common_params_speculative {
|
||||
// TODO: become a vector in order to support "chains of speculators"
|
||||
common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE;
|
||||
std::vector<enum common_speculative_type> types = { COMMON_SPECULATIVE_TYPE_NONE };
|
||||
|
||||
// used by Simple, MTP, Eagle3, etc. - all methods that require some kind of draft model
|
||||
common_params_speculative_draft draft;
|
||||
|
||||
common_params_speculative_ngram_mod ngram_mod;
|
||||
@@ -1026,3 +1018,47 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std
|
||||
|
||||
// "adamw" or "sgd" (case insensitive)
|
||||
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);
|
||||
|
||||
//
|
||||
// prompt utils
|
||||
//
|
||||
|
||||
struct common_prompt_checkpoint {
|
||||
int64_t n_tokens;
|
||||
|
||||
llama_pos pos_min;
|
||||
llama_pos pos_max;
|
||||
|
||||
std::vector<uint8_t> data_tgt;
|
||||
std::vector<uint8_t> data_dft;
|
||||
|
||||
size_t size() const;
|
||||
|
||||
bool empty() const;
|
||||
void clear();
|
||||
|
||||
void update_pos(
|
||||
int64_t n_tokens,
|
||||
llama_pos pos_min,
|
||||
llama_pos pos_max);
|
||||
|
||||
void update_tgt(
|
||||
llama_context * ctx,
|
||||
llama_seq_id seq_id,
|
||||
llama_state_seq_flags flags);
|
||||
|
||||
void update_dft(
|
||||
llama_context * ctx,
|
||||
llama_seq_id seq_id,
|
||||
llama_state_seq_flags flags);
|
||||
|
||||
void load_tgt(
|
||||
llama_context * ctx,
|
||||
llama_seq_id seq_id,
|
||||
llama_state_seq_flags flags) const;
|
||||
|
||||
void load_dft(
|
||||
llama_context * ctx,
|
||||
llama_seq_id seq_id,
|
||||
llama_state_seq_flags flags) const;
|
||||
};
|
||||
|
||||
@@ -163,8 +163,13 @@ void common_preset::merge(const common_preset & other) {
|
||||
}
|
||||
}
|
||||
|
||||
void common_preset::apply_to_params(common_params & params) const {
|
||||
void common_preset::apply_to_params(common_params & params, const std::set<std::string> & handled_keys) const {
|
||||
for (const auto & [opt, val] : options) {
|
||||
if (!handled_keys.empty()) {
|
||||
if (!opt.env || handled_keys.find(opt.env) == handled_keys.end()) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// apply each option to params
|
||||
if (opt.handler_string) {
|
||||
opt.handler_string(params, val);
|
||||
|
||||
@@ -43,7 +43,8 @@ struct common_preset {
|
||||
void merge(const common_preset & other);
|
||||
|
||||
// apply preset options to common_params
|
||||
void apply_to_params(common_params & params) const;
|
||||
// optionally specify handled_keys to only apply a subset of options (identified by their env), if empty, apply all options
|
||||
void apply_to_params(common_params & params, const std::set<std::string> & handled_keys = std::set<std::string>()) const;
|
||||
};
|
||||
|
||||
// interface for multiple presets in one file
|
||||
|
||||
@@ -547,6 +547,8 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
||||
auto & chain = gsmpl->chain;
|
||||
auto & cur_p = gsmpl->cur_p; // initialized by set_logits
|
||||
|
||||
gsmpl->set_logits(ctx, idx);
|
||||
|
||||
// Check if a backend sampler has already sampled a token in which case we
|
||||
// return that token id directly.
|
||||
{
|
||||
@@ -558,17 +560,17 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
||||
GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
|
||||
GGML_ASSERT(!gsmpl->rbudget && "using reasoning budget in combination with backend sampling is not supported");
|
||||
|
||||
// TODO: simplify
|
||||
gsmpl->cur.resize(1);
|
||||
gsmpl->cur[0] = { id, 0.0f, 1.0f };
|
||||
cur_p = { gsmpl->cur.data(), gsmpl->cur.size(), 0, true };
|
||||
for (size_t i = 0; i < cur_p.size; ++i) {
|
||||
if (cur_p.data[i].id == id) {
|
||||
cur_p.selected = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return id;
|
||||
}
|
||||
}
|
||||
|
||||
gsmpl->set_logits(ctx, idx);
|
||||
|
||||
// apply reasoning budget first
|
||||
llama_sampler_apply(rbudget, &cur_p);
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -5,8 +5,14 @@
|
||||
|
||||
struct common_speculative;
|
||||
|
||||
// comma separated list the provided types
|
||||
std::string common_speculative_type_name_str(const std::vector<enum common_speculative_type> & types);
|
||||
|
||||
// comma separated list of all types
|
||||
std::string common_speculative_type_name_str();
|
||||
const char * common_speculative_all_types_str();
|
||||
|
||||
// parse user provided types
|
||||
std::vector<enum common_speculative_type> common_speculative_types_from_names(const std::vector<std::string> & names);
|
||||
|
||||
// convert string to type
|
||||
enum common_speculative_type common_speculative_type_from_name(const std::string & name);
|
||||
@@ -14,27 +20,44 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
|
||||
// convert type to string
|
||||
std::string common_speculative_type_to_str(enum common_speculative_type type);
|
||||
|
||||
common_speculative * common_speculative_init(
|
||||
common_params_speculative & params,
|
||||
llama_context * ctx_tgt);
|
||||
common_speculative * common_speculative_init(common_params_speculative & params, uint32_t n_seq);
|
||||
|
||||
void common_speculative_free(common_speculative * spec);
|
||||
|
||||
struct common_speculative_draft_params {
|
||||
// this flag is used to chain the drafts through all the available implementations
|
||||
// after the first successful draft from an implementation, we set it
|
||||
// to false to prevent further drafts for that sequence
|
||||
// at the end of the draft() call, all drafting flags will be reset to false
|
||||
bool drafting = false;
|
||||
|
||||
// overrides individual configurations (-1 disabled)
|
||||
// can be used to constraint the max draft based on the remaining context size
|
||||
int32_t n_max = -1;
|
||||
|
||||
llama_pos n_past;
|
||||
llama_token id_last;
|
||||
|
||||
// TODO: remove in the future by keeping track of the prompt from the _begin() call and the consecutive accept calls
|
||||
const llama_tokens * prompt;
|
||||
|
||||
// the generated draft from the last _draft() call
|
||||
llama_tokens * result;
|
||||
};
|
||||
|
||||
common_speculative_draft_params & common_speculative_get_draft_params(common_speculative * spec, llama_seq_id seq_id);
|
||||
|
||||
// optionally call once at the beginning of a new generation
|
||||
void common_speculative_begin(common_speculative * spec, const llama_tokens & prompt);
|
||||
void common_speculative_begin(common_speculative * spec, llama_seq_id seq_id, const llama_tokens & prompt);
|
||||
|
||||
// sample up to n_draft tokens and add them to the batch using the draft model
|
||||
llama_tokens common_speculative_draft(
|
||||
common_speculative * spec,
|
||||
const common_params_speculative & params,
|
||||
const llama_tokens & prompt,
|
||||
llama_token id_last);
|
||||
// process the batch and update the internal state of the speculative context
|
||||
bool common_speculative_process(common_speculative * spec, const llama_batch & batch);
|
||||
|
||||
// informs the speculative decoder that n_accepted tokens were accepted by the target model
|
||||
void common_speculative_accept(common_speculative * spec, uint16_t n_accepted);
|
||||
// generate drafts for the sequences specified with `common_speculative_get_draft_params`
|
||||
void common_speculative_draft(common_speculative * spec);
|
||||
|
||||
int32_t common_speculative_n_max(const common_speculative * spec, const common_params_speculative & params);
|
||||
int32_t common_speculative_n_min(const common_speculative * spec, const common_params_speculative & params);
|
||||
// informs the speculative context that n_accepted tokens were accepted by the target model
|
||||
void common_speculative_accept(common_speculative * spec, llama_seq_id, uint16_t n_accepted);
|
||||
|
||||
// print statistics about the speculative decoding
|
||||
void common_speculative_print_stats(const common_speculative * spec);
|
||||
|
||||
@@ -1570,6 +1570,9 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "862f827721df956049dff5ca81a57f29e575280bc622e290d3bf4e35eca29015":
|
||||
# ref: https://huggingface.co/codefuse-ai/F2LLM-v2-4B
|
||||
res = "f2llmv2"
|
||||
if chkhsh == "62f6fb0a6fd5098caeabb19b07a5c1099cafc8b9c40eab6ea89ece4ec02fbc57":
|
||||
# ref: https://huggingface.co/sarvamai/sarvam-30b
|
||||
res = "sarvam-moe"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -2173,7 +2176,8 @@ class MmprojModel(ModelBase):
|
||||
text_config = {
|
||||
k: v for k, v in self.hparams.items() if k not in ["vision_encoder", "audio_encoder"]
|
||||
}
|
||||
self.n_embd_text = text_config.get("hidden_dim", 0)
|
||||
# mistral native params.json: "dim" is the text hidden size ("hidden_dim" is the FFN intermediate size)
|
||||
self.n_embd_text = text_config.get("dim", 0)
|
||||
|
||||
assert self.n_embd_text > 0, "n_embd not found in hparams"
|
||||
|
||||
@@ -2861,8 +2865,12 @@ class LlamaModel(TextModel):
|
||||
# fix for SmolVLM2, missing `num_attention_heads` in config.json
|
||||
if self.hf_arch == "VLlama3ForCausalLM":
|
||||
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
|
||||
hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
|
||||
self.origin_hf_arch = hparams.get('architectures', [None])[0]
|
||||
# Mistral consolidated format has no config.json; origin_hf_arch is HF-only.
|
||||
if self.is_mistral_format:
|
||||
self.origin_hf_arch = None
|
||||
else:
|
||||
hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
|
||||
self.origin_hf_arch = hparams.get('architectures', [None])[0]
|
||||
|
||||
def set_vocab(self):
|
||||
if self.origin_hf_arch == "GlmasrModel":
|
||||
@@ -3134,6 +3142,11 @@ class LlavaVisionModel(MmprojModel):
|
||||
assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json"
|
||||
if self.use_break_tok:
|
||||
self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
|
||||
|
||||
# params.json may ship -1 placeholders (Mistral Medium 3.5)
|
||||
# resolve the real id from the bundled tokenizer in that case
|
||||
if self.img_break_tok_id < 0:
|
||||
self.img_break_tok_id = self.get_mistral_token_id("[IMG_BREAK]")
|
||||
else:
|
||||
raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")
|
||||
logger.info(f"Image break token id: {self.img_break_tok_id}")
|
||||
@@ -3153,6 +3166,24 @@ class LlavaVisionModel(MmprojModel):
|
||||
return int(token_data["id"])
|
||||
raise ValueError(f"Token '{token}' not found in tokenizer config.")
|
||||
|
||||
def get_mistral_token_id(self, token: str) -> int:
|
||||
# mistral native format ships tekken.json or a versioned spm tokenizer
|
||||
tekken_file = self.dir_model / "tekken.json"
|
||||
if tekken_file.is_file():
|
||||
with open(tekken_file, "r", encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
for entry in data.get("special_tokens", []):
|
||||
if entry.get("token_str") == token:
|
||||
return int(entry["rank"])
|
||||
tokenizer_json_file = self.dir_model / "tokenizer.json"
|
||||
if tokenizer_json_file.is_file():
|
||||
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
for entry in data.get("added_tokens", []):
|
||||
if entry.get("content") == token:
|
||||
return int(entry["id"])
|
||||
raise ValueError(f"Token '{token}' not found in mistral tokenizer files.")
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
@@ -9733,6 +9764,73 @@ class MimoV2Model(TextModel):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("MiMoV2ForCausalLM")
|
||||
class MiMoV2VisionModel(MmprojModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
assert self.hparams_vision is not None
|
||||
hp = self.hparams_vision
|
||||
|
||||
hp["image_size"] = hp.get("image_size", 560)
|
||||
hp["num_attention_heads"] = hp.get("num_heads", 32)
|
||||
hp["num_hidden_layers"] = hp.get("depth", 28)
|
||||
|
||||
self.n_q_heads = int(hp["num_heads"])
|
||||
self.num_kv_heads = int(hp.get("num_key_value_heads", 8))
|
||||
self.head_dim = int(hp.get("qk_channels", 64))
|
||||
self.spatial_merge_size = int(hp["spatial_merge_size"])
|
||||
# MiMoV2 vision RMSNorm: HF uses getattr(config, "rms_norm_eps", 1e-6) and the
|
||||
# field is absent from MiMo-V2.5's vision_config
|
||||
self.rms_norm_eps = float(hp.get("rms_norm_eps", 1e-6))
|
||||
|
||||
# fullatt_block_indexes are also reflected in vit_window_attn_types as -1
|
||||
self.fullatt_block_indexes = list(hp.get("fullatt_block_indexes") or [])
|
||||
self.vit_window_attn_types = list(hp.get("vit_window_attn_types") or [])
|
||||
self.visual_token_window_size = int(hp.get("visual_token_window_size", -1))
|
||||
self.use_sink = bool(hp.get("use_sink", False))
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MIMOVL)
|
||||
self.gguf_writer.add_vision_use_silu(True)
|
||||
self.gguf_writer.add_vision_head_count_kv(self.num_kv_heads)
|
||||
self.gguf_writer.add_vision_spatial_merge_size(self.spatial_merge_size)
|
||||
self.gguf_writer.add_uint32(gguf.Keys.ClipVision.WINDOW_SIZE, self.visual_token_window_size)
|
||||
self.gguf_writer.add_vision_wa_pattern_mode(self.vit_window_attn_types)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(self.rms_norm_eps)
|
||||
self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"]))
|
||||
self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"]))
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
# Sinks must be F32: any sink-style softmax/mask add in ggml requires
|
||||
# F32, and we fold sinks into a host-built F32 mask at encode time.
|
||||
if new_name.endswith(".attn_sinks"):
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
|
||||
name, _ = item
|
||||
if not name.startswith("visual."):
|
||||
return None
|
||||
return super().filter_tensors(item)
|
||||
|
||||
def modify_tensors(self, data_torch, name, bid):
|
||||
# Conv3D patch embed: split along the temporal axis (kt=2) into two Conv2D
|
||||
# weights that the existing qwen2vl-style two-Conv2D path consumes.
|
||||
if name == "visual.patch_embed.proj.weight":
|
||||
_, _, kt, _, _ = data_torch.shape
|
||||
if kt != 2:
|
||||
raise ValueError(f"unexpected temporal_patch_size: {kt}")
|
||||
embd_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH]
|
||||
yield (embd_name + ".weight", data_torch[:, :, 0, ...])
|
||||
yield (embd_name + ".weight.1", data_torch[:, :, 1, ...])
|
||||
return
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Step3p5ForCausalLM")
|
||||
class Step35Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.STEP35
|
||||
@@ -11591,6 +11689,34 @@ class BailingMoeV2Model(TextModel):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("SarvamMoEForCausalLM", "modeling_sarvam_moe.SarvamMoEForCausalLM")
|
||||
class SarvamMoEModel(BailingMoeV2Model):
|
||||
model_arch = gguf.MODEL_ARCH.BAILINGMOE2
|
||||
# Sarvam-MoE shares the BailingMoeV2 architecture; only differences:
|
||||
# - full rotary (no partial_rotary_factor)
|
||||
# - expert bias is zero-mean normalized at load time
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
if (rope_dim := hparams.get("head_dim")) is None:
|
||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
# Override the partial-rotary value written by BailingMoeV2 with the full rotary dim
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
|
||||
name, gen = item
|
||||
if name.endswith(".expert_bias"):
|
||||
# Sarvam normalizes expert bias to zero mean
|
||||
inner = gen
|
||||
|
||||
def gen():
|
||||
t = inner()
|
||||
return t - t.mean()
|
||||
return super().filter_tensors((name, gen))
|
||||
|
||||
|
||||
@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM")
|
||||
class GroveMoeModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.GROVEMOE
|
||||
@@ -13287,7 +13413,7 @@ class PixtralModel(LlavaVisionModel):
|
||||
self.gguf_writer.add_vision_use_silu(True)
|
||||
|
||||
# spatial_merge_size
|
||||
if self.find_vparam(["mm_projector_id"]) == "patch_merge":
|
||||
if self.find_vparam(["mm_projector_id"], optional=True) == "patch_merge":
|
||||
self.gguf_writer.add_vision_spatial_merge_size(
|
||||
self.find_vparam(["spatial_merge_size"])
|
||||
)
|
||||
@@ -13295,8 +13421,12 @@ class PixtralModel(LlavaVisionModel):
|
||||
def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
|
||||
if name == "vision_language_adapter.w_in.weight":
|
||||
return "mm.1.weight"
|
||||
elif name == "vision_language_adapter.w_in.bias":
|
||||
return "mm.1.bias"
|
||||
elif name == "vision_language_adapter.w_out.weight":
|
||||
return "mm.2.weight"
|
||||
elif name == "vision_language_adapter.w_out.bias":
|
||||
return "mm.2.bias"
|
||||
return super().map_tensor_name(name, try_suffixes)
|
||||
|
||||
|
||||
|
||||
@@ -155,6 +155,7 @@ models = [
|
||||
{"name": "joyai-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", },
|
||||
{"name": "kanana2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
|
||||
{"name": "f2llmv2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
|
||||
{"name": "sarvam-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", },
|
||||
]
|
||||
|
||||
# some models are known to be broken upstream, so we will skip them as exceptions
|
||||
|
||||
@@ -188,6 +188,24 @@ class LoraTorchTensor:
|
||||
def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor:
|
||||
return self.transpose(axis0, axis1)
|
||||
|
||||
def split(self, split_size: int | Sequence[int], dim: int = 0) -> tuple[LoraTorchTensor, ...]:
|
||||
shape = self.shape
|
||||
ndim = len(shape)
|
||||
if dim < 0:
|
||||
dim += ndim
|
||||
if dim == ndim - 1:
|
||||
A_chunks = self._lora_A.split(split_size, dim=-1)
|
||||
return tuple(LoraTorchTensor(a, self._lora_B) for a in A_chunks)
|
||||
elif dim == ndim - 2:
|
||||
B_chunks = self._lora_B.split(split_size, dim=-2)
|
||||
return tuple(LoraTorchTensor(self._lora_A, b) for b in B_chunks)
|
||||
else:
|
||||
B_chunks = self._lora_B.split(split_size, dim=dim)
|
||||
if self._lora_A.shape[dim] == 1:
|
||||
return tuple(LoraTorchTensor(self._lora_A, b) for b in B_chunks)
|
||||
A_chunks = self._lora_A.split(split_size, dim=dim)
|
||||
return tuple(LoraTorchTensor(a, b) for a, b in zip(A_chunks, B_chunks))
|
||||
|
||||
def to(self, *args, **kwargs):
|
||||
return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))
|
||||
|
||||
@@ -230,6 +248,11 @@ class LoraTorchTensor:
|
||||
)
|
||||
else:
|
||||
raise NotImplementedError
|
||||
elif func is torch.split:
|
||||
assert len(args) and len(args) >= 2
|
||||
tensor, split_size = args[0], args[1]
|
||||
dim = args[2] if len(args) > 2 else kwargs.get("dim", 0)
|
||||
return tensor.split(split_size, dim=dim)
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
@@ -57,17 +57,22 @@ Although OpenVINO supports a wide range of [Intel hardware](https://docs.openvin
|
||||
|
||||
## Validated Models
|
||||
|
||||
The following models have been validated for functionality on Intel® Core™ Ultra Series 1 and Series 2:
|
||||
The following models were validated on Intel® Core™ Ultra Series 2. While our testing was limited, the OpenVINO backend is expected to work across a broad range of [Intel hardware](https://docs.openvino.ai/2026/about-openvino/release-notes-openvino/system-requirements.html).
|
||||
- Use `GGML_OPENVINO_STATEFUL_EXECUTION=1` when using GPU device.
|
||||
- `-fa 1` is required when running llama-bench with the OpenVINO backend.
|
||||
- Additional model support, quantization formats and validations are work in progress.
|
||||
|
||||
- [Llama-3.2-1B-Instruct-GGUF](https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/)
|
||||
- [Llama-3.1-8B-Instruct](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF)
|
||||
- [microsoft/Phi-3-mini-4k-instruct-gguf](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf)
|
||||
- [Qwen/Qwen2.5-1.5B-Instruct-GGUF](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF)
|
||||
- [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B-GGUF)
|
||||
- [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-S-1B-sft-gguf)
|
||||
- [tencent/Hunyuan-7B-Instruct](https://huggingface.co/bartowski/tencent_Hunyuan-7B-Instruct-GGUF)
|
||||
- [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/bartowski/Mistral-7B-Instruct-v0.3-GGUF)
|
||||
- [bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF)
|
||||
| Model | Validated | Known Issues |
|
||||
| :------| :---------- | :-------------|
|
||||
| [Llama-3.2-1B-Instruct](https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — |
|
||||
| [Meta-Llama-3.1-8B-Instruct](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF) | `Q8_0`, `Q4_K_M` on CPU/GPU/NPU | `Q4_0_8_8`, `Q4_0_4_8`, `Q4_0_4_4` fail |
|
||||
| [Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf) | `FP16`, `Q4` on CPU/NPU | GPU unsupported for `FP16` and `Q4` (`llama-cli`, `llama-bench`) |
|
||||
| [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — |
|
||||
| [Qwen3-8B-Instruct](https://huggingface.co/Qwen/Qwen3-8B-GGUF) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/NPU; GPU works via `llama-bench` | GPU `llama-cli` unsupported for all quantizations |
|
||||
| [MiniCPM-V-2_6-GGUF](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `Q4_0` on CPU/GPU/NPU | — |
|
||||
| [DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF) | `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — |
|
||||
| [Hunyuan-7B-Instruct](https://huggingface.co/bartowski/tencent_Hunyuan-7B-Instruct-GGUF) | CPU: `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M`; GPU: `Q8_0`, `Q4_0`, `Q4_1`; NPU (`llama-bench` only): `Q4_0`, `Q4_1`, `Q4_K_M` | GPU `Q4_K_M` unsupported; NPU `llama-cli` unsupported |
|
||||
| [Mistral-7B-Instruct-v0.3](https://huggingface.co/bartowski/Mistral-7B-Instruct-v0.3-GGUF/) | CPU/GPU: `Q8_0`, `Q4_K_M`; NPU: `Q8_0`, `Q4_K_M` (via `llama-bench`) | NPU `llama-cli` unsupported for `Q8_0`, `Q4_K_M` |
|
||||
|
||||
## Build Instructions
|
||||
|
||||
|
||||
@@ -720,6 +720,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
||||
| GGML_SYCL_GRAPH | OFF *(default)* \|ON *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
|
||||
| GGML_SYCL_DNN | ON *(default)* \|OFF *(Optional)* | Enable build with oneDNN. |
|
||||
| GGML_SYCL_HOST_MEM_FALLBACK | ON *(default)* \|OFF *(Optional)* | Allow host memory fallback when device memory is full during quantized weight reorder. Enables inference to continue at reduced speed (reading over PCIe) instead of failing. Requires Linux kernel 6.8+. |
|
||||
| GGML_SYCL_SUPPORT_LEVEL_ZERO | ON *(default)* \|OFF *(Optional)* | Enable Level Zero API for device memory allocation. Requires Level Zero headers/library at build time and Intel GPU driver (Level Zero runtime) at run time. Reduces system RAM usage during multi-GPU inference. |
|
||||
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
|
||||
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
|
||||
|
||||
@@ -733,9 +734,18 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
||||
| GGML_SYCL_ENABLE_FLASH_ATTN | 1 (default) or 0| Enable Flash-Attention. It can reduce memory usage. The performance impact depends on the LLM.|
|
||||
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features for Intel GPUs. (Recommended to 1 for intel devices older than Gen 10) |
|
||||
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still on development, no better performance. |
|
||||
| GGML_SYCL_ENABLE_LEVEL_ZERO | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO=ON at build time. |
|
||||
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
|
||||
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
|
||||
| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.|
|
||||
| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Allow SYCL/Unified Runtime Level Zero device allocations larger than 4 GiB. llama.cpp's direct Level Zero allocation path requests the relaxed maximum-size limit itself when GGML_SYCL_ENABLE_LEVEL_ZERO=1. |
|
||||
|
||||
## Compile-time Flags
|
||||
|
||||
Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spot.
|
||||
|
||||
| Name | Function |
|
||||
|-----------------|----------------------------------------------------------------------------------|
|
||||
| DEBUG_SYCL_POOL | Enable device memory pool logging on teardown. Useful for profiling allocations. |
|
||||
|
||||
## Design Rule
|
||||
|
||||
@@ -811,7 +821,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
||||
|
||||
- `ggml_backend_sycl_buffer_type_alloc_buffer: can't allocate 5000000000 Bytes of memory on device`
|
||||
|
||||
You need to enable to support 4GB memory malloc by:
|
||||
With the default `GGML_SYCL_ENABLE_LEVEL_ZERO=1`, llama.cpp requests Level Zero's relaxed maximum-size allocation limit directly. If Level Zero support is disabled at build time or runtime and the allocation goes through SYCL/Unified Runtime instead, enable support for allocations larger than 4 GiB by:
|
||||
```
|
||||
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
|
||||
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
|
||||
|
||||
@@ -9,18 +9,20 @@ wget https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_6
|
||||
~~~
|
||||
|
||||
2. Build
|
||||
Below is the build script: it requires utilizing RISC-V vector instructions for acceleration. Ensure the `GGML_CPU_RISCV64_SPACEMIT` compilation option is enabled. The currently supported optimization version is `RISCV64_SPACEMIT_IME1`, corresponding to the `RISCV64_SPACEMIT_IME_SPEC` compilation option. Compiler configurations are defined in the `riscv64-spacemit-linux-gnu-gcc.cmake` file. Please ensure you have installed the RISC-V compiler and set the environment variable via `export RISCV_ROOT_PATH={your_compiler_path}`.
|
||||
Below is the build script: it requires utilizing RISC-V vector instructions for acceleration. Ensure the `GGML_CPU_RISCV64_SPACEMIT` compilation option is enabled. The currently supported optimization version is `RISCV64_SPACEMIT_IME1` and `RISCV64_SPACEMIT_IME2`, corresponding to the `RISCV64_SPACEMIT_IME_SPEC` compilation option. Compiler configurations are defined in the `riscv64-spacemit-linux-gnu-gcc.cmake` file. Please ensure you have installed the RISC-V compiler and set the environment variable via `export RISCV_ROOT_PATH={your_compiler_path}`.
|
||||
```bash
|
||||
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_CPU_RISCV64_SPACEMIT=ON \
|
||||
-DGGML_CPU_REPACK=OFF \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DGGML_RVV=ON \
|
||||
-DGGML_RV_ZVFH=ON \
|
||||
-DGGML_RV_ZFH=ON \
|
||||
-DGGML_RV_ZICBOP=ON \
|
||||
-DGGML_RV_ZIHINTPAUSE=ON \
|
||||
-DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
|
||||
-DGGML_RV_ZBA=ON \
|
||||
-DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake \
|
||||
-DCMAKE_INSTALL_PREFIX=build/installed
|
||||
|
||||
@@ -47,8 +49,25 @@ export RISCV_ROOT_PATH_IME1={your RISC-V compiler path}
|
||||
|
||||
${QEMU_ROOT_PATH}/bin/qemu-riscv64 -L ${RISCV_ROOT_PATH_IME1}/sysroot -cpu max,vlen=256,elen=64,vext_spec=v1.0 ${PWD}/build/bin/llama-cli -m ${PWD}/models/Qwen2.5-0.5B-Instruct-Q4_0.gguf -t 1
|
||||
~~~
|
||||
|
||||
## Quantization Support For Matrix
|
||||
|
||||
| Quantization Type | X60 | A100 |
|
||||
| ---: | ---: | ---: |
|
||||
| Q2_K | | :heavy_check_mark: |
|
||||
| Q3_K | | :heavy_check_mark: |
|
||||
| Q4_0 | :heavy_check_mark: | :heavy_check_mark: |
|
||||
| Q4_1 | :heavy_check_mark: | :heavy_check_mark: |
|
||||
| Q4_K | :heavy_check_mark: | :heavy_check_mark: |
|
||||
| Q5_0 | | :heavy_check_mark: |
|
||||
| Q5_1 | | :heavy_check_mark: |
|
||||
| Q5_K | | :heavy_check_mark: |
|
||||
| Q6_K | | :heavy_check_mark: |
|
||||
| Q8_0 | | :heavy_check_mark: |
|
||||
|
||||
|
||||
## Performance
|
||||
#### Quantization Support For Matrix
|
||||
* Spacemit(R) X60
|
||||
~~~
|
||||
model name : Spacemit(R) X60
|
||||
isa : rv64imafdcv_zicbom_zicboz_zicntr_zicond_zicsr_zifencei_zihintpause_zihpm_zfh_zfhmin_zca_zcd_zba_zbb_zbc_zbs_zkt_zve32f_zve32x_zve64d_zve64f_zve64x_zvfh_zvfhmin_zvkt_sscofpmf_sstc_svinval_svnapot_svpbmt
|
||||
@@ -58,33 +77,34 @@ mvendorid : 0x710
|
||||
marchid : 0x8000000058000001
|
||||
~~~
|
||||
|
||||
Q4_0
|
||||
| Model | Size | Params | backend | threads | test | t/s |
|
||||
| -----------| -------- | ------ | ------- | ------- | ---- |------|
|
||||
Qwen2.5 0.5B |403.20 MiB|630.17 M| cpu | 4 | pp512|64.12 ± 0.26|
|
||||
Qwen2.5 0.5B |403.20 MiB|630.17 M| cpu | 4 | tg128|10.03 ± 0.01|
|
||||
Qwen2.5 1.5B |1011.16 MiB| 1.78 B | cpu | 4 | pp512|24.16 ± 0.02|
|
||||
Qwen2.5 1.5B |1011.16 MiB| 1.78 B | cpu | 4 | tg128|3.83 ± 0.06|
|
||||
Qwen2.5 3B | 1.86 GiB | 3.40 B | cpu | 4 | pp512|12.08 ± 0.02|
|
||||
Qwen2.5 3B | 1.86 GiB | 3.40 B | cpu | 4 | tg128|2.23 ± 0.02|
|
||||
|
||||
Q4_1
|
||||
| Model | Size | Params | backend | threads | test | t/s |
|
||||
| -----------| -------- | ------ | ------- | ------- | ---- |------|
|
||||
Qwen2.5 0.5B |351.50 MiB|494.03 M| cpu | 4 | pp512|62.07 ± 0.12|
|
||||
Qwen2.5 0.5B |351.50 MiB|494.03 M| cpu | 4 | tg128|9.91 ± 0.01|
|
||||
Qwen2.5 1.5B |964.06 MiB| 1.54 B | cpu | 4 | pp512|22.95 ± 0.25|
|
||||
Qwen2.5 1.5B |964.06 MiB| 1.54 B | cpu | 4 | tg128|4.01 ± 0.15|
|
||||
Qwen2.5 3B | 1.85 GiB | 3.09 B | cpu | 4 | pp512|11.55 ± 0.16|
|
||||
Qwen2.5 3B | 1.85 GiB | 3.09 B | cpu | 4 | tg128|2.25 ± 0.04|
|
||||
| model | size | params | backend | threads | n_ubatch | fa | mmap | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | ---: | --------------: | -------------------: |
|
||||
| qwen35 2B Q4_1 | 1.19 GiB | 1.88 B | CPU | 4 | 128 | 1 | 0 | pp128 | 10.32 ± 0.02 |
|
||||
| qwen35 2B Q4_1 | 1.19 GiB | 1.88 B | CPU | 4 | 128 | 1 | 0 | tg128 | 3.07 ± 0.01 |
|
||||
| qwen3 0.6B Q4_0 | 358.78 MiB | 596.05 M | CPU | 4 | 128 | 1 | 0 | pp128 | 49.15 ± 0.25 |
|
||||
| qwen3 0.6B Q4_0 | 358.78 MiB | 596.05 M | CPU | 4 | 128 | 1 | 0 | tg128 | 11.73 ± 0.02 |
|
||||
|
||||
|
||||
Q4_K
|
||||
| Model | Size | Params | backend | threads | test | t/s |
|
||||
| -----------| -------- | ------ | ------- | ------- | ---- |------|
|
||||
Qwen2.5 0.5B |462.96 MiB|630.17 M| cpu | 4 | pp512|9.29 ± 0.05|
|
||||
Qwen2.5 0.5B |462.96 MiB|630.17 M| cpu | 4 | tg128|5.67 ± 0.04|
|
||||
Qwen2.5 1.5B | 1.04 GiB | 1.78 B | cpu | 4 | pp512|10.38 ± 0.10|
|
||||
Qwen2.5 1.5B | 1.04 GiB | 1.78 B | cpu | 4 | tg128|3.17 ± 0.08|
|
||||
Qwen2.5 3B | 1.95 GiB | 3.40 B | cpu | 4 | pp512|4.23 ± 0.04|
|
||||
Qwen2.5 3B | 1.95 GiB | 3.40 B | cpu | 4 | tg128|1.73 ± 0.00|
|
||||
* Spacemit(R) A100
|
||||
~~~
|
||||
model name : Spacemit(R) A100
|
||||
isa : rv64imafdcvh_zicbom_zicbop_zicboz_zicntr_zicond_zicsr_zifencei_zihintntl_zihintpause_zihpm_zimop_zaamo_zalrsc_zawrs_zfa_zfh_zfhmin_zca_zcb_zcd_zcmop_zba_zbb_zbc_zbs_zkt_zvbb_zvbc_zve32f_zve32x_zve64d_zve64f_zve64x_zvfh_zvfhmin_zvkb_zvkg_zvkned_zvknha_zvknhb_zvksed_zvksh_zvkt_smaia_smstateen_ssaia_sscofpmf_sstc_svinval_svnapot_svpbmt_sdtrig
|
||||
mmu : sv39
|
||||
mvendorid : 0x710
|
||||
marchid : 0x8000000041000002
|
||||
mimpid : 0x10000000d5686200
|
||||
hart isa : rv64imafdcv_zicbom_zicbop_zicboz_zicntr_zicond_zicsr_zifencei_zihintntl_zihintpause_zihpm_zimop_zaamo_zalrsc_zawrs_zfa_zfh_zfhmin_zca_zcb_zcd_zcmop_zba_zbb_zbc_zbs_zkt_zvbb_zvbc_zve32f_zve32x_zve64d_zve64f_zve64x_zvfh_zvfhmin_zvkb_zvkg_zvkned_zvknha_zvknhb_zvksed_zvksh_zvkt_smaia_smstateen_ssaia_sscofpmf_sstc_svinval_svnapot_svpbmt_sdtrig
|
||||
~~~
|
||||
|
||||
| model | size | params | backend | threads | n_ubatch | fa | mmap | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | ---: | --------------: | -------------------: |
|
||||
| qwen3 0.6B Q4_0 | 358.78 MiB | 596.05 M | CPU | 8 | 128 | 1 | 0 | pp128 | 565.83 ± 0.31 |
|
||||
| qwen3 0.6B Q4_0 | 358.78 MiB | 596.05 M | CPU | 8 | 128 | 1 | 0 | tg128 | 55.77 ± 0.02 |
|
||||
| qwen3 4B Q4_0 | 2.21 GiB | 4.02 B | CPU | 8 | 128 | 1 | 0 | pp128 | 79.74 ± 0.04 |
|
||||
| qwen3 4B Q4_0 | 2.21 GiB | 4.02 B | CPU | 8 | 128 | 1 | 0 | tg128 | 11.29 ± 0.00 |
|
||||
| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | CPU | 8 | 128 | 1 | 0 | pp128 | 57.88 ± 0.31 |
|
||||
| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | CPU | 8 | 128 | 1 | 0 | tg128 | 12.79 ± 0.00 |
|
||||
| qwen35 2B Q4_1 | 1.19 GiB | 1.88 B | CPU | 8 | 128 | 1 | 0 | pp128 | 115.23 ± 0.04 |
|
||||
| qwen35 2B Q4_1 | 1.19 GiB | 1.88 B | CPU | 8 | 128 | 1 | 0 | tg128 | 16.49 ± 0.01 |
|
||||
| gemma4 E4B Q4_K - Medium | 4.76 GiB | 7.52 B | CPU | 8 | 128 | 1 | 0 | pp128 | 21.13 ± 0.01 |
|
||||
| gemma4 E4B Q4_K - Medium | 4.76 GiB | 7.52 B | CPU | 8 | 128 | 1 | 0 | tg128 | 5.66 ± 0.00 |
|
||||
|
||||
@@ -18,7 +18,7 @@ Legend:
|
||||
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
|
||||
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||
@@ -61,16 +61,17 @@ Legend:
|
||||
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
|
||||
| LOG | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| MUL_MAT_HADAMARD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
|
||||
| NEG | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
|
||||
| NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| OUT_PROD | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | 🟡 |
|
||||
@@ -117,5 +118,5 @@ Legend:
|
||||
| TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| XIELU | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
|
||||
4272
docs/ops/SYCL.csv
4272
docs/ops/SYCL.csv
File diff suppressed because it is too large
Load Diff
11465
docs/ops/WebGPU.csv
11465
docs/ops/WebGPU.csv
File diff suppressed because it is too large
Load Diff
26
examples/llama-eval/README.md
Normal file
26
examples/llama-eval/README.md
Normal file
@@ -0,0 +1,26 @@
|
||||
# llama-eval
|
||||
|
||||
Simple evaluation tool for llama.cpp with support for multiple datasets.
|
||||
|
||||
For a full description, usage examples, and sample results, see:
|
||||
|
||||
- [PR 21152](https://github.com/ggml-org/llama.cpp/pull/21152)
|
||||
|
||||
## Quick start
|
||||
|
||||
```bash
|
||||
# Single server
|
||||
python3 llama-eval.py \
|
||||
--server http://localhost:8033 \
|
||||
--model my-model \
|
||||
--dataset gsm8k --n_cases 100 \
|
||||
--grader-type regex --threads 32
|
||||
|
||||
# Multiple servers (comma-separated URLs and thread counts)
|
||||
python3 llama-eval.py \
|
||||
--server http://server1:8033,http://server2:8033 \
|
||||
--server-name server1,server2 \
|
||||
--threads 16,16 \
|
||||
--dataset aime2025 --n_cases 240 \
|
||||
--grader-type regex
|
||||
```
|
||||
1417
examples/llama-eval/llama-eval.py
Executable file
1417
examples/llama-eval/llama-eval.py
Executable file
File diff suppressed because it is too large
Load Diff
317
examples/llama-eval/llama-server-simulator.py
Executable file
317
examples/llama-eval/llama-server-simulator.py
Executable file
@@ -0,0 +1,317 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import random
|
||||
import re
|
||||
import time
|
||||
import sys
|
||||
import os
|
||||
import threading
|
||||
from http.server import HTTPServer, BaseHTTPRequestHandler
|
||||
from typing import Dict, List, Optional
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import datasets
|
||||
|
||||
# Set cache directory for HuggingFace datasets
|
||||
cache_dir = Path.home() / ".cache" / "huggingface" / "datasets"
|
||||
cache_dir.mkdir(parents=True, exist_ok=True)
|
||||
os.environ["HF_DATASETS_CACHE"] = str(cache_dir)
|
||||
|
||||
def dice(s1: str, s2: str) -> float:
|
||||
"""Calculate Dice coefficient between two strings based on bigram overlap."""
|
||||
if not s1 and not s2:
|
||||
return 1.0
|
||||
|
||||
def _bigrams(s: str):
|
||||
return [s[i : i + 2] for i in range(len(s) - 1)]
|
||||
|
||||
bigrams1 = _bigrams(s1)
|
||||
bigrams2 = _bigrams(s2)
|
||||
|
||||
if not bigrams1 and not bigrams2:
|
||||
return 1.0
|
||||
|
||||
from collections import Counter
|
||||
|
||||
freq1 = Counter(bigrams1)
|
||||
freq2 = Counter(bigrams2)
|
||||
|
||||
intersection = sum(min(freq1[bg], freq2[bg]) for bg in freq1)
|
||||
dice_coeff = 2 * intersection / (len(bigrams1) + len(bigrams2))
|
||||
return dice_coeff
|
||||
|
||||
def debug_log(message: str):
|
||||
"""Log debug messages to both stdout and a file"""
|
||||
print(message, file=sys.stderr)
|
||||
with open("/tmp/simulator-debug.log", "a") as f:
|
||||
f.write(message + "\n")
|
||||
|
||||
simulator: Optional["Simulator"] = None
|
||||
|
||||
@dataclass
|
||||
class EvalState:
|
||||
id: str
|
||||
tasks: List[str]
|
||||
task_states: Dict[str, Dict]
|
||||
sampling_config: Dict
|
||||
|
||||
def normalize_number(s: str) -> Optional[int]:
|
||||
match = re.match(r"\d+", s) # match digits from the start
|
||||
if not match:
|
||||
return None
|
||||
return int(match.group(0))
|
||||
|
||||
class AimeDataset:
|
||||
def __init__(self, split: str = "train"):
|
||||
self.split = split
|
||||
self.questions: List[Dict] = []
|
||||
self._load_dataset()
|
||||
|
||||
def _load_dataset(self):
|
||||
print(f"Loading AIME dataset (split: {self.split})...")
|
||||
|
||||
cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0"
|
||||
if cache_path.exists():
|
||||
print(f"Using cached dataset from {cache_path}")
|
||||
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path))
|
||||
else:
|
||||
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split)
|
||||
|
||||
self.questions = list(ds)
|
||||
print(f"AIME dataset loaded: {len(self.questions)} questions")
|
||||
|
||||
def find_question(self, request_text: str) -> Optional[Dict]:
|
||||
best_match = None
|
||||
best_distance = -1
|
||||
best_index = -1
|
||||
|
||||
for i, question in enumerate(self.questions):
|
||||
question_text = question["problem"]
|
||||
request_lower = request_text.lower()
|
||||
question_lower = question_text.lower()
|
||||
|
||||
# Exact match
|
||||
if question_lower == request_lower:
|
||||
debug_log(f"DEBUG: Found exact match at index {i}")
|
||||
return question
|
||||
|
||||
# Remove LaTeX formatting for more flexible matching
|
||||
question_no_latex = re.sub(r'\$[^$]+\$', '', question_text)
|
||||
if question_no_latex.lower() == request_lower:
|
||||
debug_log(f"DEBUG: Found match (no LaTeX) at index {i}")
|
||||
return question
|
||||
|
||||
# Calculate Dice coefficient for partial matches
|
||||
# Only consider if request is at least 50% of question length
|
||||
if len(request_lower) >= len(question_lower) * 0.5:
|
||||
distance = dice(question_lower, request_lower)
|
||||
|
||||
if distance > best_distance:
|
||||
best_distance = distance
|
||||
best_match = question
|
||||
best_index = i
|
||||
|
||||
if best_match and best_distance > 0.3: # Threshold for partial match
|
||||
debug_log(f"DEBUG: Found best partial match at index {best_index} with distance {best_distance:.3f}")
|
||||
return best_match
|
||||
|
||||
debug_log(f"DEBUG: No matching question found for: {request_text[:100]}...")
|
||||
return None
|
||||
|
||||
def get_answer(self, question: Dict) -> str:
|
||||
answer = question["answer"]
|
||||
if isinstance(answer, str):
|
||||
normalized = normalize_number(answer)
|
||||
return str(normalized) if normalized is not None else answer
|
||||
return str(answer)
|
||||
|
||||
class Simulator:
|
||||
def __init__(
|
||||
self,
|
||||
port: int = 8033,
|
||||
host: str = "localhost",
|
||||
success_rate: float = 0.8,
|
||||
dataset_split: str = "train"
|
||||
):
|
||||
self.port = port
|
||||
self.host = host
|
||||
self.success_rate = success_rate
|
||||
self.dataset = AimeDataset(dataset_split)
|
||||
self.eval_state = EvalState(
|
||||
id="aime-2025",
|
||||
tasks=["aime"],
|
||||
task_states={},
|
||||
sampling_config={"temperature": 0, "max_tokens": 2048}
|
||||
)
|
||||
|
||||
def _generate_response(
|
||||
self,
|
||||
question: Dict,
|
||||
should_be_correct: bool
|
||||
) -> Dict:
|
||||
expected_answer = self.dataset.get_answer(question)
|
||||
|
||||
if should_be_correct:
|
||||
response_text = expected_answer
|
||||
else:
|
||||
response_text = self._generate_wrong_answer(question)
|
||||
|
||||
return {
|
||||
"id": f"chatcmpl-{int(time.time())}",
|
||||
"object": "chat.completion",
|
||||
"created": int(time.time()),
|
||||
"model": "llama",
|
||||
"choices": [
|
||||
{
|
||||
"index": 0,
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": response_text
|
||||
},
|
||||
"finish_reason": "stop"
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 100,
|
||||
"completion_tokens": 50,
|
||||
"total_tokens": 150
|
||||
}
|
||||
}
|
||||
|
||||
def _generate_wrong_answer(self, question: Dict) -> str:
|
||||
expected_answer = self.dataset.get_answer(question)
|
||||
|
||||
if expected_answer.isdigit():
|
||||
wrong_answer = str(int(expected_answer) + 1)
|
||||
else:
|
||||
wrong_answer = expected_answer + " (wrong)"
|
||||
|
||||
return wrong_answer
|
||||
|
||||
def _process_request(self, request_data: Dict) -> Dict:
|
||||
messages = request_data.get("messages", [])
|
||||
if not messages:
|
||||
return {"error": "No messages in request"}
|
||||
|
||||
request_text = messages[0].get("content", "")
|
||||
debug_log(f"DEBUG: Received request with content: {request_text[:150]}...")
|
||||
|
||||
question = self.dataset.find_question(request_text)
|
||||
if not question:
|
||||
debug_log(f"DEBUG: find_question returned None")
|
||||
return {"error": "No matching question found"}
|
||||
|
||||
should_be_correct = random.random() < self.success_rate
|
||||
|
||||
response = self._generate_response(question, should_be_correct)
|
||||
|
||||
task_id = "aime"
|
||||
self.eval_state.task_states[task_id] = {
|
||||
"correct": should_be_correct,
|
||||
"expected": self.dataset.get_answer(question),
|
||||
"predicted": response["choices"][0]["message"]["content"]
|
||||
}
|
||||
|
||||
return response
|
||||
|
||||
class RequestHandler(BaseHTTPRequestHandler):
|
||||
def do_POST(self):
|
||||
if self.path != "/v1/chat/completions":
|
||||
self._send_json({"error": "Not found"}, 404)
|
||||
return
|
||||
|
||||
try:
|
||||
content_length = int(self.headers.get("Content-Length", 0))
|
||||
body = self.rfile.read(content_length)
|
||||
request_data = json.loads(body) if body else None
|
||||
|
||||
if not request_data:
|
||||
self._send_json({"error": "Invalid JSON"}, 400)
|
||||
return
|
||||
|
||||
if simulator is None:
|
||||
self._send_json({"error": "Simulator not initialized"}, 500)
|
||||
return
|
||||
|
||||
response = simulator._process_request(request_data)
|
||||
self._send_json(response, 200)
|
||||
|
||||
except json.JSONDecodeError:
|
||||
self._send_json({"error": "Invalid JSON"}, 400)
|
||||
except Exception as e:
|
||||
print(f"Error processing request: {e}")
|
||||
self._send_json({"error": str(e)}, 500)
|
||||
|
||||
def _send_json(self, data: dict, status: int = 200):
|
||||
body = json.dumps(data).encode("utf-8")
|
||||
self.send_response(status)
|
||||
self.send_header("Content-Type", "application/json")
|
||||
self.send_header("Content-Length", str(len(body)))
|
||||
self.end_headers()
|
||||
self.wfile.write(body)
|
||||
|
||||
def log_message(self, format, *args):
|
||||
# Suppress default request logging
|
||||
pass
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="llama-server simulator for testing eval scripts"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--port",
|
||||
type=int,
|
||||
default=8033,
|
||||
help="Server port (default: 8033)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--host",
|
||||
type=str,
|
||||
default="localhost",
|
||||
help="Server host (default: localhost)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--success-rate",
|
||||
type=float,
|
||||
default=0.8,
|
||||
help="Success rate 0-1 (default: 0.8)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset-split",
|
||||
type=str,
|
||||
default="train",
|
||||
help="AIME dataset split to use (default: train)"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
global simulator
|
||||
simulator = Simulator(
|
||||
port=args.port,
|
||||
host=args.host,
|
||||
success_rate=args.success_rate,
|
||||
dataset_split=args.dataset_split
|
||||
)
|
||||
|
||||
server = HTTPServer((args.host, args.port), RequestHandler)
|
||||
server_thread = threading.Thread(target=server.serve_forever, daemon=True)
|
||||
server_thread.start()
|
||||
|
||||
print("\n=== llama-server-simulator ===")
|
||||
print(f"Server running on http://{args.host}:{args.port}")
|
||||
print(f"Success rate: {args.success_rate}")
|
||||
print(f"AIME dataset loaded: {len(simulator.dataset.questions)} questions")
|
||||
print("\nPress Ctrl+C to stop\n")
|
||||
|
||||
try:
|
||||
server_thread.join()
|
||||
except KeyboardInterrupt:
|
||||
print("\nShutting down...")
|
||||
server.shutdown()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
86
examples/llama-eval/test-simulator.sh
Executable file
86
examples/llama-eval/test-simulator.sh
Executable file
@@ -0,0 +1,86 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
# Get the directory where this script is located
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
|
||||
echo "=== llama-server-simulator Test Script ==="
|
||||
echo ""
|
||||
|
||||
PORT=8033
|
||||
SUCCESS_RATE=0.8
|
||||
TEST_PORT=8034
|
||||
|
||||
echo "Starting simulator on port $PORT with success rate $SUCCESS_RATE..."
|
||||
source "$SCRIPT_DIR/venv/bin/activate"
|
||||
python3 "$SCRIPT_DIR/llama-server-simulator.py" --port $PORT --success-rate $SUCCESS_RATE > /tmp/simulator-test.log 2>&1 &
|
||||
SIMULATOR_PID=$!
|
||||
|
||||
echo "Waiting for simulator to start..."
|
||||
sleep 5
|
||||
|
||||
# Helper function to make a request and extract the answer
|
||||
make_request() {
|
||||
local question="$1"
|
||||
curl -s -X POST http://localhost:$PORT/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{
|
||||
\"model\": \"llama\",
|
||||
\"messages\": [
|
||||
{\"role\": \"user\", \"content\": \"$question\"}
|
||||
],
|
||||
\"temperature\": 0,
|
||||
\"max_tokens\": 2048
|
||||
}" | python3 -c "import sys, json; data = json.load(sys.stdin); print(data.get('choices', [{}])[0].get('message', {}).get('content', data.get('error', 'No response')))"
|
||||
}
|
||||
|
||||
# Test question (repeated in multiple tests)
|
||||
TEST_QUESTION="Quadratic polynomials P(x) and Q(x) have leading coefficients 2 and -2, respectively. The graphs of both polynomials pass through the two points (16,54) and (20,53). Find P(0) + Q(0)."
|
||||
|
||||
echo ""
|
||||
echo "=== Test 1: Correct Answer ==="
|
||||
echo "Sending request with known question..."
|
||||
answer=$(make_request "$TEST_QUESTION")
|
||||
echo "Answer: $answer"
|
||||
echo "Expected: 116"
|
||||
echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")"
|
||||
|
||||
echo ""
|
||||
echo "=== Test 2: Wrong Answer ==="
|
||||
echo "Sending request with known question (success rate 0.0)..."
|
||||
answer=$(make_request "$TEST_QUESTION")
|
||||
echo "Answer: $answer"
|
||||
echo "Expected: 116"
|
||||
echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")"
|
||||
|
||||
echo ""
|
||||
echo "=== Test 3: No Matching Question ==="
|
||||
echo "Sending request with non-matching text..."
|
||||
response=$(make_request "What is the capital of France?")
|
||||
echo "Response: $response"
|
||||
echo "Expected: No matching question found"
|
||||
echo "Correct: $([ "$response" == "No matching question found" ] && echo "Yes" || echo "No")"
|
||||
|
||||
echo ""
|
||||
echo "=== Test 4: Success Rate Verification ==="
|
||||
echo "Sending 10 requests to test success rate..."
|
||||
correct_count=0
|
||||
for i in {1..10}; do
|
||||
answer=$(make_request "$TEST_QUESTION")
|
||||
if [ "$answer" == "116" ]; then
|
||||
correct_count=$((correct_count + 1))
|
||||
fi
|
||||
echo " Request $i: Answer = $answer"
|
||||
done
|
||||
echo "Correct answers: $correct_count/10"
|
||||
echo "Expected: ~8/10 (80% success rate)"
|
||||
echo "Success rate: $(echo "scale=1; $correct_count * 10" | bc)%"
|
||||
|
||||
echo ""
|
||||
echo "=== Test Complete ==="
|
||||
echo "Stopping simulator..."
|
||||
kill $SIMULATOR_PID 2>/dev/null
|
||||
wait $SIMULATOR_PID 2>/dev/null || true
|
||||
|
||||
echo "Simulator stopped."
|
||||
@@ -52,6 +52,10 @@ causal-convert-mm-model:
|
||||
METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
|
||||
./scripts/causal/convert-model.sh
|
||||
|
||||
$(MAKE) causal-convert-mmproj MM_OUTTYPE="$(MM_OUTTYPE)"
|
||||
|
||||
causal-convert-mmproj:
|
||||
$(call validate_model_path,causal-convert-mmproj)
|
||||
@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(MM_OUTTYPE)" MODEL_PATH="$(MODEL_PATH)" \
|
||||
METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
|
||||
./scripts/causal/convert-model.sh --mmproj
|
||||
|
||||
@@ -6,7 +6,7 @@ Demonstration of basic greedy speculative decoding
|
||||
./bin/llama-speculative-simple \
|
||||
-m ../models/qwen2.5-32b-coder-instruct/ggml-model-q8_0.gguf \
|
||||
-md ../models/qwen2.5-1.5b-coder-instruct/ggml-model-q4_0.gguf \
|
||||
-f test.txt -c 0 -ngl 99 --color \
|
||||
--sampling-seq k --top-k 1 -fa --temp 0.0 \
|
||||
-ngld 99 --draft-max 16 --draft-min 5 --draft-p-min 0.9
|
||||
-f test.txt -c 0 -ngl 99 --color on \
|
||||
--sampling-seq k --top-k 1 -fa on --temp 0.0 \
|
||||
-ngld 99 --spec-draft-n-max 16 --spec-draft-n-draft-min 5 --draft-p-min 0.9
|
||||
```
|
||||
|
||||
@@ -13,20 +13,6 @@
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
|
||||
struct spec_checkpoint {
|
||||
int64_t n_tokens = 0;
|
||||
|
||||
std::vector<uint8_t> data;
|
||||
|
||||
size_t size() const {
|
||||
return data.size();
|
||||
}
|
||||
|
||||
bool empty() const {
|
||||
return data.empty();
|
||||
}
|
||||
};
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
|
||||
@@ -43,11 +29,6 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (params.speculative.draft.mparams.path.empty()) {
|
||||
LOG_ERR("%s: --model-draft is required\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// init llama.cpp
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
@@ -62,18 +43,11 @@ int main(int argc, char ** argv) {
|
||||
model_tgt = llama_init_tgt->model();
|
||||
ctx_tgt = llama_init_tgt->context();
|
||||
|
||||
// check if the context supports partial sequence removal
|
||||
const auto ctx_seq_rm = common_context_can_seq_rm(ctx_tgt);
|
||||
const bool use_ckpt = (ctx_seq_rm == COMMON_CONTEXT_SEQ_RM_TYPE_FULL);
|
||||
|
||||
if (use_ckpt) {
|
||||
LOG_INF("speculative decoding will use checkpoints (context does not support partial sequence removal)\n");
|
||||
}
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model_tgt);
|
||||
|
||||
// load the draft model
|
||||
llama_model_ptr model_dft;
|
||||
llama_context_ptr ctx_dft;
|
||||
|
||||
// TODO: simplify this logic
|
||||
{
|
||||
@@ -81,9 +55,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
auto params_dft = params;
|
||||
|
||||
params_dft.n_parallel = 1;
|
||||
params_dft.n_ctx = params_spec.n_ctx;
|
||||
params_dft.n_batch = llama_n_ctx_seq(ctx_tgt);
|
||||
params_dft.devices = params_spec.devices;
|
||||
params_dft.model = params_spec.mparams;
|
||||
params_dft.n_gpu_layers = params_spec.n_gpu_layers;
|
||||
@@ -103,8 +74,19 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
params.speculative.draft.model = model_dft.get();
|
||||
params.speculative.draft.cparams = common_context_params_to_llama(params_dft);
|
||||
auto cparams = common_context_params_to_llama(params_dft);
|
||||
ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams));
|
||||
|
||||
params.speculative.draft.ctx_tgt = ctx_tgt;
|
||||
params.speculative.draft.ctx_dft = ctx_dft.get();
|
||||
}
|
||||
|
||||
// check if the context supports partial sequence removal
|
||||
const bool use_ckpt_tgt = (common_context_can_seq_rm(ctx_tgt) == COMMON_CONTEXT_SEQ_RM_TYPE_FULL);
|
||||
const bool use_ckpt_dft = (common_context_can_seq_rm(ctx_dft.get()) == COMMON_CONTEXT_SEQ_RM_TYPE_FULL);
|
||||
|
||||
if (use_ckpt_tgt) {
|
||||
LOG_INF("speculative decoding will use checkpoints (context does not support partial sequence removal)\n");
|
||||
}
|
||||
|
||||
// Tokenize the prompt
|
||||
@@ -136,6 +118,8 @@ int main(int argc, char ** argv) {
|
||||
// used to determine end of generation
|
||||
bool has_eos = false;
|
||||
|
||||
llama_seq_id seq_id = 0;
|
||||
|
||||
// ================================================
|
||||
// everything until here is standard initialization
|
||||
// the relevant stuff for speculative decoding starts here
|
||||
@@ -146,7 +130,8 @@ int main(int argc, char ** argv) {
|
||||
common_sampler_ptr smpl(common_sampler_init(model_tgt, params.sampling));
|
||||
|
||||
// eval the prompt
|
||||
llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
|
||||
llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
|
||||
llama_decode(ctx_dft.get(), llama_batch_get_one(inp.data(), inp.size() - 1));
|
||||
|
||||
// note: keep the last token separate!
|
||||
llama_token id_last = inp.back();
|
||||
@@ -160,16 +145,16 @@ int main(int argc, char ** argv) {
|
||||
// init the speculator
|
||||
const auto & params_spec = params.speculative;
|
||||
|
||||
struct common_speculative * spec = common_speculative_init(params.speculative, ctx_tgt);
|
||||
struct common_speculative * spec = common_speculative_init(params.speculative, 1);
|
||||
|
||||
common_speculative_begin(spec, prompt_tgt);
|
||||
common_speculative_begin(spec, seq_id, prompt_tgt);
|
||||
|
||||
llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);
|
||||
|
||||
size_t n_draft = 0;
|
||||
|
||||
llama_tokens draft;
|
||||
spec_checkpoint spec_ckpt;
|
||||
common_prompt_checkpoint ckpt;
|
||||
|
||||
const auto t_enc_end = ggml_time_us();
|
||||
|
||||
@@ -184,40 +169,57 @@ int main(int argc, char ** argv) {
|
||||
// from a cache or lookup tables.
|
||||
//
|
||||
if (draft.empty()) {
|
||||
ckpt.update_pos(
|
||||
prompt_tgt.size(),
|
||||
llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), seq_id),
|
||||
llama_memory_seq_pos_max(llama_get_memory(ctx_tgt), seq_id));
|
||||
|
||||
if (use_ckpt_dft) {
|
||||
ckpt.update_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
|
||||
}
|
||||
|
||||
// generate a new draft
|
||||
draft = common_speculative_draft(spec, params_spec, prompt_tgt, id_last);
|
||||
common_speculative_get_draft_params(spec, seq_id) = {
|
||||
/* .drafting = */ true,
|
||||
/* .n_max = */ -1,
|
||||
/* .n_past = */ n_past,
|
||||
/* .id_last = */ id_last,
|
||||
/* .prompt = */ &prompt_tgt,
|
||||
/* .result = */ &draft, // output
|
||||
};
|
||||
common_speculative_draft(spec);
|
||||
|
||||
// save the original draft size
|
||||
n_draft = draft.size();
|
||||
|
||||
// save a checkpoint of the target context before evaluating the draft
|
||||
// this allows us to restore the state if partial draft acceptance occurs
|
||||
if (!draft.empty() && use_ckpt) {
|
||||
const size_t ckpt_size = llama_state_seq_get_size_ext(ctx_tgt, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
spec_ckpt.data.resize(ckpt_size);
|
||||
if (!draft.empty()) {
|
||||
if (use_ckpt_tgt) {
|
||||
ckpt.update_tgt(ctx_tgt, seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
|
||||
}
|
||||
}
|
||||
|
||||
const size_t n = llama_state_seq_get_data_ext(ctx_tgt, spec_ckpt.data.data(), ckpt_size, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
GGML_ASSERT(n == ckpt_size);
|
||||
{
|
||||
ckpt.load_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
|
||||
|
||||
spec_ckpt.n_tokens = (int64_t) prompt_tgt.size();
|
||||
LOG_DBG("created speculative checkpoint (n_tokens = %" PRId64 ", size = %.3f MiB)\n",
|
||||
spec_ckpt.n_tokens, (float) spec_ckpt.data.size() / 1024 / 1024);
|
||||
llama_memory_seq_rm(llama_get_memory(ctx_dft.get()), seq_id, ckpt.pos_max + 1, -1);
|
||||
}
|
||||
} else {
|
||||
// we have a previous (partial) draft to reuse from checkpoint restoration
|
||||
if (use_ckpt) {
|
||||
GGML_ASSERT(!spec_ckpt.empty());
|
||||
if (use_ckpt_tgt) {
|
||||
GGML_ASSERT(!ckpt.empty());
|
||||
}
|
||||
}
|
||||
|
||||
// always have a token to evaluate from before - id_last
|
||||
common_batch_clear(batch_tgt);
|
||||
common_batch_add (batch_tgt, id_last, n_past++, { 0 }, true);
|
||||
common_batch_add (batch_tgt, id_last, n_past++, { seq_id }, true);
|
||||
|
||||
// evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
|
||||
{
|
||||
for (size_t i = 0; i < draft.size(); ++i) {
|
||||
common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
|
||||
common_batch_add(batch_tgt, draft[i], n_past + i, { seq_id }, true);
|
||||
}
|
||||
|
||||
//LOG_DBG("target batch: %s\n", string_from(ctx_tgt, batch_tgt).c_str());
|
||||
@@ -225,9 +227,15 @@ int main(int argc, char ** argv) {
|
||||
llama_decode(ctx_tgt, batch_tgt);
|
||||
}
|
||||
|
||||
// evaluate the same batch with the draft model
|
||||
{
|
||||
// TODO: extend to support MTP, Eagle, etc. See server code for reference
|
||||
llama_decode(ctx_dft.get(), batch_tgt);
|
||||
}
|
||||
|
||||
// only save the sampler sampler state if we use checkpoints
|
||||
common_sampler_ptr smpl_save;
|
||||
if (use_ckpt) {
|
||||
if (use_ckpt_tgt) {
|
||||
smpl_save.reset(common_sampler_clone(smpl.get()));
|
||||
}
|
||||
|
||||
@@ -247,17 +255,24 @@ int main(int argc, char ** argv) {
|
||||
// check for partial draft acceptance:
|
||||
// if the context doesn't support partial sequence removal, restore the checkpoint
|
||||
// and make the accepted tokens the new partial draft for the next iteration
|
||||
if (use_ckpt && ids.size() - 1 < draft.size()) {
|
||||
if (use_ckpt_tgt && ids.size() - 1 < draft.size()) {
|
||||
LOG_DBG("partial acceptance: %zu < %zu, restoring checkpoint\n", ids.size() - 1, draft.size());
|
||||
|
||||
draft = std::move(ids);
|
||||
|
||||
const size_t n = llama_state_seq_set_data_ext(ctx_tgt, spec_ckpt.data.data(), spec_ckpt.size(), 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
GGML_ASSERT(n == spec_ckpt.size());
|
||||
{
|
||||
ckpt.load_tgt(ctx_tgt, seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
|
||||
|
||||
llama_memory_seq_rm(llama_get_memory(ctx_tgt), 0, spec_ckpt.n_tokens, -1);
|
||||
llama_memory_seq_rm(llama_get_memory(ctx_tgt), seq_id, ckpt.pos_max + 1, -1);
|
||||
}
|
||||
|
||||
prompt_tgt.resize(spec_ckpt.n_tokens);
|
||||
{
|
||||
ckpt.load_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
|
||||
|
||||
llama_memory_seq_rm(llama_get_memory(ctx_dft.get()), seq_id, ckpt.pos_max + 1, -1);
|
||||
}
|
||||
|
||||
prompt_tgt.resize(ckpt.n_tokens);
|
||||
smpl = std::move(smpl_save);
|
||||
|
||||
n_past = (int) prompt_tgt.size();
|
||||
@@ -265,7 +280,7 @@ int main(int argc, char ** argv) {
|
||||
continue;
|
||||
}
|
||||
|
||||
common_speculative_accept(spec, ids.size() - 1);
|
||||
common_speculative_accept(spec, seq_id, ids.size() - 1);
|
||||
|
||||
// full acceptance: consume the draft and commit accepted tokens
|
||||
n_past += ids.size() - 1;
|
||||
@@ -305,7 +320,8 @@ int main(int argc, char ** argv) {
|
||||
{
|
||||
LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
|
||||
|
||||
llama_memory_seq_rm(llama_get_memory(ctx_tgt), 0, n_past, -1);
|
||||
llama_memory_seq_rm(llama_get_memory(ctx_tgt), seq_id, n_past, -1);
|
||||
llama_memory_seq_rm(llama_get_memory(ctx_dft.get()), seq_id, n_past, -1);
|
||||
}
|
||||
|
||||
if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
|
||||
|
||||
58
flake.lock
generated
58
flake.lock
generated
@@ -1,58 +0,0 @@
|
||||
{
|
||||
"nodes": {
|
||||
"flake-parts": {
|
||||
"inputs": {
|
||||
"nixpkgs-lib": "nixpkgs-lib"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1730504689,
|
||||
"narHash": "sha256-hgmguH29K2fvs9szpq2r3pz2/8cJd2LPS+b4tfNFCwE=",
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"rev": "506278e768c2a08bec68eb62932193e341f55c90",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1732014248,
|
||||
"narHash": "sha256-y/MEyuJ5oBWrWAic/14LaIr/u5E0wRVzyYsouYY3W6w=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "23e89b7da85c3640bbc2173fe04f4bd114342367",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs-lib": {
|
||||
"locked": {
|
||||
"lastModified": 1730504152,
|
||||
"narHash": "sha256-lXvH/vOfb4aGYyvFmZK/HlsNsr/0CVWlwYvo2rxJk3s=",
|
||||
"type": "tarball",
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
|
||||
},
|
||||
"original": {
|
||||
"type": "tarball",
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"flake-parts": "flake-parts",
|
||||
"nixpkgs": "nixpkgs"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
||||
@@ -5,7 +5,7 @@ project("ggml" C CXX ASM)
|
||||
### GGML Version
|
||||
set(GGML_VERSION_MAJOR 0)
|
||||
set(GGML_VERSION_MINOR 11)
|
||||
set(GGML_VERSION_PATCH 0)
|
||||
set(GGML_VERSION_PATCH 1)
|
||||
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
|
||||
@@ -249,6 +249,7 @@ option(GGML_SYCL "ggml: use SYCL"
|
||||
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
|
||||
option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
|
||||
option(GGML_SYCL_HOST_MEM_FALLBACK "ggml: allow host memory fallback in SYCL reorder (requires kernel 6.8+)" ON)
|
||||
option(GGML_SYCL_SUPPORT_LEVEL_ZERO "ggml: use Level Zero API in SYCL backend" ON)
|
||||
option(GGML_SYCL_DNN "ggml: enable oneDNN in the SYCL backend" ON)
|
||||
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
|
||||
"ggml: sycl target device")
|
||||
|
||||
@@ -450,12 +450,22 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
ggml-cpu/arch/riscv/repack.cpp
|
||||
)
|
||||
if (GGML_CPU_RISCV64_SPACEMIT)
|
||||
include(ggml-cpu/cmake/FindSMTIME.cmake)
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_RISCV64_SPACEMIT ${RISCV64_SPACEMIT_IME_SPEC})
|
||||
list(APPEND GGML_CPU_SOURCES
|
||||
ggml-cpu/spacemit/ime.cpp
|
||||
ggml-cpu/spacemit/ime.h
|
||||
ggml-cpu/spacemit/spine_mem_pool.cpp
|
||||
ggml-cpu/spacemit/spine_mem_pool.h
|
||||
ggml-cpu/spacemit/repack.cpp
|
||||
ggml-cpu/spacemit/repack.h
|
||||
ggml-cpu/spacemit/ime_env.cpp
|
||||
ggml-cpu/spacemit/ime_env.h
|
||||
ggml-cpu/spacemit/ime1_kernels.cpp
|
||||
ggml-cpu/spacemit/ime2_kernels.cpp
|
||||
ggml-cpu/spacemit/ime_kernels.h
|
||||
ggml-cpu/spacemit/rvv_kernels.cpp
|
||||
ggml-cpu/spacemit/rvv_kernels.h
|
||||
)
|
||||
endif()
|
||||
if(NOT GGML_CPU_ALL_VARIANTS)
|
||||
@@ -485,6 +495,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
if (GGML_RV_ZIHINTPAUSE)
|
||||
string(APPEND MARCH_STR "_zihintpause")
|
||||
endif()
|
||||
if (GGML_RV_ZBA)
|
||||
string(APPEND MARCH_STR "_zba")
|
||||
endif()
|
||||
if (GGML_CPU_RISCV64_SPACEMIT)
|
||||
# `xsmtvdotii' is only required for GCC >= 15.
|
||||
if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND
|
||||
|
||||
32
ggml/src/ggml-cpu/cmake/FindSMTIME.cmake
Normal file
32
ggml/src/ggml-cpu/cmake/FindSMTIME.cmake
Normal file
@@ -0,0 +1,32 @@
|
||||
include(CheckCSourceRuns)
|
||||
|
||||
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)" AND GGML_CPU_RISCV64_SPACEMIT)
|
||||
set(SMT_MARCH_STR "-march=rv64gcv_zfh_zvfh_zba_zicbop")
|
||||
if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND
|
||||
CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 15)
|
||||
string(APPEND SMT_MARCH_STR "_xsmtvdotii")
|
||||
endif()
|
||||
set(CMAKE_REQUIRED_FLAGS "${SMT_MARCH_STR}")
|
||||
|
||||
check_c_source_compiles("int main() {__asm__ volatile(\"vmadot v2, v0, v1\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_IME1)
|
||||
check_c_source_compiles("int main() {__asm__ volatile(\"vmadot v2, v0, v1, i4\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOT_S4)
|
||||
check_c_source_compiles("int main() {__asm__ volatile(\"vmadot v2, v0, v1, i8\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOT_S8)
|
||||
check_c_source_compiles("int main() {__asm__ volatile(\"vfwmadot v2, v0, v1, fp16\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VFWMADOT_FP16)
|
||||
check_c_source_compiles("int main() {__asm__ volatile(\"vmadot.hp v2, v0, v1, v0, 0, i4\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VFMADOT_S4)
|
||||
check_c_source_compiles("int main() {__asm__ volatile(\"vmadot.hp v2, v0, v1, v0, 0, i8\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VFMADOT_S8)
|
||||
check_c_source_compiles("int main() {__asm__ volatile(\"vmadot1 v2, v0, v1\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOTN)
|
||||
check_c_source_compiles("int main() {__asm__ volatile(\"vpack.vv v2, v0, v1, 2\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VPACK)
|
||||
check_c_source_compiles("int main() {__asm__ volatile(\"vnspack.vv v2, v0, v1, 2\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VNPACK)
|
||||
unset(CMAKE_REQUIRED_FLAGS)
|
||||
|
||||
list(APPEND RISCV64_SPACEMIT_IME_SPEC "")
|
||||
if (SPACEMIT_RISCV_COMPILER_SUPPORT_IME1)
|
||||
set(RISCV64_SPACEMIT_IME_SPEC "RISCV64_SPACEMIT_IME1")
|
||||
endif()
|
||||
|
||||
if (SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOT_S4 AND SPACEMIT_RISCV_COMPILER_SUPPORT_VPACK AND SPACEMIT_RISCV_COMPILER_SUPPORT_VNPACK)
|
||||
list(APPEND RISCV64_SPACEMIT_IME_SPEC "RISCV64_SPACEMIT_IME2")
|
||||
endif()
|
||||
|
||||
message("RISCV64_SPACEMIT_IME_SPEC: ${RISCV64_SPACEMIT_IME_SPEC}")
|
||||
endif()
|
||||
@@ -50,6 +50,10 @@
|
||||
#include "llamafile/sgemm.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
|
||||
# include "spacemit/ime.h"
|
||||
#endif
|
||||
|
||||
// Note: once we move threading into a separate C++ file
|
||||
// will use std::hardware_destructive_interference_size instead of hardcoding it here
|
||||
// and we'll use C++ attribute syntax.
|
||||
@@ -3011,7 +3015,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
const struct ggml_cgraph * cgraph = tp->cgraph;
|
||||
const struct ggml_cplan * cplan = tp->cplan;
|
||||
|
||||
#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
|
||||
ggml_backend_cpu_riscv64_spacemit_set_numa_thread_affinity(state->ith);
|
||||
#else
|
||||
set_numa_thread_affinity(state->ith);
|
||||
#endif
|
||||
|
||||
struct ggml_compute_params params = {
|
||||
/*.ith =*/ state->ith,
|
||||
@@ -3068,6 +3076,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
|
||||
ggml_barrier(state->threadpool);
|
||||
|
||||
#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
|
||||
ggml_backend_cpu_riscv64_spacemit_clear_numa_thread_affinity_threaded(state->ith);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -8,6 +8,14 @@ extern "C" {
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cpu_riscv64_spacemit_buffer_type(void);
|
||||
|
||||
void ggml_backend_cpu_riscv64_spacemit_set_numa_thread_affinity(int thread_n);
|
||||
|
||||
void ggml_backend_cpu_riscv64_spacemit_clear_numa_thread_affinity_threaded(int thread_n);
|
||||
|
||||
void * ggml_backend_cpu_riscv64_spacemit_alloc_shared(size_t size, size_t alignment);
|
||||
|
||||
void ggml_backend_cpu_riscv64_spacemit_free_shared(void * ptr);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
5768
ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp
Normal file
5768
ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp
Normal file
File diff suppressed because it is too large
Load Diff
320
ggml/src/ggml-cpu/spacemit/ime_env.cpp
Normal file
320
ggml/src/ggml-cpu/spacemit/ime_env.cpp
Normal file
@@ -0,0 +1,320 @@
|
||||
#include "ime_env.h"
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "spine_mem_pool.h"
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cctype>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace ggml::cpu::riscv64_spacemit {
|
||||
bool spine_core_info::get_spine_core_info(std::vector<spine_core_info> & result) {
|
||||
static std::unordered_map<uint64_t, spine_core_arch_id> spine_march_mapping_ = {
|
||||
{0x8000000058000001, spine_core_arch_id::core_arch_x60 },
|
||||
{ 0x8000000041000001, spine_core_arch_id::core_arch_a60 },
|
||||
{ 0x8000000058000002, spine_core_arch_id::core_arch_x100},
|
||||
{ 0x8000000041000002, spine_core_arch_id::core_arch_a100},
|
||||
};
|
||||
|
||||
result.clear();
|
||||
std::ifstream file("/proc/cpuinfo");
|
||||
std::string line;
|
||||
|
||||
std::vector<std::array<uint64_t, 2>> cpu_info_list;
|
||||
|
||||
uint64_t current_processor = spine_invalid_core_id;
|
||||
uint64_t current_marchid = 0;
|
||||
bool has_processor = false;
|
||||
bool has_marchid = false;
|
||||
|
||||
if (!file.is_open()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
while (std::getline(file, line)) {
|
||||
if (line.substr(0, 9) == "processor") {
|
||||
if (has_processor && has_marchid) {
|
||||
cpu_info_list.push_back({ current_processor, current_marchid });
|
||||
}
|
||||
|
||||
size_t colon_pos = line.find(':');
|
||||
if (colon_pos != std::string::npos) {
|
||||
current_processor = std::stoi(line.substr(colon_pos + 1));
|
||||
has_processor = true;
|
||||
}
|
||||
|
||||
has_marchid = false;
|
||||
} else if (line.substr(0, 7) == "marchid") {
|
||||
size_t colon_pos = line.find(':');
|
||||
if (colon_pos != std::string::npos) {
|
||||
std::string marchid_str = line.substr(colon_pos + 1);
|
||||
marchid_str.erase(std::remove_if(marchid_str.begin(), marchid_str.end(), isspace), marchid_str.end());
|
||||
current_marchid = std::stoull(marchid_str, nullptr, 16);
|
||||
has_marchid = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (has_processor && has_marchid) {
|
||||
cpu_info_list.push_back({ current_processor, current_marchid });
|
||||
}
|
||||
|
||||
if (has_processor && has_marchid) {
|
||||
for (auto & cpu_info : cpu_info_list) {
|
||||
if (cpu_info[0] != spine_invalid_core_id &&
|
||||
spine_march_mapping_.find(cpu_info[1]) != spine_march_mapping_.end()) {
|
||||
auto core_info = spine_core_info();
|
||||
core_info.core_id = cpu_info[0];
|
||||
core_info.arch_id = spine_core_arch_id(spine_march_mapping_[cpu_info[1]]);
|
||||
|
||||
result.push_back(core_info);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return has_processor && has_marchid;
|
||||
}
|
||||
|
||||
namespace {
|
||||
uint16_t hex_string_to_u16(const std::string & hex_str) {
|
||||
try {
|
||||
size_t pos = 0;
|
||||
if (hex_str.substr(0, 2) == "0x" || hex_str.substr(0, 2) == "0X") {
|
||||
pos = 2;
|
||||
}
|
||||
unsigned long result = std::stoul(hex_str.substr(pos), nullptr, 16);
|
||||
if (result > std::numeric_limits<uint16_t>::max()) {
|
||||
throw std::out_of_range("Converted value is out of range for uint16_t");
|
||||
}
|
||||
return static_cast<uint16_t>(result);
|
||||
} catch (const std::invalid_argument & e) {
|
||||
throw std::invalid_argument("Invalid hexadecimal string");
|
||||
} catch (const std::out_of_range & e) {
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
const char * spine_mem_pool_backend_to_string(spine_mem_pool_backend backend) {
|
||||
switch (backend) {
|
||||
case spine_mem_pool_backend::none:
|
||||
return "NONE";
|
||||
case spine_mem_pool_backend::posix_memalign:
|
||||
return "POSIX";
|
||||
case spine_mem_pool_backend::transparent_hugepage:
|
||||
return "HPAGE";
|
||||
case spine_mem_pool_backend::hugetlb_1g:
|
||||
return "HPAGE1GB";
|
||||
}
|
||||
|
||||
return "unknown";
|
||||
}
|
||||
|
||||
spine_mem_pool_backend parse_mem_backend(const char * mem_backend_str) {
|
||||
if (mem_backend_str == nullptr || mem_backend_str[0] == '\0') {
|
||||
return spine_mem_pool_backend::transparent_hugepage;
|
||||
}
|
||||
|
||||
std::string value(mem_backend_str);
|
||||
std::transform(value.begin(), value.end(), value.begin(),
|
||||
[](unsigned char ch) { return static_cast<char>(std::tolower(ch)); });
|
||||
|
||||
if (value == "none") {
|
||||
return spine_mem_pool_backend::none;
|
||||
}
|
||||
|
||||
if (value == "posix") {
|
||||
return spine_mem_pool_backend::posix_memalign;
|
||||
}
|
||||
|
||||
if (value == "hpage") {
|
||||
return spine_mem_pool_backend::transparent_hugepage;
|
||||
}
|
||||
|
||||
if (value == "hpage1gb") {
|
||||
return spine_mem_pool_backend::hugetlb_1g;
|
||||
}
|
||||
|
||||
throw std::runtime_error("invalid SPACEMIT_MEM_BACKEND: " + value + ", expected NONE, POSIX, HPAGE or HPAGE1GB");
|
||||
}
|
||||
} // namespace
|
||||
|
||||
spine_env_info::spine_env_info() {
|
||||
num_cores = static_cast<int>(std::thread::hardware_concurrency());
|
||||
spine_core_info::get_spine_core_info(core_info_list);
|
||||
|
||||
// special for x60 K1
|
||||
if (core_info_list.size() == 8 && core_info_list[0].arch_id == spine_core_arch_id::core_arch_x60) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
core_info_list[i].arch_id = spine_core_arch_id::core_arch_a60;
|
||||
}
|
||||
}
|
||||
|
||||
// special for qemu
|
||||
if (core_info_list.size() == 0) {
|
||||
char * spine_core_arch_str = getenv("SPACEMIT_CORE_ARCH");
|
||||
if (spine_core_arch_str != nullptr) {
|
||||
auto arch_id = hex_string_to_u16(spine_core_arch_str);
|
||||
for (int i = 0; i < num_cores; i++) {
|
||||
auto core_info = spine_core_info();
|
||||
core_info.core_id = i;
|
||||
core_info.arch_id = spine_core_arch_id{ arch_id };
|
||||
core_info_list.push_back(core_info);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (core_info_list.size() == 0) {
|
||||
throw std::runtime_error(
|
||||
"Failed to get SPACEMIT_CORE_ARCH from environment or failed to parse it from /proc/cpuinfo");
|
||||
}
|
||||
|
||||
char * spine_perfer_core_arch_str = getenv("SPACEMIT_PERFER_CORE_ARCH");
|
||||
if (spine_perfer_core_arch_str != nullptr && spine_perfer_core_arch_str != "") {
|
||||
perfer_core_arch_id = spine_core_arch_id{ hex_string_to_u16(spine_perfer_core_arch_str) };
|
||||
}
|
||||
|
||||
char * spine_perfer_core_id_str = getenv("SPACEMIT_PERFER_CORE_ID");
|
||||
std::vector<int> perfer_core_id_vec;
|
||||
if (spine_perfer_core_id_str != nullptr && spine_perfer_core_id_str != "") {
|
||||
std::string perfer_core_id_str(spine_perfer_core_id_str);
|
||||
size_t start = 0;
|
||||
size_t end = 0;
|
||||
while ((end = perfer_core_id_str.find(',', start)) != std::string::npos) {
|
||||
std::string core_id_substr = perfer_core_id_str.substr(start, end - start);
|
||||
perfer_core_id_vec.push_back(std::stoi(core_id_substr));
|
||||
start = end + 1;
|
||||
}
|
||||
std::string core_id_substr = perfer_core_id_str.substr(start);
|
||||
perfer_core_id_vec.push_back(std::stoi(core_id_substr));
|
||||
}
|
||||
|
||||
perfer_core_ids.reserve(num_cores);
|
||||
if (perfer_core_arch_id == spine_core_arch_id::core_arch_none) {
|
||||
for (auto & core_info : core_info_list) {
|
||||
auto core_arch_id = core_info.arch_id;
|
||||
auto core_arch_head = (uint16_t) (core_arch_id) >> 12;
|
||||
if (core_arch_head == 0xA) {
|
||||
num_perfer_cores++;
|
||||
perfer_core_arch_id = core_arch_id;
|
||||
cpu_mask |= (1ULL << core_info.core_id);
|
||||
perfer_core_ids.push_back(core_info.core_id);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (auto & core_info : core_info_list) {
|
||||
auto core_arch_id = core_info.arch_id;
|
||||
if (core_arch_id == perfer_core_arch_id) {
|
||||
num_perfer_cores++;
|
||||
cpu_mask |= (1ULL << core_info.core_id);
|
||||
|
||||
auto core_arch_head = (uint16_t) (core_arch_id) >> 12;
|
||||
if (core_arch_head == 0xA) {
|
||||
perfer_core_ids.push_back(core_info.core_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (num_perfer_cores == 0) {
|
||||
GGML_ABORT("can not find core with arch id %x for SPACEMIT_PERFER_CORE_ARCH in core info list\n",
|
||||
(uint16_t) perfer_core_arch_id);
|
||||
}
|
||||
}
|
||||
|
||||
if (perfer_core_id_vec.size() > 0) {
|
||||
perfer_core_ids.clear();
|
||||
cpu_mask = 0;
|
||||
num_perfer_cores = 0;
|
||||
for (int core_id : perfer_core_id_vec) {
|
||||
if (core_id < 0 || core_id >= num_cores) {
|
||||
GGML_ABORT("invalid core id in SPACEMIT_PERFER_CORE_ID: %d, should be between 0 and %d\n", core_id,
|
||||
num_cores - 1);
|
||||
}
|
||||
auto core_info = core_info_list[core_id];
|
||||
auto core_arch_id = core_info.arch_id;
|
||||
if (core_arch_id == perfer_core_arch_id) {
|
||||
cpu_mask |= (1ULL << core_id);
|
||||
perfer_core_ids.push_back(core_id);
|
||||
} else {
|
||||
GGML_ABORT(
|
||||
"core id %d in SPACEMIT_PERFER_CORE_ID has arch id %x which does not match "
|
||||
"SPACEMIT_PERFER_CORE_ARCH %x\n",
|
||||
core_id, (uint16_t) core_arch_id, (uint16_t) perfer_core_arch_id);
|
||||
}
|
||||
}
|
||||
std::string perfer_core_id_vec_str;
|
||||
for (int core_id : perfer_core_id_vec) {
|
||||
perfer_core_id_vec_str += std::to_string(core_id) + ",";
|
||||
}
|
||||
perfer_core_id_vec_str.pop_back();
|
||||
GGML_LOG_DEBUG("SPACEMIT_PERFER_CORE_ID is set, perferred core ids: %s\n", perfer_core_id_vec_str.c_str());
|
||||
num_perfer_cores = static_cast<int>(perfer_core_id_vec.size());
|
||||
}
|
||||
|
||||
use_ime1 = perfer_core_arch_id == spine_core_arch_id::core_arch_a60 ||
|
||||
perfer_core_arch_id == spine_core_arch_id::core_arch_x100;
|
||||
|
||||
use_ime2 = perfer_core_arch_id == spine_core_arch_id::core_arch_a100;
|
||||
|
||||
mem_backend = parse_mem_backend(getenv("SPACEMIT_MEM_BACKEND"));
|
||||
char * spine_disable_tcm_str = getenv("SPACEMIT_DISABLE_TCM");
|
||||
auto user_disable_tcm = spine_disable_tcm_str != nullptr && strcmp(spine_disable_tcm_str, "0") != 0;
|
||||
|
||||
if (!user_disable_tcm) {
|
||||
spine_mem_pool_tcm_info tcm_info;
|
||||
if (spine_mem_pool_tcm_init(&tcm_info)) {
|
||||
use_tcm = tcm_info.available;
|
||||
tcm_blk_size = tcm_info.blk_size;
|
||||
GGML_LOG_DEBUG("CPU_RISCV64_SPACEMIT: tcm is available, blk_size: %zu, blk_num: %zu, is_fake_tcm: %d\n",
|
||||
tcm_info.blk_size, tcm_info.blk_num, tcm_info.is_fake_tcm);
|
||||
|
||||
for (auto & core_info : core_info_list) {
|
||||
auto core_arch_head = (uint16_t) (core_info.arch_id) >> 12;
|
||||
if (core_arch_head != 0xA) {
|
||||
aicpu_id_offset++;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GGML_LOG_DEBUG(
|
||||
"CPU_RISCV64_SPACEMIT: num_cores: %d, num_perfer_cores: %d, perfer_core_arch_id: %x, exclude_main_thread: %d, "
|
||||
"use_ime1: %d, use_ime2: %d, mem_backend: %s, cpu_mask: %lx, aicpu_id_offset: %d\n",
|
||||
num_cores, num_perfer_cores, (uint16_t) perfer_core_arch_id, exclude_main_thread, use_ime1, use_ime2,
|
||||
spine_mem_pool_backend_to_string(mem_backend), cpu_mask, aicpu_id_offset);
|
||||
|
||||
const size_t init_barrier_size = sizeof(spine_barrier_t) * spine_init_barrier_count;
|
||||
init_barrier =
|
||||
static_cast<spine_barrier_t *>(spine_mem_pool_shared_mem_alloc(init_barrier_size, alignof(spine_barrier_t)));
|
||||
if (init_barrier != nullptr) {
|
||||
init_barrier_is_shared_mem = true;
|
||||
} else {
|
||||
GGML_LOG_WARN("CPU_RISCV64_SPACEMIT: failed to allocate init_barrier from shared mem, falling back to heap\n",
|
||||
__func__);
|
||||
init_barrier = new spine_barrier_t[spine_init_barrier_count];
|
||||
}
|
||||
|
||||
spine_barrier_init(init_barrier, spine_init_barrier_count, 2);
|
||||
}
|
||||
|
||||
spine_env_info::~spine_env_info() {
|
||||
if (init_barrier_is_shared_mem) {
|
||||
spine_mem_pool_shared_mem_free(init_barrier);
|
||||
} else {
|
||||
delete[] init_barrier;
|
||||
}
|
||||
|
||||
init_barrier = nullptr;
|
||||
init_barrier_is_shared_mem = false;
|
||||
}
|
||||
|
||||
spine_env_info global_spine_env_info;
|
||||
|
||||
} // namespace ggml::cpu::riscv64_spacemit
|
||||
55
ggml/src/ggml-cpu/spacemit/ime_env.h
Normal file
55
ggml/src/ggml-cpu/spacemit/ime_env.h
Normal file
@@ -0,0 +1,55 @@
|
||||
#pragma once
|
||||
|
||||
#include "spine_barrier.h"
|
||||
#include "spine_mem_pool.h"
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
namespace ggml::cpu::riscv64_spacemit {
|
||||
|
||||
constexpr uint64_t spine_invalid_core_id = 0xFFFFFFFF;
|
||||
constexpr size_t spine_init_barrier_count = 16;
|
||||
|
||||
enum class spine_core_arch_id : uint16_t {
|
||||
core_arch_none = 0,
|
||||
core_arch_x60 = 0x503C,
|
||||
core_arch_x100 = 0x5064,
|
||||
core_arch_x200 = 0x50C8,
|
||||
core_arch_a60 = 0xA03C,
|
||||
core_arch_a100 = 0xA064,
|
||||
core_arch_a200 = 0xA0C8,
|
||||
};
|
||||
|
||||
struct spine_core_info {
|
||||
uint64_t core_id{ spine_invalid_core_id };
|
||||
spine_core_arch_id arch_id{ spine_core_arch_id::core_arch_none };
|
||||
|
||||
static bool get_spine_core_info(std::vector<spine_core_info> & result);
|
||||
};
|
||||
|
||||
struct spine_env_info {
|
||||
std::vector<spine_core_info> core_info_list;
|
||||
std::vector<int> perfer_core_ids;
|
||||
int aicpu_id_offset{ 0 };
|
||||
int num_cores{ 0 };
|
||||
int num_perfer_cores{ 0 };
|
||||
spine_core_arch_id perfer_core_arch_id{ spine_core_arch_id::core_arch_none };
|
||||
bool exclude_main_thread{ false };
|
||||
bool use_ime2{ false };
|
||||
bool use_ime1{ false };
|
||||
bool use_tcm{ false };
|
||||
spine_mem_pool_backend mem_backend{ spine_mem_pool_backend::transparent_hugepage };
|
||||
uint64_t tcm_blk_size{ 0 };
|
||||
uint64_t cpu_mask{ 0 };
|
||||
spine_barrier_t * init_barrier{ nullptr };
|
||||
bool init_barrier_is_shared_mem{ false };
|
||||
|
||||
spine_env_info();
|
||||
~spine_env_info();
|
||||
};
|
||||
|
||||
extern spine_env_info global_spine_env_info;
|
||||
|
||||
} // namespace ggml::cpu::riscv64_spacemit
|
||||
@@ -1,26 +1,189 @@
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <functional>
|
||||
|
||||
namespace spacemit_kernels {
|
||||
|
||||
#define BLOCK_QNK_LEN 256
|
||||
|
||||
template <int N> struct nrow_block_q2_k {
|
||||
// [4bit scale + 4bit zp] * N * 16
|
||||
uint8_t scales[N * BLOCK_QNK_LEN / 16];
|
||||
// [b0, b16, b32, b48] [b1, b17, b33, b49] ... [b15, b31, b47, b63]
|
||||
// [b64, b80, b96, b112] ...[b79, b95, b111, b127]
|
||||
// [b128, b144, b160, b176] ...[b143, b159, b175, b191]
|
||||
// [b192, b208, b224, b240] ...[b207, b223, b239, b255]
|
||||
uint8_t qs[N * BLOCK_QNK_LEN / 4];
|
||||
uint16_t scales16[N];
|
||||
uint16_t zeros16[N];
|
||||
};
|
||||
|
||||
template <int N> struct nrow_block_q3_k {
|
||||
// [8bit scale] * N * 16
|
||||
int8_t scales[N * 16];
|
||||
// [b0, b1, b2, b3, b4, b5, b6, b7] ... [b248, b249, b250, b251, b252, b253, b254, b255]
|
||||
uint8_t hmask[N * BLOCK_QNK_LEN / 8];
|
||||
// [b0, b16, b32, b48] [b1, b17, b33, b49] ... [b15, b31, b47, b63]
|
||||
// [b64, b80, b96, b112] ...[b79, b95, b111, b127]
|
||||
// [b128, b144, b160, b176] ...[b143, b159, b175, b191]
|
||||
// [b192, b208, b224, b240] ...[b207, b223, b239, b255]
|
||||
uint8_t qs[N * BLOCK_QNK_LEN / 4];
|
||||
uint16_t scales16[N];
|
||||
};
|
||||
|
||||
template <int N> struct nrow_block_mxfp4 {
|
||||
uint8_t e[N];
|
||||
uint8_t qh[4 * N];
|
||||
uint8_t qs[16 * N];
|
||||
};
|
||||
|
||||
template <int N> struct __attribute__((packed)) nrow_block_q5_1 {
|
||||
uint16_t scales16[N];
|
||||
uint8_t zp[N];
|
||||
// n0 [bh0, bh1, bh2, bh3, bh4, bh5, bh6, bh7] ....
|
||||
uint8_t qh[4 * N];
|
||||
// n0 [b0, b1], [b2, b3] .... [b30, b31]
|
||||
// n1 [b0, b1], [b2, b3] .... [b30, b31]
|
||||
uint8_t qs[16 * N];
|
||||
};
|
||||
|
||||
static_assert(sizeof(nrow_block_q5_1<1>) == sizeof(uint8_t) + 22, "wrong nrow_block_q5_1 block size/padding");
|
||||
|
||||
template <int N> struct __attribute__((packed)) nrow_block_q5_0 {
|
||||
uint16_t scales16[N];
|
||||
// n0 [bh0, bh1, bh2, bh3, bh4, bh5, bh6, bh7] ....
|
||||
uint8_t qh[4 * N];
|
||||
// n0 [b0, b1], [b2, b3] .... [b30, b31]
|
||||
// n1 [b0, b1], [b2, b3] .... [b30, b31]
|
||||
uint8_t qs[16 * N];
|
||||
};
|
||||
|
||||
static_assert(sizeof(nrow_block_q5_0<1>) == 22, "wrong nrow_block_q5_0 block size/padding");
|
||||
|
||||
using gemm_kernel_quantize_def = std::function<
|
||||
size_t(size_t, const uint8_t *, const uint8_t *, const uint8_t *, float *, size_t, size_t, size_t, size_t)>;
|
||||
|
||||
using moe_gemm_kernel_quantize_def = std::function<
|
||||
size_t(size_t, const uint8_t **, const uint8_t *, const uint8_t *, float **, size_t, size_t, size_t, size_t)>;
|
||||
|
||||
namespace sqnbitgemm_spacemit_ime {
|
||||
namespace ime1 {
|
||||
size_t gemm_kernel_i8i4(size_t blk_len,
|
||||
const std::byte * quant_a_ptr,
|
||||
const std::byte * quant_b_data,
|
||||
const float * quant_b_scale,
|
||||
const std::byte * quant_b_zp,
|
||||
float * c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t count_k,
|
||||
size_t block_count_k,
|
||||
size_t ldc,
|
||||
const float * bias,
|
||||
const size_t scale_stride);
|
||||
size_t gemm_kernel_i8i4(size_t blk_len,
|
||||
const uint8_t * quant_a_ptr,
|
||||
const uint8_t * quant_b_data,
|
||||
const uint8_t * quant_b_zp,
|
||||
float * c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t k_blks,
|
||||
size_t ldc);
|
||||
|
||||
void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
|
||||
void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
|
||||
|
||||
void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
|
||||
void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
|
||||
|
||||
} // namespace ime1
|
||||
} // namespace sqnbitgemm_spacemit_ime
|
||||
|
||||
namespace ime2 {
|
||||
size_t gemm_kernel_i8i2k(size_t blk_len,
|
||||
const uint8_t * quant_a_ptr,
|
||||
const uint8_t * quant_b_data,
|
||||
const uint8_t * quant_b_zp,
|
||||
float * c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t k_blks,
|
||||
size_t ldc);
|
||||
|
||||
size_t gemm_kernel_i8i3k(size_t blk_len,
|
||||
const uint8_t * quant_a_ptr,
|
||||
const uint8_t * quant_b_data,
|
||||
const uint8_t * quant_b_zp,
|
||||
float * c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t k_blks,
|
||||
size_t ldc);
|
||||
|
||||
size_t gemm_kernel_i8i4(size_t blk_len,
|
||||
const uint8_t * quant_a_ptr,
|
||||
const uint8_t * quant_b_data,
|
||||
const uint8_t * quant_b_zp,
|
||||
float * c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t k_blks,
|
||||
size_t ldc);
|
||||
|
||||
size_t gemm_kernel_i8i4_hp(size_t blk_len,
|
||||
const uint8_t * quant_a_ptr,
|
||||
const uint8_t * quant_b_data,
|
||||
const uint8_t * quant_b_zp,
|
||||
float * c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t k_blks,
|
||||
size_t ldc);
|
||||
|
||||
size_t moe_m2_gemm_kernel_i8i4(size_t blk_len,
|
||||
const uint8_t ** quant_a_ptr,
|
||||
const uint8_t * quant_b_data,
|
||||
const uint8_t * quant_b_zp,
|
||||
float ** c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t k_blks,
|
||||
size_t ldc);
|
||||
|
||||
size_t gemm_kernel_i8i8(size_t blk_len,
|
||||
const uint8_t * quant_a_ptr,
|
||||
const uint8_t * quant_b_data,
|
||||
const uint8_t * quant_b_zp,
|
||||
float * c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t k_blks,
|
||||
size_t ldc);
|
||||
|
||||
size_t gemm_kernel_i8mxfp4(size_t blk_len,
|
||||
const uint8_t * quant_a_ptr,
|
||||
const uint8_t * quant_b_data,
|
||||
const uint8_t * quant_b_zp,
|
||||
float * c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t k_blks,
|
||||
size_t ldc);
|
||||
|
||||
size_t moe_m2_gemm_kernel_i8mxfp4(size_t blk_len,
|
||||
const uint8_t ** quant_a_ptr,
|
||||
const uint8_t * quant_b_data,
|
||||
const uint8_t * quant_b_zp,
|
||||
float ** c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t k_blks,
|
||||
size_t ldc);
|
||||
|
||||
size_t gemm_kernel_i8i5(size_t blk_len,
|
||||
const uint8_t * quant_a_ptr,
|
||||
const uint8_t * quant_b_data,
|
||||
const uint8_t * quant_b_zp,
|
||||
float * c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t k_blks,
|
||||
size_t ldc);
|
||||
|
||||
size_t moe_m2_gemm_kernel_i8i5(size_t blk_len,
|
||||
const uint8_t ** quant_a_ptr,
|
||||
const uint8_t * quant_b_data,
|
||||
const uint8_t * quant_b_zp,
|
||||
float ** c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t k_blks,
|
||||
size_t ldc);
|
||||
} // namespace ime2
|
||||
} // namespace spacemit_kernels
|
||||
|
||||
1795
ggml/src/ggml-cpu/spacemit/repack.cpp
Normal file
1795
ggml/src/ggml-cpu/spacemit/repack.cpp
Normal file
File diff suppressed because it is too large
Load Diff
14
ggml/src/ggml-cpu/spacemit/repack.h
Normal file
14
ggml/src/ggml-cpu/spacemit/repack.h
Normal file
@@ -0,0 +1,14 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml-common.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
namespace ggml::cpu::riscv64_spacemit {
|
||||
|
||||
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
|
||||
int repack(ggml_tensor * t, const void * data, size_t data_size);
|
||||
|
||||
} // namespace ggml::cpu::riscv64_spacemit
|
||||
3178
ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp
Normal file
3178
ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp
Normal file
File diff suppressed because it is too large
Load Diff
95
ggml/src/ggml-cpu/spacemit/rvv_kernels.h
Normal file
95
ggml/src/ggml-cpu/spacemit/rvv_kernels.h
Normal file
@@ -0,0 +1,95 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml-cpu-impl.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
|
||||
namespace spacemit_kernels {
|
||||
|
||||
constexpr auto div_round_up(auto up, auto down) {
|
||||
return (up + down - 1) / down;
|
||||
}
|
||||
|
||||
// Q8 Blk [f32] [s16] [int8 * blk_len]
|
||||
// Q8 Blk N [f32 * N] [s16 * N] [int8 * blk_len * N]
|
||||
constexpr size_t q8_blk_size(size_t blk_len, bool with_blk_sum = false) {
|
||||
const size_t blk_size = sizeof(float) + blk_len * sizeof(int8_t) + (with_blk_sum ? sizeof(int16_t) : 0);
|
||||
return blk_size;
|
||||
}
|
||||
|
||||
// Q8 HP row block: K is split into K32 subblocks.
|
||||
// Each subblock stores [f32 scale] [int8 * 32], with an optional fp16 sum trailer per subblock.
|
||||
constexpr size_t q8_hp_blk_size(size_t blk_len, bool with_blk_sum = false, bool with_blk_scale = false) {
|
||||
const size_t subblk_count = div_round_up(blk_len, size_t(32));
|
||||
const size_t blk_size = blk_len * sizeof(int8_t) + subblk_count * sizeof(_Float16) +
|
||||
(with_blk_sum ? subblk_count * sizeof(_Float16) : 0) +
|
||||
(with_blk_scale ? sizeof(_Float16) : 0);
|
||||
return blk_size;
|
||||
}
|
||||
|
||||
// Q8K Blk [f32] [s16 * (blk_len / 16)] [int8 * blk_len]
|
||||
// Q8K Blk N [f32 * N] [s16 * (blk_len / 16) * N] [int8 * blk_len * N]
|
||||
constexpr size_t q8k_blk_size(size_t blk_len) {
|
||||
const size_t blk_size = sizeof(float) + blk_len * sizeof(int8_t) + sizeof(int16_t) * blk_len / 16;
|
||||
return blk_size;
|
||||
}
|
||||
|
||||
using quantize_a_row_def = std::function<void(size_t, const float *, size_t, uint8_t *)>;
|
||||
|
||||
namespace rvv {
|
||||
void memcpy1d(void * dst, const void * src, int64_t size);
|
||||
|
||||
void memcpy2d(void * dst, int64_t dst_stride, const void * src, int64_t src_stride, int64_t tile_rows, int64_t size);
|
||||
|
||||
void forward_flash_attn_ext_f16_one_chunk_vlen1024_vf16(const ggml_compute_params * params,
|
||||
ggml_tensor * dst,
|
||||
int ir0,
|
||||
int ir1,
|
||||
void * tcm_buffer,
|
||||
size_t tcm_buffer_size);
|
||||
|
||||
void forward_flash_attn_ext_f16_tiled_vlen1024_vf16(const ggml_compute_params * params,
|
||||
ggml_tensor * dst,
|
||||
int ir0,
|
||||
int ir1,
|
||||
void * tcm_buffer,
|
||||
size_t tcm_buffer_size);
|
||||
|
||||
void forward_rms_norm_f32(ggml_compute_params * params, ggml_tensor * op);
|
||||
|
||||
void forward_norm_f32(ggml_compute_params * params, ggml_tensor * op);
|
||||
|
||||
void forward_cont_with_permute(ggml_compute_params * params, ggml_tensor * op);
|
||||
|
||||
void forward_cpy_with_permute(ggml_compute_params * params, ggml_tensor * op);
|
||||
|
||||
template <typename T> void forward_get_rows(ggml_compute_params * params, ggml_tensor * op);
|
||||
|
||||
template <typename T> void forward_concat(ggml_compute_params * params, ggml_tensor * op);
|
||||
|
||||
template <ggml_op op_type, typename T> void forward_binary(ggml_compute_params * params, ggml_tensor * op);
|
||||
|
||||
template <typename T> void forward_sum_rows(const ggml_compute_params * params, ggml_tensor * op);
|
||||
|
||||
template <typename T> void forward_repeat_nrows(ggml_compute_params * params, ggml_tensor * op);
|
||||
|
||||
template <typename T> void forward_repeat_dim1(ggml_compute_params * params, ggml_tensor * op);
|
||||
|
||||
void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
|
||||
|
||||
void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
|
||||
|
||||
void quantize_a_row_i8_hp(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
|
||||
|
||||
void quantize_a_4row_i8_hp(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
|
||||
|
||||
void quantize_a_row_i8k(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
|
||||
|
||||
void quantize_a_4row_i8k(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
|
||||
|
||||
} // namespace rvv
|
||||
|
||||
} // namespace spacemit_kernels
|
||||
34
ggml/src/ggml-cpu/spacemit/spine_barrier.h
Normal file
34
ggml/src/ggml-cpu/spacemit/spine_barrier.h
Normal file
@@ -0,0 +1,34 @@
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
|
||||
#define SPINE_CACHE_LINE 64
|
||||
#define SPINE_CACHE_ALIGN __attribute__((aligned(SPINE_CACHE_LINE)))
|
||||
|
||||
struct spine_barrier_t {
|
||||
SPINE_CACHE_ALIGN std::atomic<int64_t> pending_;
|
||||
SPINE_CACHE_ALIGN std::atomic<int64_t> rounds_;
|
||||
SPINE_CACHE_ALIGN int64_t total_;
|
||||
};
|
||||
|
||||
inline void spine_barrier_wait(spine_barrier_t * b) {
|
||||
auto cur_round = b->rounds_.load(std::memory_order_acquire);
|
||||
auto cnt = --b->pending_;
|
||||
if (cnt == 0) {
|
||||
b->pending_.store(b->total_);
|
||||
b->rounds_.store(cur_round + 1);
|
||||
} else {
|
||||
while (cur_round == b->rounds_.load(std::memory_order_relaxed)) {
|
||||
__asm__ volatile("pause " ::: "memory");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void spine_barrier_init(spine_barrier_t * b, int num_barriers, uint64_t thread_count) {
|
||||
for (int i = 0; i < num_barriers; i++) {
|
||||
b[i].total_ = thread_count;
|
||||
b[i].pending_.store(thread_count);
|
||||
b[i].rounds_.store(0);
|
||||
}
|
||||
}
|
||||
760
ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp
Normal file
760
ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp
Normal file
@@ -0,0 +1,760 @@
|
||||
#include "spine_mem_pool.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "ime_env.h"
|
||||
#include "spine_tcm.h"
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/mman.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cerrno>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
namespace ggml::cpu::riscv64_spacemit {
|
||||
namespace {
|
||||
|
||||
constexpr size_t SPINE_MEM_POOL_CHUNK_SIZE = 512ull * 1024ull * 1024ull;
|
||||
constexpr size_t SPINE_SHARE_MEM_POOL_CHUNK_SIZE = 512ull * 1024ull;
|
||||
constexpr size_t SPINE_MEM_POOL_1G_REGION_SIZE = 1ull << 30;
|
||||
constexpr uint64_t HUGETLB_1G_FLAG_REQUIRE_PUD = 1ull << 0;
|
||||
constexpr char SPINE_MEM_POOL_HUGETLB_1G_DEV[] = "/dev/hugetlb_1g";
|
||||
constexpr char SPINE_MEM_POOL_TCM_SYNC_MEM_DEV[] = "/dev/tcm_sync_mem";
|
||||
|
||||
struct hugetlb_1g_region {
|
||||
uint64_t size{ 0 };
|
||||
uint64_t dma_addr{ 0 };
|
||||
uint64_t flags{ 0 };
|
||||
uint64_t reserved{ 0 };
|
||||
};
|
||||
|
||||
#define HUGETLB_1G_IOC_MAGIC 'M'
|
||||
#define HUGETLB_1G_IOC_ALLOC _IOWR(HUGETLB_1G_IOC_MAGIC, 0x00, struct hugetlb_1g_region)
|
||||
#define HUGETLB_1G_IOC_FREE _IO(HUGETLB_1G_IOC_MAGIC, 0x01)
|
||||
|
||||
struct free_block {
|
||||
size_t offset{ 0 };
|
||||
size_t size{ 0 };
|
||||
};
|
||||
|
||||
struct pool_chunk {
|
||||
uint8_t * base{ nullptr };
|
||||
size_t size{ 0 };
|
||||
int fd{ -1 };
|
||||
std::vector<free_block> free_blocks;
|
||||
};
|
||||
|
||||
struct pool_allocation {
|
||||
void * chunk_base{ nullptr };
|
||||
size_t chunk_size{ 0 };
|
||||
void * base{ nullptr };
|
||||
size_t size{ 0 };
|
||||
};
|
||||
|
||||
bool is_power_of_two(size_t value) {
|
||||
return value != 0 && (value & (value - 1)) == 0;
|
||||
}
|
||||
|
||||
bool align_up(size_t value, size_t alignment, size_t * aligned_value) {
|
||||
if (aligned_value == nullptr || alignment == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const size_t remainder = value % alignment;
|
||||
if (remainder == 0) {
|
||||
*aligned_value = value;
|
||||
return true;
|
||||
}
|
||||
|
||||
const size_t padding = alignment - remainder;
|
||||
if (value > std::numeric_limits<size_t>::max() - padding) {
|
||||
return false;
|
||||
}
|
||||
|
||||
*aligned_value = value + padding;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool align_up_uintptr(uintptr_t value, size_t alignment, uintptr_t * aligned_value) {
|
||||
if (aligned_value == nullptr || alignment == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const uintptr_t remainder = value % alignment;
|
||||
if (remainder == 0) {
|
||||
*aligned_value = value;
|
||||
return true;
|
||||
}
|
||||
|
||||
const uintptr_t padding = alignment - remainder;
|
||||
if (value > std::numeric_limits<uintptr_t>::max() - padding) {
|
||||
return false;
|
||||
}
|
||||
|
||||
*aligned_value = value + padding;
|
||||
return true;
|
||||
}
|
||||
|
||||
class spine_mem_pool_manager {
|
||||
public:
|
||||
explicit spine_mem_pool_manager(size_t default_chunk_size) : default_chunk_size_(default_chunk_size) {}
|
||||
|
||||
virtual ~spine_mem_pool_manager() = default;
|
||||
|
||||
void * alloc(size_t size, size_t alignment) {
|
||||
if (size == 0 || !is_power_of_two(alignment)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
size_t aligned_size = 0;
|
||||
if (!align_up(size, alignment, &aligned_size)) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: align_up failed for size %zu alignment %zu\n", __func__, size,
|
||||
alignment);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
pool_allocation allocation;
|
||||
|
||||
std::lock_guard<std::mutex> lock(mutex_);
|
||||
|
||||
if (!try_alloc_locked(aligned_size, alignment, &allocation)) {
|
||||
if (!add_chunk_locked(aligned_size, alignment)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (!try_alloc_locked(aligned_size, alignment, &allocation)) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: allocation retry failed for size %zu alignment %zu\n",
|
||||
__func__, aligned_size, alignment);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
const auto [allocation_it, inserted] = allocations_.emplace(allocation.base, allocation);
|
||||
if (!inserted) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: duplicate allocation key %p\n", __func__, allocation.base);
|
||||
rollback_allocation_locked(allocation);
|
||||
return nullptr;
|
||||
}
|
||||
} catch (const std::bad_alloc &) {
|
||||
rollback_allocation_locked(allocation);
|
||||
throw;
|
||||
}
|
||||
|
||||
return allocation.base;
|
||||
}
|
||||
|
||||
void free(void * base) {
|
||||
if (base == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> lock(mutex_);
|
||||
|
||||
auto allocation_it = allocations_.find(base);
|
||||
if (allocation_it == allocations_.end()) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: unknown allocation %p\n", __func__, base);
|
||||
return;
|
||||
}
|
||||
|
||||
pool_allocation allocation = allocation_it->second;
|
||||
allocations_.erase(allocation_it);
|
||||
|
||||
auto chunk_it = find_chunk_locked(allocation);
|
||||
if (chunk_it == chunks_.end()) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: unknown chunk for allocation %p size %zu\n", __func__,
|
||||
allocation.base, allocation.size);
|
||||
return;
|
||||
}
|
||||
|
||||
auto * chunk_base = chunk_it->base;
|
||||
auto * alloc_base = static_cast<uint8_t *>(allocation.base);
|
||||
if (alloc_base < chunk_base || alloc_base >= chunk_base + chunk_it->size) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: allocation %p out of chunk range %p..%p\n", __func__,
|
||||
allocation.base, chunk_base, chunk_base + chunk_it->size);
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t offset = static_cast<size_t>(alloc_base - chunk_base);
|
||||
if (offset > chunk_it->size || allocation.size > chunk_it->size - offset) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: allocation %p size %zu exceeds chunk size %zu\n", __func__,
|
||||
allocation.base, allocation.size, chunk_it->size);
|
||||
return;
|
||||
}
|
||||
|
||||
insert_free_block_locked(*chunk_it, { offset, allocation.size });
|
||||
maybe_release_empty_chunk_locked(chunk_it);
|
||||
}
|
||||
|
||||
protected:
|
||||
void release_chunks() {
|
||||
std::lock_guard<std::mutex> lock(mutex_);
|
||||
|
||||
allocations_.clear();
|
||||
for (auto & chunk : chunks_) {
|
||||
dealloc_chunk(&chunk);
|
||||
}
|
||||
chunks_.clear();
|
||||
}
|
||||
|
||||
size_t default_chunk_size() const { return default_chunk_size_; }
|
||||
|
||||
static void clear_chunk(pool_chunk * chunk) {
|
||||
chunk->base = nullptr;
|
||||
chunk->size = 0;
|
||||
chunk->fd = -1;
|
||||
chunk->free_blocks.clear();
|
||||
}
|
||||
|
||||
virtual bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) = 0;
|
||||
virtual void dealloc_chunk(pool_chunk * chunk) = 0;
|
||||
|
||||
private:
|
||||
struct alloc_candidate {
|
||||
size_t chunk_index{ 0 };
|
||||
size_t block_index{ 0 };
|
||||
size_t aligned_offset{ 0 };
|
||||
uintptr_t address{ std::numeric_limits<uintptr_t>::max() };
|
||||
bool valid{ false };
|
||||
};
|
||||
|
||||
std::vector<pool_chunk>::iterator find_chunk_locked(const pool_allocation & allocation) {
|
||||
return std::find_if(chunks_.begin(), chunks_.end(), [&](const pool_chunk & chunk) {
|
||||
return chunk.base == allocation.chunk_base && chunk.size == allocation.chunk_size;
|
||||
});
|
||||
}
|
||||
|
||||
bool add_chunk_locked(size_t min_size, size_t alignment) {
|
||||
pool_chunk chunk;
|
||||
const size_t chunk_request = default_chunk_size_ == 0 ? min_size : std::max(min_size, default_chunk_size_);
|
||||
void * hint_addr = nullptr;
|
||||
|
||||
for (const auto & existing_chunk : chunks_) {
|
||||
auto * chunk_end = existing_chunk.base + existing_chunk.size;
|
||||
if (hint_addr == nullptr || chunk_end > hint_addr) {
|
||||
hint_addr = chunk_end;
|
||||
}
|
||||
}
|
||||
|
||||
if (!alloc_chunk(chunk_request, alignment, hint_addr, &chunk)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (chunk.base == nullptr || chunk.size < min_size) {
|
||||
GGML_LOG_ERROR(
|
||||
"CPU_RISCV64_SPACEMIT: %s: invalid chunk returned for request size %zu, chunk_base=%p chunk_size=%zu\n",
|
||||
__func__, min_size, chunk.base, chunk.size);
|
||||
dealloc_chunk(&chunk);
|
||||
return false;
|
||||
}
|
||||
|
||||
try {
|
||||
chunk.free_blocks.push_back({ 0, chunk.size });
|
||||
chunks_.push_back(std::move(chunk));
|
||||
} catch (const std::bad_alloc &) {
|
||||
dealloc_chunk(&chunk);
|
||||
throw;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void rollback_allocation_locked(const pool_allocation & allocation) {
|
||||
auto chunk_it = find_chunk_locked(allocation);
|
||||
if (chunk_it == chunks_.end()) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to rollback allocation %p, owning chunk not found\n",
|
||||
__func__, allocation.base);
|
||||
return;
|
||||
}
|
||||
|
||||
auto * chunk_base = chunk_it->base;
|
||||
auto * alloc_base = static_cast<uint8_t *>(allocation.base);
|
||||
if (alloc_base < chunk_base || alloc_base >= chunk_base + chunk_it->size) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to rollback allocation %p, chunk range is invalid\n",
|
||||
__func__, allocation.base);
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t offset = static_cast<size_t>(alloc_base - chunk_base);
|
||||
if (offset > chunk_it->size || allocation.size > chunk_it->size - offset) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to rollback allocation %p size %zu\n", __func__,
|
||||
allocation.base, allocation.size);
|
||||
return;
|
||||
}
|
||||
|
||||
insert_free_block_locked(*chunk_it, { offset, allocation.size });
|
||||
maybe_release_empty_chunk_locked(chunk_it);
|
||||
}
|
||||
|
||||
bool try_alloc_locked(size_t size, size_t alignment, pool_allocation * allocation) {
|
||||
alloc_candidate best;
|
||||
|
||||
for (size_t chunk_index = 0; chunk_index < chunks_.size(); ++chunk_index) {
|
||||
const auto & chunk = chunks_[chunk_index];
|
||||
for (size_t block_index = 0; block_index < chunk.free_blocks.size(); ++block_index) {
|
||||
const auto & block = chunk.free_blocks[block_index];
|
||||
|
||||
uintptr_t aligned_addr = 0;
|
||||
const auto block_addr = reinterpret_cast<uintptr_t>(chunk.base + block.offset);
|
||||
if (!align_up_uintptr(block_addr, alignment, &aligned_addr)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (aligned_addr < block_addr) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const size_t aligned_offset = block.offset + static_cast<size_t>(aligned_addr - block_addr);
|
||||
const size_t padding = aligned_offset - block.offset;
|
||||
if (padding > block.size || size > block.size - padding) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!best.valid || aligned_addr < best.address) {
|
||||
best.chunk_index = chunk_index;
|
||||
best.block_index = block_index;
|
||||
best.aligned_offset = aligned_offset;
|
||||
best.address = aligned_addr;
|
||||
best.valid = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!best.valid) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto & chunk = chunks_[best.chunk_index];
|
||||
const free_block block = chunk.free_blocks[best.block_index];
|
||||
const size_t padding = best.aligned_offset - block.offset;
|
||||
const size_t alloc_end = best.aligned_offset + size;
|
||||
const size_t block_end = block.offset + block.size;
|
||||
|
||||
chunk.free_blocks.erase(chunk.free_blocks.begin() + best.block_index);
|
||||
auto insert_it = chunk.free_blocks.begin() + best.block_index;
|
||||
if (padding != 0) {
|
||||
insert_it = chunk.free_blocks.insert(insert_it, { block.offset, padding });
|
||||
++insert_it;
|
||||
}
|
||||
if (alloc_end < block_end) {
|
||||
chunk.free_blocks.insert(insert_it, { alloc_end, block_end - alloc_end });
|
||||
}
|
||||
|
||||
allocation->chunk_base = chunk.base;
|
||||
allocation->chunk_size = chunk.size;
|
||||
allocation->base = chunk.base + best.aligned_offset;
|
||||
allocation->size = size;
|
||||
return true;
|
||||
}
|
||||
|
||||
void maybe_release_empty_chunk_locked(std::vector<pool_chunk>::iterator chunk_it) {
|
||||
if (chunk_it->free_blocks.size() != 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
const auto & block = chunk_it->free_blocks.front();
|
||||
if (block.offset != 0 || block.size != chunk_it->size) {
|
||||
return;
|
||||
}
|
||||
|
||||
dealloc_chunk(&*chunk_it);
|
||||
chunks_.erase(chunk_it);
|
||||
}
|
||||
|
||||
void insert_free_block_locked(pool_chunk & chunk, free_block block) {
|
||||
auto it = chunk.free_blocks.begin();
|
||||
while (it != chunk.free_blocks.end() && it->offset < block.offset) {
|
||||
++it;
|
||||
}
|
||||
|
||||
if (it != chunk.free_blocks.begin()) {
|
||||
const auto & prev = *(it - 1);
|
||||
if (prev.offset + prev.size > block.offset) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: overlapping free block at offset %zu size %zu\n", __func__,
|
||||
block.offset, block.size);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (it != chunk.free_blocks.end() && block.offset + block.size > it->offset) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: overlapping next free block at offset %zu size %zu\n", __func__,
|
||||
block.offset, block.size);
|
||||
return;
|
||||
}
|
||||
|
||||
it = chunk.free_blocks.insert(it, block);
|
||||
|
||||
if (it != chunk.free_blocks.begin()) {
|
||||
auto prev = it - 1;
|
||||
if (prev->offset + prev->size == it->offset) {
|
||||
it->offset = prev->offset;
|
||||
it->size += prev->size;
|
||||
it = chunk.free_blocks.erase(prev);
|
||||
}
|
||||
}
|
||||
|
||||
if (it + 1 != chunk.free_blocks.end() && it->offset + it->size == (it + 1)->offset) {
|
||||
it->size += (it + 1)->size;
|
||||
chunk.free_blocks.erase(it + 1);
|
||||
}
|
||||
}
|
||||
|
||||
std::mutex mutex_;
|
||||
std::vector<pool_chunk> chunks_;
|
||||
std::unordered_map<void *, pool_allocation> allocations_;
|
||||
size_t default_chunk_size_{ 0 };
|
||||
};
|
||||
|
||||
class spine_mem_pool_posix final : public spine_mem_pool_manager {
|
||||
public:
|
||||
spine_mem_pool_posix() : spine_mem_pool_manager(0) {}
|
||||
|
||||
~spine_mem_pool_posix() override { release_chunks(); }
|
||||
|
||||
private:
|
||||
bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) override {
|
||||
(void) hint_addr;
|
||||
|
||||
const size_t alloc_alignment = std::max(alignment, sizeof(void *));
|
||||
void * base = nullptr;
|
||||
const int rc = posix_memalign(&base, alloc_alignment, min_size);
|
||||
if (rc != 0) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: posix_memalign failed for size %zu alignment %zu, rc=%d\n",
|
||||
__func__, min_size, alloc_alignment, rc);
|
||||
return false;
|
||||
}
|
||||
|
||||
chunk->base = static_cast<uint8_t *>(base);
|
||||
chunk->size = min_size;
|
||||
chunk->fd = -1;
|
||||
return true;
|
||||
}
|
||||
|
||||
void dealloc_chunk(pool_chunk * chunk) override {
|
||||
std::free(chunk->base);
|
||||
clear_chunk(chunk);
|
||||
}
|
||||
};
|
||||
|
||||
class spine_mem_pool_transparent_hugepage final : public spine_mem_pool_manager {
|
||||
public:
|
||||
spine_mem_pool_transparent_hugepage() : spine_mem_pool_manager(SPINE_MEM_POOL_CHUNK_SIZE) {}
|
||||
|
||||
~spine_mem_pool_transparent_hugepage() override { release_chunks(); }
|
||||
|
||||
private:
|
||||
bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) override {
|
||||
(void) alignment;
|
||||
|
||||
size_t chunk_size = 0;
|
||||
if (!align_up(min_size, default_chunk_size(), &chunk_size)) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to round chunk size for %zu\n", __func__, min_size);
|
||||
return false;
|
||||
}
|
||||
|
||||
void * map_addr = mmap(hint_addr, chunk_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
|
||||
if (map_addr == MAP_FAILED) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: mmap failed for chunk size %zu, errno=%d\n", __func__, chunk_size,
|
||||
errno);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (madvise(map_addr, chunk_size, MADV_HUGEPAGE) != 0) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: madvise(MADV_HUGEPAGE) failed for chunk size %zu, errno=%d\n",
|
||||
__func__, chunk_size, errno);
|
||||
munmap(map_addr, chunk_size);
|
||||
return false;
|
||||
}
|
||||
|
||||
chunk->base = static_cast<uint8_t *>(map_addr);
|
||||
chunk->size = chunk_size;
|
||||
chunk->fd = -1;
|
||||
return true;
|
||||
}
|
||||
|
||||
void dealloc_chunk(pool_chunk * chunk) override {
|
||||
if (chunk->base != nullptr && chunk->size != 0 && munmap(chunk->base, chunk->size) != 0) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: munmap failed for chunk %p size %zu, errno=%d\n", __func__,
|
||||
chunk->base, chunk->size, errno);
|
||||
}
|
||||
|
||||
clear_chunk(chunk);
|
||||
}
|
||||
};
|
||||
|
||||
class spine_mem_pool_hugetlb_1g final : public spine_mem_pool_manager {
|
||||
public:
|
||||
spine_mem_pool_hugetlb_1g() : spine_mem_pool_manager(SPINE_MEM_POOL_1G_REGION_SIZE) {}
|
||||
|
||||
~spine_mem_pool_hugetlb_1g() override { release_chunks(); }
|
||||
|
||||
private:
|
||||
bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) override {
|
||||
(void) alignment;
|
||||
(void) hint_addr;
|
||||
|
||||
size_t region_size = 0;
|
||||
if (!align_up(min_size, SPINE_MEM_POOL_1G_REGION_SIZE, ®ion_size)) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to round hugetlb_1g size for %zu\n", __func__, min_size);
|
||||
return false;
|
||||
}
|
||||
|
||||
const int fd = open(SPINE_MEM_POOL_HUGETLB_1G_DEV, O_RDWR);
|
||||
if (fd < 0) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: open(%s) failed, errno=%d\n", __func__,
|
||||
SPINE_MEM_POOL_HUGETLB_1G_DEV, errno);
|
||||
return false;
|
||||
}
|
||||
|
||||
hugetlb_1g_region region;
|
||||
region.size = region_size;
|
||||
region.flags = HUGETLB_1G_FLAG_REQUIRE_PUD;
|
||||
if (ioctl(fd, HUGETLB_1G_IOC_ALLOC, ®ion) < 0) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: HUGETLB_1G_IOC_ALLOC failed for size %zu, errno=%d\n", __func__,
|
||||
region_size, errno);
|
||||
close(fd);
|
||||
return false;
|
||||
}
|
||||
|
||||
void * map_addr = mmap(nullptr, region.size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
||||
if (map_addr == MAP_FAILED) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: mmap failed for hugetlb_1g size %llu, errno=%d\n", __func__,
|
||||
static_cast<unsigned long long>(region.size), errno);
|
||||
ioctl(fd, HUGETLB_1G_IOC_FREE);
|
||||
close(fd);
|
||||
return false;
|
||||
}
|
||||
|
||||
chunk->base = static_cast<uint8_t *>(map_addr);
|
||||
chunk->size = region.size;
|
||||
chunk->fd = fd;
|
||||
return true;
|
||||
}
|
||||
|
||||
void dealloc_chunk(pool_chunk * chunk) override {
|
||||
if (chunk->base != nullptr && chunk->size != 0 && munmap(chunk->base, chunk->size) != 0) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: munmap failed for hugetlb_1g chunk %p size %zu, errno=%d\n",
|
||||
__func__, chunk->base, chunk->size, errno);
|
||||
}
|
||||
|
||||
if (chunk->fd >= 0) {
|
||||
if (ioctl(chunk->fd, HUGETLB_1G_IOC_FREE) < 0) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: HUGETLB_1G_IOC_FREE failed for chunk %p, errno=%d\n",
|
||||
__func__, chunk->base, errno);
|
||||
}
|
||||
|
||||
close(chunk->fd);
|
||||
}
|
||||
|
||||
clear_chunk(chunk);
|
||||
}
|
||||
};
|
||||
|
||||
class spine_mem_pool_shared_mem final : public spine_mem_pool_manager {
|
||||
public:
|
||||
spine_mem_pool_shared_mem() : spine_mem_pool_manager(SPINE_SHARE_MEM_POOL_CHUNK_SIZE) {}
|
||||
|
||||
~spine_mem_pool_shared_mem() override { release_chunks(); }
|
||||
|
||||
private:
|
||||
bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) override {
|
||||
(void) alignment;
|
||||
|
||||
if (hint_addr != nullptr) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: shared_mem does not support multiple active chunks\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (min_size > default_chunk_size()) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: shared_mem request %zu exceeds chunk size %zu\n", __func__,
|
||||
min_size, default_chunk_size());
|
||||
return false;
|
||||
}
|
||||
|
||||
const int fd = open(SPINE_MEM_POOL_TCM_SYNC_MEM_DEV, O_RDWR | O_SYNC);
|
||||
if (fd < 0) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: open(%s) failed, errno=%d\n", __func__,
|
||||
SPINE_MEM_POOL_TCM_SYNC_MEM_DEV, errno);
|
||||
return false;
|
||||
}
|
||||
|
||||
void * map_addr = mmap(nullptr, default_chunk_size(), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
||||
if (map_addr == MAP_FAILED) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: mmap failed for %s size %zu, errno=%d\n", __func__,
|
||||
SPINE_MEM_POOL_TCM_SYNC_MEM_DEV, default_chunk_size(), errno);
|
||||
close(fd);
|
||||
return false;
|
||||
}
|
||||
|
||||
chunk->base = static_cast<uint8_t *>(map_addr);
|
||||
chunk->size = default_chunk_size();
|
||||
chunk->fd = fd;
|
||||
return true;
|
||||
}
|
||||
|
||||
void dealloc_chunk(pool_chunk * chunk) override {
|
||||
if (chunk->base != nullptr && chunk->size != 0 && munmap(chunk->base, chunk->size) != 0) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: munmap failed for shared_mem chunk %p size %zu, errno=%d\n",
|
||||
__func__, chunk->base, chunk->size, errno);
|
||||
}
|
||||
|
||||
if (chunk->fd >= 0) {
|
||||
close(chunk->fd);
|
||||
}
|
||||
|
||||
clear_chunk(chunk);
|
||||
}
|
||||
};
|
||||
|
||||
spine_mem_pool_manager & get_spine_mem_pool_manager() {
|
||||
static std::once_flag pool_once;
|
||||
static std::unique_ptr<spine_mem_pool_manager> selected_pool;
|
||||
static spine_mem_pool_backend selected_backend = spine_mem_pool_backend::none;
|
||||
|
||||
spine_mem_pool_backend backend = global_spine_env_info.mem_backend;
|
||||
if (backend == spine_mem_pool_backend::none) {
|
||||
backend = spine_mem_pool_backend::transparent_hugepage;
|
||||
}
|
||||
|
||||
std::call_once(pool_once, [&]() {
|
||||
selected_backend = backend;
|
||||
|
||||
switch (selected_backend) {
|
||||
case spine_mem_pool_backend::posix_memalign:
|
||||
selected_pool = std::make_unique<spine_mem_pool_posix>();
|
||||
break;
|
||||
case spine_mem_pool_backend::transparent_hugepage:
|
||||
selected_pool = std::make_unique<spine_mem_pool_transparent_hugepage>();
|
||||
break;
|
||||
case spine_mem_pool_backend::hugetlb_1g:
|
||||
selected_pool = std::make_unique<spine_mem_pool_hugetlb_1g>();
|
||||
break;
|
||||
case spine_mem_pool_backend::none:
|
||||
selected_backend = spine_mem_pool_backend::transparent_hugepage;
|
||||
selected_pool = std::make_unique<spine_mem_pool_transparent_hugepage>();
|
||||
break;
|
||||
}
|
||||
});
|
||||
|
||||
if (backend != selected_backend) {
|
||||
GGML_LOG_ERROR(
|
||||
"CPU_RISCV64_SPACEMIT: %s: mem pool backend is process-global and mutually exclusive, requested=%d but "
|
||||
"selected=%d\n",
|
||||
__func__, static_cast<int>(backend), static_cast<int>(selected_backend));
|
||||
}
|
||||
|
||||
if (selected_pool) {
|
||||
return *selected_pool;
|
||||
}
|
||||
|
||||
throw std::bad_alloc();
|
||||
}
|
||||
|
||||
spine_mem_pool_manager & get_spine_mem_pool_shared_mem_manager() {
|
||||
static std::once_flag shared_mem_pool_once;
|
||||
static std::unique_ptr<spine_mem_pool_shared_mem> shared_mem_pool;
|
||||
|
||||
std::call_once(shared_mem_pool_once, [&]() { shared_mem_pool = std::make_unique<spine_mem_pool_shared_mem>(); });
|
||||
|
||||
if (shared_mem_pool) {
|
||||
return *shared_mem_pool;
|
||||
}
|
||||
|
||||
throw std::bad_alloc();
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
bool spine_mem_pool_tcm_init(spine_mem_pool_tcm_info * info) noexcept {
|
||||
if (info == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
*info = {};
|
||||
|
||||
if (spine_tcm_open_handle(NULL) != 0 || !spine_tcm_is_available()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
spine_tcm_mem_info_t mem_info;
|
||||
if (spine_tcm_mem_info(&mem_info) != 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
info->available = true;
|
||||
info->blk_size = mem_info.blk_size;
|
||||
info->blk_num = mem_info.blk_num;
|
||||
info->is_fake_tcm = mem_info.is_fake_tcm != 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
void * spine_mem_pool_tcm_mem_get(int cpu_id) noexcept {
|
||||
return spine_tcm_mem_get(cpu_id);
|
||||
}
|
||||
|
||||
void * spine_mem_pool_tcm_mem_wait(int cpu_id) noexcept {
|
||||
return spine_tcm_mem_try_wait(cpu_id, 1000 * 1000);
|
||||
}
|
||||
|
||||
int spine_mem_pool_tcm_mem_release(int cpu_id) noexcept {
|
||||
return spine_tcm_mem_release(cpu_id);
|
||||
}
|
||||
|
||||
void * spine_mem_pool_alloc(size_t size, size_t alignment) noexcept {
|
||||
try {
|
||||
return get_spine_mem_pool_manager().alloc(size, alignment);
|
||||
} catch (const std::bad_alloc &) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: bad_alloc while allocating size %zu\n", __func__, size);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void * spine_mem_pool_shared_mem_alloc(size_t size, size_t alignment) noexcept {
|
||||
try {
|
||||
return get_spine_mem_pool_shared_mem_manager().alloc(size, alignment);
|
||||
} catch (const std::bad_alloc &) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: bad_alloc while allocating shared memory size %zu\n", __func__, size);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void spine_mem_pool_free(void * base) noexcept {
|
||||
try {
|
||||
get_spine_mem_pool_manager().free(base);
|
||||
} catch (const std::bad_alloc &) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: bad_alloc while freeing allocation %p\n", __func__, base);
|
||||
}
|
||||
}
|
||||
|
||||
void spine_mem_pool_shared_mem_free(void * base) noexcept {
|
||||
try {
|
||||
get_spine_mem_pool_shared_mem_manager().free(base);
|
||||
} catch (const std::bad_alloc &) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: bad_alloc while freeing shared allocation %p\n", __func__, base);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace ggml::cpu::riscv64_spacemit
|
||||
|
||||
extern "C" {
|
||||
void * ggml_backend_cpu_riscv64_spacemit_alloc_shared(size_t size, size_t alignment) {
|
||||
void * result = ggml::cpu::riscv64_spacemit::spine_mem_pool_shared_mem_alloc(size, alignment);
|
||||
if (result == nullptr) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to allocate shared memory size %zu alignment %zu\n", __func__,
|
||||
size, alignment);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void ggml_backend_cpu_riscv64_spacemit_free_shared(void * ptr) {
|
||||
ggml::cpu::riscv64_spacemit::spine_mem_pool_shared_mem_free(ptr);
|
||||
}
|
||||
}
|
||||
32
ggml/src/ggml-cpu/spacemit/spine_mem_pool.h
Normal file
32
ggml/src/ggml-cpu/spacemit/spine_mem_pool.h
Normal file
@@ -0,0 +1,32 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
namespace ggml::cpu::riscv64_spacemit {
|
||||
|
||||
enum class spine_mem_pool_backend : uint8_t {
|
||||
none,
|
||||
posix_memalign,
|
||||
transparent_hugepage,
|
||||
hugetlb_1g,
|
||||
};
|
||||
|
||||
struct spine_mem_pool_tcm_info {
|
||||
bool available{ false };
|
||||
size_t blk_size{ 0 };
|
||||
size_t blk_num{ 0 };
|
||||
bool is_fake_tcm{ false };
|
||||
};
|
||||
|
||||
bool spine_mem_pool_tcm_init(spine_mem_pool_tcm_info * info) noexcept;
|
||||
void * spine_mem_pool_tcm_mem_get(int cpu_id) noexcept;
|
||||
void * spine_mem_pool_tcm_mem_wait(int cpu_id) noexcept;
|
||||
int spine_mem_pool_tcm_mem_release(int cpu_id) noexcept;
|
||||
|
||||
void * spine_mem_pool_alloc(size_t size, size_t alignment) noexcept;
|
||||
void * spine_mem_pool_shared_mem_alloc(size_t size, size_t alignment) noexcept;
|
||||
void spine_mem_pool_free(void * base) noexcept;
|
||||
void spine_mem_pool_shared_mem_free(void * base) noexcept;
|
||||
|
||||
} // namespace ggml::cpu::riscv64_spacemit
|
||||
409
ggml/src/ggml-cpu/spacemit/spine_tcm.h
Normal file
409
ggml/src/ggml-cpu/spacemit/spine_tcm.h
Normal file
@@ -0,0 +1,409 @@
|
||||
#ifndef SPINE_TCM_PUBLIC_H_
|
||||
#define SPINE_TCM_PUBLIC_H_
|
||||
|
||||
/*
|
||||
* spine_tcm public API
|
||||
*
|
||||
* Usage:
|
||||
* 1. Direct link mode
|
||||
* Define SPINE_TCM_DIRECT_LINK and link against libspine_tcm.so.
|
||||
*
|
||||
* if (spine_tcm_is_available()) {
|
||||
* void *buffer = spine_tcm_mem_get(0);
|
||||
* spine_tcm_mem_free(0);
|
||||
* }
|
||||
*
|
||||
* 2. Header-only loader mode
|
||||
* Include this header without linking libspine_tcm.so. The loader first
|
||||
* tries to reuse a process-global spine_tcm instance and falls back to
|
||||
* dlopen("libspine_tcm.so") when needed.
|
||||
*
|
||||
* spine_tcm_open_handle(NULL); // optional pre-bind
|
||||
* if (spine_tcm_is_available()) {
|
||||
* void *buffer = spine_tcm_mem_get(0);
|
||||
* spine_tcm_mem_free(0);
|
||||
* }
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#if !defined(SPINE_TCM_BUILD_SHARED) && !defined(SPINE_TCM_DIRECT_LINK)
|
||||
# include <dlfcn.h>
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
# if defined(SPINE_TCM_BUILD_SHARED)
|
||||
# define SPINE_TCM_API __declspec(dllexport)
|
||||
# else
|
||||
# define SPINE_TCM_API __declspec(dllimport)
|
||||
# endif
|
||||
#else
|
||||
# define SPINE_TCM_API __attribute__((visibility("default")))
|
||||
#endif
|
||||
|
||||
typedef struct spine_tcm_mem_info {
|
||||
size_t blk_size;
|
||||
size_t blk_num;
|
||||
int is_fake_tcm;
|
||||
} spine_tcm_mem_info_t;
|
||||
|
||||
typedef struct spine_tcm_block_info {
|
||||
int id;
|
||||
void * va;
|
||||
size_t size;
|
||||
uint64_t phys_addr;
|
||||
uint64_t cpu_affinity_mask;
|
||||
int owner_tid;
|
||||
int is_acquired;
|
||||
} spine_tcm_block_info_t;
|
||||
|
||||
/* Shared-library runtime ABI exported by libspine_tcm.so. */
|
||||
SPINE_TCM_API const char * spine_tcm_runtime_version(void);
|
||||
SPINE_TCM_API int spine_tcm_runtime_is_available(void);
|
||||
SPINE_TCM_API int spine_tcm_runtime_layout_info(spine_tcm_mem_info_t * info);
|
||||
SPINE_TCM_API int spine_tcm_runtime_mem_info(int id, spine_tcm_block_info_t * info);
|
||||
SPINE_TCM_API void * spine_tcm_runtime_mem_get(int id);
|
||||
SPINE_TCM_API int spine_tcm_runtime_mem_free(int id);
|
||||
SPINE_TCM_API void * spine_tcm_runtime_mem_try_wait(int id, size_t timeout_us);
|
||||
SPINE_TCM_API int spine_tcm_runtime_mem_release(int id);
|
||||
SPINE_TCM_API int spine_tcm_runtime_mem_force_release(int id);
|
||||
SPINE_TCM_API int spine_tcm_runtime_mem_query(int id);
|
||||
|
||||
#if defined(SPINE_TCM_DIRECT_LINK)
|
||||
/* Optional no-op in direct-link mode. */
|
||||
static inline int spine_tcm_open_handle(const char * so_path) {
|
||||
(void) so_path;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline const char * spine_tcm_version(void) {
|
||||
return spine_tcm_runtime_version();
|
||||
}
|
||||
|
||||
/* Returns 1 when the runtime driver is available, otherwise 0. */
|
||||
static inline int spine_tcm_is_available(void) {
|
||||
return spine_tcm_runtime_is_available();
|
||||
}
|
||||
|
||||
/* Returns runtime memory geometry and whether the current backend is fake TCM. */
|
||||
static inline int spine_tcm_mem_info(spine_tcm_mem_info_t * info) {
|
||||
return spine_tcm_runtime_layout_info(info);
|
||||
}
|
||||
|
||||
/* Returns per-block runtime metadata for the given TCM id. */
|
||||
static inline int spine_tcm_block_info(int id, spine_tcm_block_info_t * info) {
|
||||
return spine_tcm_runtime_mem_info(id, info);
|
||||
}
|
||||
|
||||
/* Returns a cached buffer for the given TCM id, or NULL on failure. */
|
||||
static inline void * spine_tcm_mem_get(int id) {
|
||||
return spine_tcm_runtime_mem_get(id);
|
||||
}
|
||||
|
||||
/* Releases one reference acquired by spine_tcm_mem_get(id). */
|
||||
static inline int spine_tcm_mem_free(int id) {
|
||||
return spine_tcm_runtime_mem_free(id);
|
||||
}
|
||||
|
||||
/* Waits for a TCM block handoff and returns the driver-owned buffer when available. */
|
||||
static inline void * spine_tcm_mem_try_wait(int id, size_t over_time) {
|
||||
return spine_tcm_runtime_mem_try_wait(id, over_time);
|
||||
}
|
||||
|
||||
/* Releases a buffer acquired by spine_tcm_mem_try_wait(id, over_time). */
|
||||
static inline int spine_tcm_mem_release(int id) {
|
||||
return spine_tcm_runtime_mem_release(id);
|
||||
}
|
||||
|
||||
/* Forces a release for the given TCM id when the backend supports it. */
|
||||
static inline int spine_tcm_mem_force_release(int id) {
|
||||
return spine_tcm_runtime_mem_force_release(id);
|
||||
}
|
||||
|
||||
/* Returns whether the given TCM id is currently acquired. */
|
||||
static inline int spine_tcm_mem_query(int id) {
|
||||
return spine_tcm_runtime_mem_query(id);
|
||||
}
|
||||
#elif !defined(SPINE_TCM_BUILD_SHARED)
|
||||
typedef struct spine_tcm_handle {
|
||||
void * module_handle;
|
||||
int use_global_scope;
|
||||
int owns_module_handle;
|
||||
const char * (*runtime_version)(void);
|
||||
int (*runtime_is_available)(void);
|
||||
int (*runtime_layout_info)(spine_tcm_mem_info_t * info);
|
||||
int (*runtime_mem_info)(int id, spine_tcm_block_info_t * info);
|
||||
void * (*runtime_mem_get)(int id);
|
||||
int (*runtime_mem_free)(int id);
|
||||
void * (*runtime_mem_try_wait)(int id, size_t over_time);
|
||||
int (*runtime_mem_release)(int id);
|
||||
int (*runtime_mem_force_release)(int id);
|
||||
int (*runtime_mem_query)(int id);
|
||||
} spine_tcm_handle_t;
|
||||
|
||||
static inline spine_tcm_handle_t * spine_tcm_default_handle(void) {
|
||||
static spine_tcm_handle_t handle = { 0 };
|
||||
return &handle;
|
||||
}
|
||||
|
||||
static inline void spine_tcm_handle_reset(spine_tcm_handle_t * handle) {
|
||||
if (handle != NULL) {
|
||||
memset(handle, 0, sizeof(*handle));
|
||||
}
|
||||
}
|
||||
|
||||
static inline int spine_tcm_handle_bind(spine_tcm_handle_t * handle) {
|
||||
void * symbol_scope = handle->use_global_scope ? RTLD_DEFAULT : handle->module_handle;
|
||||
|
||||
handle->runtime_version = (const char * (*) (void) ) dlsym(symbol_scope, "spine_tcm_runtime_version");
|
||||
handle->runtime_is_available = (int (*)(void)) dlsym(symbol_scope, "spine_tcm_runtime_is_available");
|
||||
handle->runtime_layout_info =
|
||||
(int (*)(spine_tcm_mem_info_t *)) dlsym(symbol_scope, "spine_tcm_runtime_layout_info");
|
||||
handle->runtime_mem_info =
|
||||
(int (*)(int, spine_tcm_block_info_t *)) dlsym(symbol_scope, "spine_tcm_runtime_mem_info");
|
||||
handle->runtime_mem_get = (void * (*) (int) ) dlsym(symbol_scope, "spine_tcm_runtime_mem_get");
|
||||
handle->runtime_mem_free = (int (*)(int)) dlsym(symbol_scope, "spine_tcm_runtime_mem_free");
|
||||
handle->runtime_mem_try_wait = (void * (*) (int, size_t)) dlsym(symbol_scope, "spine_tcm_runtime_mem_try_wait");
|
||||
handle->runtime_mem_release = (int (*)(int)) dlsym(symbol_scope, "spine_tcm_runtime_mem_release");
|
||||
handle->runtime_mem_force_release = (int (*)(int)) dlsym(symbol_scope, "spine_tcm_runtime_mem_force_release");
|
||||
handle->runtime_mem_query = (int (*)(int)) dlsym(symbol_scope, "spine_tcm_runtime_mem_query");
|
||||
|
||||
return handle->runtime_version != NULL && handle->runtime_is_available != NULL &&
|
||||
handle->runtime_layout_info != NULL && handle->runtime_mem_info != NULL &&
|
||||
handle->runtime_mem_get != NULL && handle->runtime_mem_free != NULL &&
|
||||
handle->runtime_mem_try_wait != NULL && handle->runtime_mem_release != NULL &&
|
||||
handle->runtime_mem_force_release != NULL && handle->runtime_mem_query != NULL ?
|
||||
0 :
|
||||
-1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Try to bind against an already-loaded process-global spine_tcm instance.
|
||||
* The shared library exports spine_tcm_runtime_marker only for this probe.
|
||||
*/
|
||||
static inline int spine_tcm_try_bind_global(spine_tcm_handle_t * handle) {
|
||||
if (dlsym(RTLD_DEFAULT, "spine_tcm_runtime_marker") == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
handle->use_global_scope = 1;
|
||||
return spine_tcm_handle_bind(handle);
|
||||
}
|
||||
|
||||
/*
|
||||
* Optional pre-bind entry point.
|
||||
*
|
||||
* Behavior:
|
||||
* - Reuses an already-loaded global spine_tcm instance when available.
|
||||
* - Otherwise loads the shared library from so_path or the default soname.
|
||||
* - Repeated calls are safe and return 0 after the first successful bind.
|
||||
*/
|
||||
static inline int spine_tcm_open_handle(const char * so_path) {
|
||||
spine_tcm_handle_t * resolved = spine_tcm_default_handle();
|
||||
const char * library = (so_path != NULL && so_path[0] != '\0') ? so_path : "libspine_tcm.so";
|
||||
|
||||
if (resolved->module_handle != NULL || resolved->use_global_scope) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (spine_tcm_try_bind_global(resolved) == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
spine_tcm_handle_reset(resolved);
|
||||
|
||||
resolved->module_handle = dlopen(library, RTLD_LAZY | RTLD_GLOBAL);
|
||||
resolved->owns_module_handle = resolved->module_handle != NULL ? 1 : 0;
|
||||
|
||||
if (resolved->module_handle == NULL) {
|
||||
spine_tcm_handle_reset(resolved);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (spine_tcm_handle_bind(resolved) != 0) {
|
||||
if (resolved->owns_module_handle) {
|
||||
dlclose(resolved->module_handle);
|
||||
}
|
||||
spine_tcm_handle_reset(resolved);
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Returns 1 when the runtime driver is available, otherwise 0. */
|
||||
static inline int spine_tcm_is_available(void) {
|
||||
spine_tcm_handle_t * resolved = spine_tcm_default_handle();
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
(void) spine_tcm_open_handle(NULL);
|
||||
}
|
||||
|
||||
if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_is_available == NULL) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return resolved->runtime_is_available();
|
||||
}
|
||||
|
||||
/* Returns runtime memory geometry and whether the current backend is fake TCM. */
|
||||
static inline int spine_tcm_mem_info(spine_tcm_mem_info_t * info) {
|
||||
spine_tcm_handle_t * resolved = spine_tcm_default_handle();
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
(void) spine_tcm_open_handle(NULL);
|
||||
}
|
||||
|
||||
if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_layout_info == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return resolved->runtime_layout_info(info);
|
||||
}
|
||||
|
||||
static inline const char * spine_tcm_version(void) {
|
||||
spine_tcm_handle_t * resolved = spine_tcm_default_handle();
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
(void) spine_tcm_open_handle(NULL);
|
||||
}
|
||||
|
||||
if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_version == NULL) {
|
||||
return "unknown";
|
||||
}
|
||||
|
||||
return resolved->runtime_version();
|
||||
}
|
||||
|
||||
/* Returns per-block runtime metadata for the given TCM id. */
|
||||
static inline int spine_tcm_block_info(int id, spine_tcm_block_info_t * info) {
|
||||
spine_tcm_handle_t * resolved = spine_tcm_default_handle();
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
(void) spine_tcm_open_handle(NULL);
|
||||
}
|
||||
|
||||
if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_mem_info == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return resolved->runtime_mem_info(id, info);
|
||||
}
|
||||
|
||||
/* Returns a cached buffer for the given TCM id, or NULL on failure. */
|
||||
static inline void * spine_tcm_mem_get(int id) {
|
||||
spine_tcm_handle_t * resolved = spine_tcm_default_handle();
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
(void) spine_tcm_open_handle(NULL);
|
||||
}
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (resolved->runtime_mem_get == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return resolved->runtime_mem_get(id);
|
||||
}
|
||||
|
||||
/* Releases one reference acquired by spine_tcm_mem_get(id). */
|
||||
static inline int spine_tcm_mem_free(int id) {
|
||||
spine_tcm_handle_t * resolved = spine_tcm_default_handle();
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
(void) spine_tcm_open_handle(NULL);
|
||||
}
|
||||
|
||||
if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_mem_free == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return resolved->runtime_mem_free(id);
|
||||
}
|
||||
|
||||
/* Waits for a TCM block handoff and returns the driver-owned buffer when available. */
|
||||
static inline void * spine_tcm_mem_try_wait(int id, size_t over_time) {
|
||||
spine_tcm_handle_t * resolved = spine_tcm_default_handle();
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
(void) spine_tcm_open_handle(NULL);
|
||||
}
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (resolved->runtime_mem_try_wait == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return resolved->runtime_mem_try_wait(id, over_time);
|
||||
}
|
||||
|
||||
/* Releases a buffer acquired by spine_tcm_mem_try_wait(id, over_time). */
|
||||
static inline int spine_tcm_mem_release(int id) {
|
||||
spine_tcm_handle_t * resolved = spine_tcm_default_handle();
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
(void) spine_tcm_open_handle(NULL);
|
||||
}
|
||||
|
||||
if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_mem_release == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return resolved->runtime_mem_release(id);
|
||||
}
|
||||
|
||||
/* Forces a release for the given TCM id when the backend supports it. */
|
||||
static inline int spine_tcm_mem_force_release(int id) {
|
||||
spine_tcm_handle_t * resolved = spine_tcm_default_handle();
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
(void) spine_tcm_open_handle(NULL);
|
||||
}
|
||||
|
||||
if ((resolved->module_handle == NULL && !resolved->use_global_scope) ||
|
||||
resolved->runtime_mem_force_release == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return resolved->runtime_mem_force_release(id);
|
||||
}
|
||||
|
||||
/* Returns whether the given TCM id is currently acquired. */
|
||||
static inline int spine_tcm_mem_query(int id) {
|
||||
spine_tcm_handle_t * resolved = spine_tcm_default_handle();
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
(void) spine_tcm_open_handle(NULL);
|
||||
}
|
||||
|
||||
if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_mem_query == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return resolved->runtime_mem_query(id);
|
||||
}
|
||||
#else
|
||||
static inline const char * spine_tcm_version(void) {
|
||||
return spine_tcm_runtime_version();
|
||||
}
|
||||
#endif
|
||||
|
||||
#define SPINE_TCM_VERSION (spine_tcm_version())
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
971
ggml/src/ggml-cuda/allreduce.cu
Normal file
971
ggml/src/ggml-cuda/allreduce.cu
Normal file
@@ -0,0 +1,971 @@
|
||||
#include "allreduce.cuh"
|
||||
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
|
||||
#include "convert.cuh"
|
||||
#include "ggml-impl.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// CUDA AllReduce for tensor-parallel inference across two GPUs.
|
||||
//
|
||||
// Provides an in-place sum reduction over matching tensors on two CUDA
|
||||
// devices in the same process. Used by the tensor-split path alongside
|
||||
// NCCL; targets setups without NVLink, where data is exchanged between the
|
||||
// GPUs by staging it through pinned host memory over PCIe.
|
||||
//
|
||||
// Two reduction strategies are selected per call by tensor size:
|
||||
//
|
||||
// * Chunked kernel path (small reductions): a single CUDA kernel both
|
||||
// stages data through pinned host memory and performs the local sum.
|
||||
// Cross-GPU synchronization happens *inside the kernel* (busy-wait on
|
||||
// a host-memory flag), which keeps launch overhead low for the
|
||||
// latency-sensitive token-generation case.
|
||||
//
|
||||
// * Copy-engine path (large reductions): the transfer is split into
|
||||
// D2H + H2D cudaMemcpyAsync chunks driven by the GPU's copy engine,
|
||||
// followed by a small device-side add kernel. Cross-GPU
|
||||
// synchronization happens *outside the kernel*, via CUDA events
|
||||
// between streams. This keeps the compute engine free while large
|
||||
// transfers are in flight, which matters for prefill-sized tensors.
|
||||
// Reductions larger than the per-call inner cap are processed by an
|
||||
// outer chunker that issues sequential inner calls.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Cross-GPU signal mechanism
|
||||
//
|
||||
// One int per (slot, rank) pair in pinned host memory. Each AR call writes a
|
||||
// strictly increasing token (= the AR call number) into its own arrival int.
|
||||
// The peer spins until its read of the other's arrival int equals the token
|
||||
// it expects for this call -- a mismatch means the peer hasn't arrived yet.
|
||||
// Tokens never repeat over realistic call rates (32-bit int wraps in tens of
|
||||
// days at thousands of ARs/sec), so arrival ints don't need to be reset
|
||||
// between calls; we initialize once at pipeline init and let the values
|
||||
// accumulate.
|
||||
//
|
||||
// There is exactly one writer (the owning GPU) and one reader (the peer), so
|
||||
// we don't need atomics. A volatile store paired with __threadfence_system()
|
||||
// provides the release ordering that makes the D2H writes visible system-wide
|
||||
// before the arrival token is observed.
|
||||
//
|
||||
// atomicAdd_system() requires hostNativeAtomicSupported, which is unavailable
|
||||
// on PCIe-attached consumer GPUs without NVLink, so the volatile path is the
|
||||
// portable choice.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static __device__ __forceinline__ void ggml_cuda_ar_signal_set(int * p, int token) {
|
||||
*(volatile int *)p = token;
|
||||
}
|
||||
static __device__ __forceinline__ int ggml_cuda_ar_signal_get(const int * p) {
|
||||
return *(const volatile int *)p;
|
||||
}
|
||||
|
||||
// Byte spacing between adjacent arrival ints. 64 bytes (one cache line)
|
||||
// ensures each GPU/block's arrival slot lives on its own line, preventing
|
||||
// false-sharing stalls on the polling GPU.
|
||||
static constexpr size_t GGML_CUDA_AR_ARRIVAL_STRIDE = 64;
|
||||
|
||||
// Number of blocks the chunked kernel launches with. Each block stripes a
|
||||
// disjoint slice of the data and synchronizes through its own arrival-token
|
||||
// slot so multiple SMs can pump PCIe stores in parallel.
|
||||
static constexpr int GGML_CUDA_AR_KERNEL_BLOCKS = 8;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Chunked kernel AllReduce -- 2 GPUs, supports float, half, and bfloat16.
|
||||
//
|
||||
// Both GPUs run this kernel simultaneously on independent streams. sendbuf
|
||||
// and recvbuf live in T_dst (the caller's tensor type); host_mine / host_other
|
||||
// carry data in T_wire (the on-wire type, possibly narrower than T_dst -- e.g.
|
||||
// T_dst=F32 with T_wire=BF16 halves the bytes pushed across PCIe). When
|
||||
// T_dst == T_wire the casts below are no-ops.
|
||||
//
|
||||
// Each GPU runs three phases:
|
||||
//
|
||||
// Phase 1 (all threads): cast sendbuf (T_dst) -> T_wire and store as
|
||||
// single-instruction-width vectors into host_mine.
|
||||
// __threadfence_system() commits these writes to host
|
||||
// memory.
|
||||
// Phase 2 (thread 0): write token to arrival_mine; spin until
|
||||
// arrival_other == token.
|
||||
// Phase 3 (all threads): read T_wire vectors from host_other, cast
|
||||
// each element to T_dst, and sum with the local
|
||||
// sendbuf value (also rounded through T_wire so that
|
||||
// both GPUs truncate identically -- this guarantees
|
||||
// bit-equivalent results across the two devices).
|
||||
//
|
||||
// Multi-block: blocks stripe vectors across (gridDim.x * blockDim.x) global
|
||||
// threads to keep multiple SMs issuing PCIe stores in parallel. Each block
|
||||
// has its own arrival-token slot (offset by blockIdx.x * ARRIVAL_STRIDE);
|
||||
// thread 0 of each block signals/spins on that slot independently of other
|
||||
// blocks. Tail elements (the leftover < ELEMS_PER_VEC at the end) are
|
||||
// handled only by block 0 to avoid cross-block writes to the same slots.
|
||||
// ---------------------------------------------------------------------------
|
||||
template <typename T_dst, typename T_wire>
|
||||
static __global__ void ggml_cuda_ar_kernel(
|
||||
const T_dst * sendbuf,
|
||||
T_dst * recvbuf,
|
||||
T_wire * __restrict__ host_mine,
|
||||
const T_wire * __restrict__ host_other,
|
||||
int count,
|
||||
int * arrival_mine,
|
||||
int * arrival_other,
|
||||
int token) {
|
||||
|
||||
// Vector unit for the wire type, sized to the arch's widest single-instruction
|
||||
// copy (16 B on Volta+). Each phase-1 iter writes one vector to host memory;
|
||||
// each phase-3 iter reads one and produces ELEMS_PER_VEC sums.
|
||||
constexpr int ELEMS_PER_VEC = ggml_cuda_get_max_cpy_bytes() / sizeof(T_wire);
|
||||
constexpr int ARRIVAL_INTS = (int)(GGML_CUDA_AR_ARRIVAL_STRIDE / sizeof(int));
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const int nt = blockDim.x;
|
||||
const int bid = blockIdx.x;
|
||||
const int gtid = bid * nt + tid;
|
||||
const int gnt = gridDim.x * nt;
|
||||
const int count_vec = count / ELEMS_PER_VEC;
|
||||
const int tail = count_vec * ELEMS_PER_VEC;
|
||||
|
||||
// Phase 1: cast sendbuf (T_dst) -> host_mine (T_wire) and store as vectors.
|
||||
{
|
||||
for (int i = gtid; i < count_vec; i += gnt) {
|
||||
const int off = i * ELEMS_PER_VEC;
|
||||
T_wire wire[ELEMS_PER_VEC];
|
||||
#pragma unroll
|
||||
for (int k = 0; k < ELEMS_PER_VEC; ++k) {
|
||||
wire[k] = ggml_cuda_cast<T_wire>(sendbuf[off + k]);
|
||||
}
|
||||
ggml_cuda_memcpy_1<sizeof(wire)>(&host_mine[off], wire);
|
||||
}
|
||||
if (bid == 0 && tid < count - tail) {
|
||||
host_mine[tail + tid] = ggml_cuda_cast<T_wire>(sendbuf[tail + tid]);
|
||||
}
|
||||
}
|
||||
|
||||
// Commit this block's host writes before signalling.
|
||||
__threadfence_system();
|
||||
__syncthreads();
|
||||
|
||||
// Phase 2: thread 0 of each block signals on its own arrival slot, then
|
||||
// spins for the matching slot from peer. Per-block tokens mean blocks
|
||||
// proceed independently -- no inter-block barrier needed.
|
||||
if (tid == 0) {
|
||||
int * my_slot = arrival_mine + bid * ARRIVAL_INTS;
|
||||
const int * other_slot = arrival_other + bid * ARRIVAL_INTS;
|
||||
|
||||
ggml_cuda_ar_signal_set(my_slot, token);
|
||||
__threadfence_system(); // make our signal visible system-wide
|
||||
|
||||
while (ggml_cuda_ar_signal_get(other_slot) != token) {
|
||||
#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
__nanosleep(100);
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Acquire peer's host_other writes (this block's stripe of them).
|
||||
__threadfence_system();
|
||||
|
||||
// Phase 3: read peer's T_wire vector, cast both sides through T_wire for
|
||||
// bit-equivalence, sum in T_dst precision, and write back to recvbuf.
|
||||
{
|
||||
for (int i = gtid; i < count_vec; i += gnt) {
|
||||
const int off = i * ELEMS_PER_VEC;
|
||||
T_wire wire[ELEMS_PER_VEC];
|
||||
ggml_cuda_memcpy_1<sizeof(wire)>(wire, &host_other[off]);
|
||||
#pragma unroll
|
||||
for (int k = 0; k < ELEMS_PER_VEC; ++k) {
|
||||
const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[off + k]);
|
||||
recvbuf[off + k] = ggml_cuda_cast<T_dst>(
|
||||
ggml_cuda_cast<float>(d_low) + ggml_cuda_cast<float>(wire[k]));
|
||||
}
|
||||
}
|
||||
if (bid == 0 && tid < count - tail) {
|
||||
const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[tail + tid]);
|
||||
recvbuf[tail + tid] = ggml_cuda_cast<T_dst>(
|
||||
ggml_cuda_cast<float>(d_low) +
|
||||
ggml_cuda_cast<float>(host_other[tail + tid]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Combined load-convert-add kernel. The peer's contribution arrives as T_src
|
||||
// (which may be a lower-precision type than T_dst when the BF16 round-trip is
|
||||
// active). For bit-equivalence between the two GPUs, dst is first rounded
|
||||
// through T_src's precision via ggml_cuda_cast -- peer already truncated its
|
||||
// own value the same way before sending -- so both sides perform identical
|
||||
// arithmetic. When T_dst == T_src the round-trip cast is a no-op.
|
||||
template <typename T_dst, typename T_src>
|
||||
static __global__ void ggml_cuda_ar_add_kernel(
|
||||
T_dst * __restrict__ dst,
|
||||
const T_src * __restrict__ src,
|
||||
int count) {
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int nt = gridDim.x * blockDim.x;
|
||||
for (int i = tid; i < count; i += nt) {
|
||||
const T_src d_low = ggml_cuda_cast<T_src>(dst[i]);
|
||||
dst[i] = ggml_cuda_cast<T_dst>(
|
||||
ggml_cuda_cast<float>(d_low) + ggml_cuda_cast<float>(src[i]));
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Pipeline structure
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Number of slots in the event / arrival ring. Two slots is sufficient:
|
||||
// lockstep guarantees the two GPUs are at most one AR (or chunk) apart, so
|
||||
// slot[N%2] is always safe to reuse -- peer has already consumed slot[N%2]
|
||||
// from AR N-2 by the time we get to AR N. acquire_slot's
|
||||
// cudaEventSynchronize on ev.ker for both devices makes that consumption
|
||||
// explicit before we overwrite host_buf[slot] for the new AR.
|
||||
static constexpr int GGML_CUDA_AR_POOL_SIZE = 2;
|
||||
|
||||
// Maximum chunk size (bytes per GPU) handled by one chunked kernel launch.
|
||||
// Larger tensors are reduced by issuing multiple chunked launches.
|
||||
static constexpr size_t GGML_CUDA_AR_MAX_BYTES = 1024 * 1024; // 1 MB
|
||||
|
||||
// Copy-engine path: largest tensor accepted on this path; sets host_large /
|
||||
// dev_tmp allocation size.
|
||||
static constexpr size_t GGML_CUDA_AR_COPY_MAX_BYTES = 32 * 1024 * 1024; // 32 MB
|
||||
|
||||
// AR wire size at which the copy-engine path takes over from the chunked-
|
||||
// kernel path. Override via GGML_CUDA_AR_COPY_THRESHOLD.
|
||||
static constexpr size_t GGML_CUDA_AR_COPY_THRESHOLD_DEFAULT = 1024 * 1024; // 1 MB
|
||||
// Per-call CE chunk-size heuristic: chunk_bytes = clamp(nbytes / 4, MIN, MAX).
|
||||
// The /4 keeps ~4 chunks in flight at any moment (good D2H/H2D overlap with
|
||||
// the peer); the clamps cover the cases where nbytes/4 is too small (per-
|
||||
// memcpy fixed cost dominates) or too large (chunk-level pipelining stalls).
|
||||
// Env var GGML_CUDA_AR_COPY_CHUNK_BYTES can override with a fixed value.
|
||||
static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MIN = 512 * 1024; // 512 KB
|
||||
static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MAX = 2 * 1024 * 1024; // 2 MB
|
||||
// Absolute floor that an env-var override is allowed to set; this caps the
|
||||
// per-slot copy-event array. 256 KB -> up to 128 chunks per 32 MB tensor.
|
||||
static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN = 256 * 1024;
|
||||
static constexpr int GGML_CUDA_AR_COPY_MAX_CHUNKS =
|
||||
static_cast<int>((GGML_CUDA_AR_COPY_MAX_BYTES + GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN - 1) /
|
||||
GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN);
|
||||
|
||||
struct ggml_cuda_ar_event_slot {
|
||||
cudaEvent_t app = nullptr; // upstream computation complete
|
||||
cudaEvent_t cpy[GGML_CUDA_AR_COPY_MAX_CHUNKS] = {}; // copy-engine D2H chunks complete
|
||||
cudaEvent_t h2d = nullptr; // copy-engine H2Ds complete (handoff AR stream -> compute stream)
|
||||
cudaEvent_t ker = nullptr; // AllReduce kernel complete
|
||||
};
|
||||
|
||||
// Mapped pinned host allocation: cudaHostAlloc + cudaHostGetDevicePointer
|
||||
// in one place, with the host handle preserved for cudaFreeHost. Used where
|
||||
// the CPU never touches the buffer -- only the device reads/writes via the
|
||||
// mapped device pointer. Required on systems where cudaDevAttrCanUseHost-
|
||||
// PointerForRegisteredMem is 0 and the host pointer can't be used as a
|
||||
// device pointer.
|
||||
struct ggml_cuda_ar_host_mapping {
|
||||
uint8_t * host = nullptr; // cudaFreeHost handle; also the H-side ptr for cudaMemcpyAsync
|
||||
uint8_t * dev = nullptr; // device-side pointer for kernels / cudaMemset
|
||||
|
||||
cudaError_t alloc(size_t bytes) {
|
||||
cudaError_t rc = cudaHostAlloc(reinterpret_cast<void **>(&host), bytes,
|
||||
cudaHostAllocPortable | cudaHostAllocMapped);
|
||||
if (rc != cudaSuccess) {
|
||||
host = nullptr;
|
||||
return rc;
|
||||
}
|
||||
rc = cudaHostGetDevicePointer(reinterpret_cast<void **>(&dev), host, 0);
|
||||
if (rc != cudaSuccess) {
|
||||
cudaFreeHost(host);
|
||||
host = nullptr;
|
||||
dev = nullptr;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
void free() {
|
||||
if (host) {
|
||||
cudaFreeHost(host);
|
||||
host = nullptr;
|
||||
dev = nullptr;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_cuda_ar_pipeline {
|
||||
int n_devices;
|
||||
int devices[GGML_CUDA_MAX_DEVICES];
|
||||
size_t buf_bytes; // bytes per device in host_buf[]
|
||||
size_t copy_bytes; // bytes per device in host_large[] / dev_tmp[]
|
||||
size_t copy_threshold;
|
||||
size_t copy_chunk_bytes;
|
||||
size_t bf16_threshold; // tensors >= this size (bytes) are reduced via FP32->BF16 round-trip; 0 disables
|
||||
uint64_t call_count;
|
||||
|
||||
// Per-device resources.
|
||||
ggml_cuda_ar_host_mapping host_buf[GGML_CUDA_MAX_DEVICES]; // pinned staging (chunked kernel)
|
||||
ggml_cuda_ar_host_mapping host_large[GGML_CUDA_MAX_DEVICES]; // pinned staging (copy-engine)
|
||||
char * dev_tmp[GGML_CUDA_MAX_DEVICES]; // device scratch for copy-engine path
|
||||
cudaStream_t streams[GGML_CUDA_MAX_DEVICES]; // non-blocking
|
||||
ggml_cuda_ar_event_slot ev_pool[GGML_CUDA_MAX_DEVICES][GGML_CUDA_AR_POOL_SIZE];
|
||||
|
||||
// Copy-engine: per-device "I finished reading my peer's host_large"
|
||||
// event. Indexed by RECORDER device. Recorded same-device on streams[i]
|
||||
// after stage 2's last H2D from host_large[peer]. Waited cross-device
|
||||
// by peer's stage-1 stream before the next AR overwrites host_large[peer].
|
||||
cudaEvent_t host_large_read_done[GGML_CUDA_MAX_DEVICES];
|
||||
bool host_large_read_done_valid;
|
||||
|
||||
// Copy-engine: per-device "my add_kernel is done with dev_tmp" event.
|
||||
// Recorded on the compute stream after each add_kernel; the AR stream
|
||||
// waits on it before the next copy_impl's H2D overwrites dev_tmp. Lets us
|
||||
// single-buffer dev_tmp despite add_kernel running on a separate stream.
|
||||
cudaEvent_t dev_tmp_kernel_done[GGML_CUDA_MAX_DEVICES];
|
||||
bool dev_tmp_kernel_done_valid;
|
||||
|
||||
// Arrival ring: ARRIVAL_STRIDE bytes between adjacent ints. Mapped pinned
|
||||
// memory; CPU never reads/writes -- only the kernel and cudaMemset.
|
||||
// Use ggml_cuda_ar_arrival_ptr() to index.
|
||||
ggml_cuda_ar_host_mapping arrival;
|
||||
};
|
||||
|
||||
// Base pointer for the (slot, rank) per-block token block. The kernel adds
|
||||
// blockIdx.x * (ARRIVAL_STRIDE/sizeof(int)) internally to land on its own slot.
|
||||
static int * ggml_cuda_ar_arrival_ptr(const ggml_cuda_ar_pipeline * p, int slot, int rank) {
|
||||
const size_t offset = ((size_t)slot * p->n_devices + rank) *
|
||||
GGML_CUDA_AR_KERNEL_BLOCKS * GGML_CUDA_AR_ARRIVAL_STRIDE;
|
||||
return reinterpret_cast<int *>(p->arrival.dev + offset);
|
||||
}
|
||||
|
||||
static uint64_t ggml_cuda_ar_env_u64(const char * name, uint64_t default_value) {
|
||||
const char * value = getenv(name);
|
||||
if (value == nullptr || value[0] == '\0') {
|
||||
return default_value;
|
||||
}
|
||||
|
||||
char * end = nullptr;
|
||||
const unsigned long long parsed = strtoull(value, &end, 10);
|
||||
return end != value ? (uint64_t) parsed : default_value;
|
||||
}
|
||||
|
||||
struct ggml_cuda_ar_slot_info {
|
||||
int slot;
|
||||
int token;
|
||||
};
|
||||
|
||||
static ggml_cuda_ar_slot_info ggml_cuda_ar_acquire_slot(ggml_cuda_ar_pipeline * p) {
|
||||
const int slot = static_cast<int>(p->call_count % GGML_CUDA_AR_POOL_SIZE);
|
||||
const bool pool_lapped = p->call_count >= GGML_CUDA_AR_POOL_SIZE;
|
||||
p->call_count++;
|
||||
|
||||
if (pool_lapped) {
|
||||
for (int i = 0; i < p->n_devices; ++i) {
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
CUDA_CHECK(cudaEventSynchronize(p->ev_pool[i][slot].ker));
|
||||
}
|
||||
}
|
||||
|
||||
return { slot, (int) p->call_count };
|
||||
}
|
||||
|
||||
// Per-AR copy-engine chunk size: env-var override if set, else heuristic
|
||||
// (clamp(nbytes/4, HEURISTIC_MIN, HEURISTIC_MAX)).
|
||||
static size_t ggml_cuda_ar_chunk_bytes(const ggml_cuda_ar_pipeline * p, size_t nbytes) {
|
||||
if (p->copy_chunk_bytes > 0) {
|
||||
return p->copy_chunk_bytes;
|
||||
}
|
||||
return std::min(GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MAX,
|
||||
std::max(GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MIN, nbytes / 4));
|
||||
}
|
||||
|
||||
static void ggml_cuda_ar_wait_for_compute(
|
||||
ggml_cuda_ar_pipeline * p, ggml_backend_cuda_context * cuda_ctx, int rank, int slot) {
|
||||
ggml_cuda_ar_event_slot & ev = p->ev_pool[rank][slot];
|
||||
CUDA_CHECK(cudaEventRecord(ev.app, cuda_ctx->stream()));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(p->streams[rank], ev.app));
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Init / free
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(const int * devices, size_t n_devices) {
|
||||
|
||||
if (n_devices != 2) {
|
||||
GGML_LOG_DEBUG("%s: internal AllReduce only supports n_devices=2 (got %zu); "
|
||||
"falling back\n", __func__, n_devices);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// The chunked kernel uses __nanosleep, which is sm70+ (Volta+).
|
||||
for (size_t i = 0; i < n_devices; ++i) {
|
||||
const int cc = ggml_cuda_info().devices[devices[i]].cc;
|
||||
if (cc < GGML_CUDA_CC_VOLTA) {
|
||||
GGML_LOG_DEBUG("%s: internal AllReduce requires compute capability >= %d "
|
||||
"(device %d has cc=%d); falling back\n",
|
||||
__func__, GGML_CUDA_CC_VOLTA, devices[i], cc);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
auto * p = new ggml_cuda_ar_pipeline{};
|
||||
p->n_devices = n_devices;
|
||||
p->copy_bytes = GGML_CUDA_AR_COPY_MAX_BYTES;
|
||||
p->copy_threshold = ggml_cuda_ar_env_u64("GGML_CUDA_AR_COPY_THRESHOLD", GGML_CUDA_AR_COPY_THRESHOLD_DEFAULT);
|
||||
// 0 = use the per-call heuristic (default). Non-zero env value forces a
|
||||
// fixed chunk size for diagnostics, with a floor at COPY_CHUNK_BYTES_MIN.
|
||||
p->copy_chunk_bytes = ggml_cuda_ar_env_u64("GGML_CUDA_AR_COPY_CHUNK_BYTES", 0);
|
||||
if (p->copy_chunk_bytes > 0 && p->copy_chunk_bytes < GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN) {
|
||||
GGML_LOG_WARN("%s: GGML_CUDA_AR_COPY_CHUNK_BYTES=%zu below minimum %zu; clamping\n",
|
||||
__func__, p->copy_chunk_bytes, GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN);
|
||||
p->copy_chunk_bytes = GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN;
|
||||
}
|
||||
// Default 1: BF16 round-trip is always on for F32 inputs (any non-zero
|
||||
// ne). Set GGML_CUDA_AR_BF16_THRESHOLD=0 to disable, or to a larger
|
||||
// byte threshold to opt out for small tensors.
|
||||
p->bf16_threshold = ggml_cuda_ar_env_u64("GGML_CUDA_AR_BF16_THRESHOLD", 1);
|
||||
for (size_t i = 0; i < n_devices; ++i) {
|
||||
p->devices[i] = devices[i];
|
||||
}
|
||||
|
||||
// Per-device streams and event pools.
|
||||
for (size_t i = 0; i < n_devices; ++i) {
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
|
||||
cudaStream_t stream = nullptr;
|
||||
if (cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) != cudaSuccess) {
|
||||
GGML_LOG_ERROR("%s: cudaStreamCreateWithFlags failed for device %d\n",
|
||||
__func__, p->devices[i]);
|
||||
ggml_cuda_ar_pipeline_free(p);
|
||||
return nullptr;
|
||||
}
|
||||
p->streams[i] = stream;
|
||||
|
||||
for (int s = 0; s < GGML_CUDA_AR_POOL_SIZE; ++s) {
|
||||
bool ok =
|
||||
cudaEventCreateWithFlags(&p->ev_pool[i][s].app, cudaEventDisableTiming) == cudaSuccess &&
|
||||
cudaEventCreateWithFlags(&p->ev_pool[i][s].h2d, cudaEventDisableTiming) == cudaSuccess &&
|
||||
cudaEventCreateWithFlags(&p->ev_pool[i][s].ker, cudaEventDisableTiming) == cudaSuccess;
|
||||
for (int c = 0; ok && c < GGML_CUDA_AR_COPY_MAX_CHUNKS; ++c) {
|
||||
ok = cudaEventCreateWithFlags(&p->ev_pool[i][s].cpy[c], cudaEventDisableTiming) == cudaSuccess;
|
||||
}
|
||||
if (!ok) {
|
||||
GGML_LOG_ERROR("%s: cudaEventCreate failed for device %d slot %d\n",
|
||||
__func__, p->devices[i], s);
|
||||
ggml_cuda_ar_pipeline_free(p);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
if (cudaEventCreateWithFlags(&p->host_large_read_done[i], cudaEventDisableTiming) != cudaSuccess) {
|
||||
GGML_LOG_ERROR("%s: cudaEventCreate for host_large_read_done failed for device %d\n",
|
||||
__func__, p->devices[i]);
|
||||
ggml_cuda_ar_pipeline_free(p);
|
||||
return nullptr;
|
||||
}
|
||||
if (cudaEventCreateWithFlags(&p->dev_tmp_kernel_done[i], cudaEventDisableTiming) != cudaSuccess) {
|
||||
GGML_LOG_ERROR("%s: cudaEventCreate for dev_tmp_kernel_done failed for device %d\n",
|
||||
__func__, p->devices[i]);
|
||||
ggml_cuda_ar_pipeline_free(p);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// Arrival ring: cache-line padded so each GPU's int is on its own line.
|
||||
const size_t arrival_bytes =
|
||||
(size_t)GGML_CUDA_AR_POOL_SIZE * n_devices *
|
||||
GGML_CUDA_AR_KERNEL_BLOCKS * GGML_CUDA_AR_ARRIVAL_STRIDE;
|
||||
if (p->arrival.alloc(arrival_bytes) != cudaSuccess) {
|
||||
GGML_LOG_ERROR("%s: alloc for arrival ring failed (%zu bytes)\n",
|
||||
__func__, arrival_bytes);
|
||||
ggml_cuda_ar_pipeline_free(p);
|
||||
return nullptr;
|
||||
}
|
||||
ggml_cuda_set_device(p->devices[0]);
|
||||
if (cudaMemset(p->arrival.dev, 0, arrival_bytes) != cudaSuccess) {
|
||||
GGML_LOG_ERROR("%s: cudaMemset for arrival ring failed (%zu bytes)\n",
|
||||
__func__, arrival_bytes);
|
||||
ggml_cuda_ar_pipeline_free(p);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Per-device pinned staging buffers -- POOL_SIZE-deep ring so the chunked-
|
||||
// kernel can write the next slot's data while the peer is still reading
|
||||
// the previous slot's. Indexed by (slot * buf_bytes) at the call site.
|
||||
p->buf_bytes = GGML_CUDA_AR_MAX_BYTES;
|
||||
const size_t host_buf_total = (size_t) GGML_CUDA_AR_POOL_SIZE * p->buf_bytes;
|
||||
for (size_t i = 0; i < n_devices; ++i) {
|
||||
if (p->host_buf[i].alloc(host_buf_total) != cudaSuccess) {
|
||||
GGML_LOG_ERROR("%s: alloc for staging failed (%zu bytes)\n",
|
||||
__func__, host_buf_total);
|
||||
ggml_cuda_ar_pipeline_free(p);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// Copy-engine path: pinned host staging + device scratch, sized for the
|
||||
// largest tensor we accept on this path (GGML_CUDA_AR_COPY_MAX_BYTES).
|
||||
// dev_tmp is single-buffered; cross-AR safety is enforced by an explicit
|
||||
// cross-stream wait in copy_impl on the prior AR's add_kernel-done event.
|
||||
for (size_t i = 0; i < n_devices; ++i) {
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
if (p->host_large[i].alloc(p->copy_bytes) != cudaSuccess) {
|
||||
GGML_LOG_ERROR("%s: alloc for large staging failed (%zu bytes)\n",
|
||||
__func__, p->copy_bytes);
|
||||
ggml_cuda_ar_pipeline_free(p);
|
||||
return nullptr;
|
||||
}
|
||||
if (cudaMalloc(reinterpret_cast<void **>(&p->dev_tmp[i]), p->copy_bytes) != cudaSuccess) {
|
||||
GGML_LOG_ERROR("%s: cudaMalloc for copy scratch failed (%zu bytes) on device %d\n",
|
||||
__func__, p->copy_bytes, p->devices[i]);
|
||||
ggml_cuda_ar_pipeline_free(p);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
GGML_LOG_INFO("%s: initialized AllReduce pipeline: %zu GPUs, "
|
||||
"%zu KB chunked kernel staging + %zu MB copy-engine staging per GPU\n",
|
||||
__func__, n_devices, p->buf_bytes >> 10, p->copy_bytes >> 20);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * p) {
|
||||
if (!p) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Drain all in-flight kernels before tearing down resources.
|
||||
for (int i = 0; i < p->n_devices; ++i) {
|
||||
if (p->streams[i]) {
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
cudaStreamSynchronize(p->streams[i]);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < p->n_devices; ++i) {
|
||||
p->host_buf[i].free();
|
||||
p->host_large[i].free();
|
||||
if (p->dev_tmp[i]) {
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
cudaFree(p->dev_tmp[i]);
|
||||
}
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
for (int s = 0; s < GGML_CUDA_AR_POOL_SIZE; ++s) {
|
||||
if (p->ev_pool[i][s].app) { cudaEventDestroy(p->ev_pool[i][s].app); }
|
||||
for (int c = 0; c < GGML_CUDA_AR_COPY_MAX_CHUNKS; ++c) {
|
||||
if (p->ev_pool[i][s].cpy[c]) { cudaEventDestroy(p->ev_pool[i][s].cpy[c]); }
|
||||
}
|
||||
if (p->ev_pool[i][s].h2d) { cudaEventDestroy(p->ev_pool[i][s].h2d); }
|
||||
if (p->ev_pool[i][s].ker) { cudaEventDestroy(p->ev_pool[i][s].ker); }
|
||||
}
|
||||
if (p->host_large_read_done[i]) {
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
cudaEventDestroy(p->host_large_read_done[i]);
|
||||
}
|
||||
if (p->dev_tmp_kernel_done[i]) {
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
cudaEventDestroy(p->dev_tmp_kernel_done[i]);
|
||||
}
|
||||
if (p->streams[i]) {
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
cudaStreamDestroy(p->streams[i]);
|
||||
}
|
||||
}
|
||||
p->arrival.free();
|
||||
delete p;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Dispatch
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Asymmetric copy_impl: data sent over PCIe in T_src precision (one element of
|
||||
// nbytes per ne element); accumulated locally into a T_dst buffer. When
|
||||
// T_src == T_dst this is the original homogeneous reduction. When they differ
|
||||
// (e.g. BF16 wire / F32 accumulator) the add kernel rounds dst through T_src
|
||||
// for bit-equivalence between GPUs and we skip the otherwise-needed
|
||||
// post-conversion entirely.
|
||||
template <typename T_src, typename T_dst>
|
||||
static bool ggml_cuda_ar_allreduce_copy_impl(
|
||||
ggml_cuda_ar_pipeline * p,
|
||||
ggml_backend_t * backends,
|
||||
T_src * const src_buf[GGML_CUDA_MAX_DEVICES],
|
||||
T_dst * const dst_buf[GGML_CUDA_MAX_DEVICES],
|
||||
const bool compute[GGML_CUDA_MAX_DEVICES],
|
||||
int64_t ne,
|
||||
size_t nbytes) {
|
||||
GGML_ASSERT(p->n_devices == 2);
|
||||
GGML_ASSERT(nbytes <= p->copy_bytes);
|
||||
GGML_ASSERT(ne <= std::numeric_limits<int>::max());
|
||||
|
||||
const size_t chunk_bytes = ggml_cuda_ar_chunk_bytes(p, nbytes);
|
||||
GGML_ASSERT(chunk_bytes > 0);
|
||||
|
||||
const int slot = ggml_cuda_ar_acquire_slot(p).slot;
|
||||
const size_t copy_chunks = (nbytes + chunk_bytes - 1) / chunk_bytes;
|
||||
GGML_ASSERT(copy_chunks <= GGML_CUDA_AR_COPY_MAX_CHUNKS);
|
||||
|
||||
ggml_backend_cuda_context * cuda_ctx[2] = {};
|
||||
|
||||
// Stage 1: both GPUs copy their local contribution to pinned host memory.
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
cuda_ctx[i] = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
|
||||
GGML_ASSERT(cuda_ctx[i]->device == p->devices[i]);
|
||||
|
||||
ggml_cuda_ar_wait_for_compute(p, cuda_ctx[i], i, slot);
|
||||
|
||||
// Wait for peer's H2D from our host_large[i] (recorded in the
|
||||
// previous AR's stage 2) to complete before we overwrite host_large[i].
|
||||
// host_large_read_done[peer] = peer finished reading host_large[i].
|
||||
// No-op on the first AR -- no prior record exists.
|
||||
if (p->host_large_read_done_valid) {
|
||||
const int peer = 1 - i;
|
||||
CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->host_large_read_done[peer]));
|
||||
}
|
||||
|
||||
if (!compute[i]) {
|
||||
CUDA_CHECK(cudaMemsetAsync(src_buf[i], 0, nbytes, p->streams[i]));
|
||||
}
|
||||
|
||||
for (size_t c = 0; c < copy_chunks; ++c) {
|
||||
const size_t offset = c * chunk_bytes;
|
||||
const size_t this_bytes = (nbytes - offset) < chunk_bytes ?
|
||||
(nbytes - offset) : chunk_bytes;
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync(
|
||||
p->host_large[i].host + offset, reinterpret_cast<char *>(src_buf[i]) + offset, this_bytes,
|
||||
cudaMemcpyDeviceToHost, p->streams[i]));
|
||||
CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].cpy[c], p->streams[i]));
|
||||
}
|
||||
}
|
||||
|
||||
// Stage 2: each GPU waits for each peer D2H chunk, pulls that chunk back to
|
||||
// local device scratch (dev_tmp), then performs one device-local add over
|
||||
// the assembled peer tensor. The H2Ds run on the AR stream (copy engine)
|
||||
// and the add_kernel runs on the caller's compute stream, so the AR stream
|
||||
// stays pure-copy and avoids an in-stream copy->compute engine switch every
|
||||
// AR. dev_tmp is single-buffered: the AR stream waits cross-stream on the
|
||||
// prior AR's add_kernel-done event before overwriting it.
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
const int peer = 1 - i;
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
|
||||
// Wait for the previous AR's add_kernel (on the compute stream) to
|
||||
// finish reading dev_tmp before our H2D overwrites it. No-op on the
|
||||
// first copy_impl call.
|
||||
if (p->dev_tmp_kernel_done_valid) {
|
||||
CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->dev_tmp_kernel_done[i]));
|
||||
}
|
||||
|
||||
for (size_t c = 0; c < copy_chunks; ++c) {
|
||||
const size_t offset = c * chunk_bytes;
|
||||
const size_t this_bytes = (nbytes - offset) < chunk_bytes ?
|
||||
(nbytes - offset) : chunk_bytes;
|
||||
|
||||
CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->ev_pool[peer][slot].cpy[c]));
|
||||
CUDA_CHECK(cudaMemcpyAsync(
|
||||
p->dev_tmp[i] + offset, p->host_large[peer].host + offset, this_bytes,
|
||||
cudaMemcpyHostToDevice, p->streams[i]));
|
||||
}
|
||||
|
||||
// Mark our reads of host_large[peer] complete so peer's next AR can
|
||||
// safely overwrite it.
|
||||
CUDA_CHECK(cudaEventRecord(p->host_large_read_done[i], p->streams[i]));
|
||||
|
||||
// Hand off from AR stream (copy engine) to compute stream: compute
|
||||
// stream waits for all H2Ds to finish, then runs the add_kernel.
|
||||
CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].h2d, p->streams[i]));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx[i]->stream(), p->ev_pool[i][slot].h2d));
|
||||
|
||||
const int block_size = 256;
|
||||
int n_blocks = (int) ((ne + block_size - 1) / block_size);
|
||||
if (n_blocks > 1024) {
|
||||
n_blocks = 1024;
|
||||
}
|
||||
ggml_cuda_ar_add_kernel<T_dst, T_src><<<n_blocks, block_size, 0, cuda_ctx[i]->stream()>>>(
|
||||
dst_buf[i],
|
||||
reinterpret_cast<const T_src *>(p->dev_tmp[i]),
|
||||
(int) ne);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
// Record dev_tmp-released on the compute stream so the next copy_impl
|
||||
// can wait for the kernel to finish before overwriting dev_tmp. Also
|
||||
// record AR-done as ev.ker for acquire_slot's pool-wraparound sync.
|
||||
CUDA_CHECK(cudaEventRecord(p->dev_tmp_kernel_done[i], cuda_ctx[i]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].ker, cuda_ctx[i]->stream()));
|
||||
}
|
||||
p->host_large_read_done_valid = true;
|
||||
p->dev_tmp_kernel_done_valid = true;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Outer-level chunker: copy_impl handles up to copy_bytes per call (limited by
|
||||
// the host_large / dev_tmp allocation size). When the full AR exceeds that,
|
||||
// slice the tensor into copy_bytes-sized pieces and call copy_impl repeatedly.
|
||||
// Each slice goes through its own stage 1 -> stage 2 cycle and acquires its own
|
||||
// slot, so cross-AR fences and pool wraparound work the same way as for any
|
||||
// other sequence of small ARs.
|
||||
template <typename T_src, typename T_dst>
|
||||
static bool ggml_cuda_ar_allreduce_copy_outer(
|
||||
ggml_cuda_ar_pipeline * p,
|
||||
ggml_backend_t * backends,
|
||||
T_src * const src_buf[GGML_CUDA_MAX_DEVICES],
|
||||
T_dst * const dst_buf[GGML_CUDA_MAX_DEVICES],
|
||||
const bool compute[GGML_CUDA_MAX_DEVICES],
|
||||
int64_t ne) {
|
||||
const int64_t outer_max_elems = (int64_t) (p->copy_bytes / sizeof(T_src));
|
||||
GGML_ASSERT(outer_max_elems > 0);
|
||||
|
||||
bool ok = true;
|
||||
for (int64_t outer_start = 0; outer_start < ne && ok; outer_start += outer_max_elems) {
|
||||
const int64_t outer_ne = std::min(outer_max_elems, ne - outer_start);
|
||||
const size_t outer_nbytes = (size_t) outer_ne * sizeof(T_src);
|
||||
|
||||
T_src * src[GGML_CUDA_MAX_DEVICES] = {};
|
||||
T_dst * dst[GGML_CUDA_MAX_DEVICES] = {};
|
||||
for (int i = 0; i < p->n_devices; ++i) {
|
||||
src[i] = src_buf[i] + outer_start;
|
||||
dst[i] = dst_buf[i] + outer_start;
|
||||
}
|
||||
ok = ggml_cuda_ar_allreduce_copy_impl<T_src, T_dst>(
|
||||
p, backends, src, dst, compute, outer_ne, outer_nbytes);
|
||||
}
|
||||
return ok;
|
||||
}
|
||||
|
||||
bool ggml_cuda_ar_allreduce(
|
||||
ggml_cuda_ar_pipeline * p,
|
||||
ggml_backend_t * backends,
|
||||
ggml_tensor ** tensors) {
|
||||
GGML_ASSERT(p != nullptr);
|
||||
|
||||
const int n = p->n_devices;
|
||||
GGML_ASSERT(n == 2);
|
||||
|
||||
const ggml_type input_type = tensors[0]->type;
|
||||
GGML_ASSERT(input_type == GGML_TYPE_F32 || input_type == GGML_TYPE_F16 || input_type == GGML_TYPE_BF16);
|
||||
|
||||
const int64_t ne = ggml_nelements(tensors[0]);
|
||||
GGML_ASSERT(ne > 0);
|
||||
|
||||
const size_t input_nbytes = ggml_nbytes(tensors[0]);
|
||||
|
||||
// BF16 round-trip: F32 inputs >= bf16_threshold are converted to BF16 for
|
||||
// the reduction (chunked or copy-engine), halving on-wire bytes. Matches
|
||||
// NCCL's behaviour. The pre-conversion zeroes inactive shards so the
|
||||
// inner paths see them as already-prepared compute tensors.
|
||||
const bool use_bf16 =
|
||||
input_type == GGML_TYPE_F32 &&
|
||||
p->bf16_threshold > 0 &&
|
||||
input_nbytes >= p->bf16_threshold;
|
||||
|
||||
const ggml_type kernel_type = use_bf16 ? GGML_TYPE_BF16 : input_type;
|
||||
const size_t type_size = ggml_type_size(kernel_type);
|
||||
GGML_ASSERT(p->buf_bytes >= type_size);
|
||||
const size_t nbytes = (size_t) ne * type_size;
|
||||
|
||||
bool compute_flag[GGML_CUDA_MAX_DEVICES] = {};
|
||||
for (int i = 0; i < n; ++i) {
|
||||
compute_flag[i] = (tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) != 0;
|
||||
}
|
||||
|
||||
// Decide between copy-engine and chunked kernel paths based on the working
|
||||
// type's actual byte count. No upper bound: copy_outer slices reductions
|
||||
// larger than copy_bytes into copy_bytes-sized pieces.
|
||||
const bool use_copy_engine =
|
||||
p->copy_threshold > 0 &&
|
||||
nbytes >= p->copy_threshold;
|
||||
|
||||
// BF16 inactive-shard zeroing: when use_bf16 is on, the combined kernel
|
||||
// (chunked kernel path) and the combined add kernel (copy_engine path)
|
||||
// both accumulate into the F32 tensor data directly, so an inactive
|
||||
// shard's accumulator must start at zero.
|
||||
if (use_bf16) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
if (!compute_flag[i]) {
|
||||
auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
|
||||
GGML_ASSERT(cuda_ctx->device == p->devices[i]);
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
CUDA_CHECK(cudaMemsetAsync(tensors[i]->data, 0, (size_t) ne * sizeof(float), cuda_ctx->stream()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Pre-convert F32 -> BF16 into bf16_tmp ONLY for the copy_engine + use_bf16
|
||||
// path; the chunked kernel path's combined kernel does the conversion
|
||||
// inline as it writes to host_buf.
|
||||
ggml_cuda_pool_alloc<nv_bfloat16> bf16_tmp[GGML_CUDA_MAX_DEVICES];
|
||||
void * copy_src_ptr[GGML_CUDA_MAX_DEVICES] = {};
|
||||
|
||||
if (use_copy_engine && use_bf16) {
|
||||
to_bf16_cuda_t to_bf16 = ggml_get_to_bf16_cuda(GGML_TYPE_F32);
|
||||
for (int i = 0; i < n; ++i) {
|
||||
auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
|
||||
GGML_ASSERT(cuda_ctx->device == p->devices[i]);
|
||||
bf16_tmp[i].pool = &cuda_ctx->pool();
|
||||
bf16_tmp[i].alloc(ne);
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
if (compute_flag[i]) {
|
||||
to_bf16(tensors[i]->data, bf16_tmp[i].get(), ne, cuda_ctx->stream());
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
} else {
|
||||
CUDA_CHECK(cudaMemsetAsync(bf16_tmp[i].get(), 0, nbytes, cuda_ctx->stream()));
|
||||
}
|
||||
copy_src_ptr[i] = bf16_tmp[i].get();
|
||||
}
|
||||
}
|
||||
|
||||
bool ok = true;
|
||||
if (use_copy_engine) {
|
||||
// After up-front BF16 conversion, the tmp buffers already hold the
|
||||
// (possibly zeroed-for-inactive) data, so the inner path can treat
|
||||
// every shard as compute.
|
||||
bool inner_compute[GGML_CUDA_MAX_DEVICES];
|
||||
for (int i = 0; i < n; ++i) {
|
||||
inner_compute[i] = use_bf16 ? true : compute_flag[i];
|
||||
}
|
||||
|
||||
// Dispatch into copy_impl with explicit src/dst types. When use_bf16
|
||||
// is on, the wire type is BF16 (src = bf16_tmp) and the accumulator
|
||||
// is F32 (dst = tensors[i]->data); the combined add kernel rounds dst
|
||||
// through BF16 for bit-equivalence and writes F32 directly, so no
|
||||
// post-conversion is needed. Otherwise src == dst (same native type).
|
||||
if (use_bf16) {
|
||||
GGML_ASSERT(kernel_type == GGML_TYPE_BF16);
|
||||
nv_bfloat16 * src[GGML_CUDA_MAX_DEVICES] = {};
|
||||
float * dst[GGML_CUDA_MAX_DEVICES] = {};
|
||||
for (int i = 0; i < n; ++i) {
|
||||
src[i] = static_cast<nv_bfloat16 *>(copy_src_ptr[i]);
|
||||
dst[i] = static_cast<float *>(tensors[i]->data);
|
||||
}
|
||||
ok = ggml_cuda_ar_allreduce_copy_outer<nv_bfloat16, float>(
|
||||
p, backends, src, dst, inner_compute, ne);
|
||||
} else {
|
||||
switch (kernel_type) {
|
||||
case GGML_TYPE_F32: {
|
||||
float * buf[GGML_CUDA_MAX_DEVICES] = {};
|
||||
for (int i = 0; i < n; ++i) {
|
||||
buf[i] = static_cast<float *>(tensors[i]->data);
|
||||
}
|
||||
ok = ggml_cuda_ar_allreduce_copy_outer<float, float>(
|
||||
p, backends, buf, buf, inner_compute, ne);
|
||||
break;
|
||||
}
|
||||
case GGML_TYPE_BF16: {
|
||||
nv_bfloat16 * buf[GGML_CUDA_MAX_DEVICES] = {};
|
||||
for (int i = 0; i < n; ++i) {
|
||||
buf[i] = static_cast<nv_bfloat16 *>(tensors[i]->data);
|
||||
}
|
||||
ok = ggml_cuda_ar_allreduce_copy_outer<nv_bfloat16, nv_bfloat16>(
|
||||
p, backends, buf, buf, inner_compute, ne);
|
||||
break;
|
||||
}
|
||||
case GGML_TYPE_F16: {
|
||||
half * buf[GGML_CUDA_MAX_DEVICES] = {};
|
||||
for (int i = 0; i < n; ++i) {
|
||||
buf[i] = static_cast<half *>(tensors[i]->data);
|
||||
}
|
||||
ok = ggml_cuda_ar_allreduce_copy_outer<half, half>(
|
||||
p, backends, buf, buf, inner_compute, ne);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// host_buf carries T_wire-typed data; max_chunk_elems is the count that
|
||||
// fits in one host_buf at the wire size.
|
||||
const size_t max_chunk_elems = p->buf_bytes / type_size;
|
||||
const size_t input_type_size = ggml_type_size(input_type);
|
||||
|
||||
// Chunked kernel path runs entirely on the caller's compute stream:
|
||||
// since AR is a barrier here, same-stream ordering subsumes any
|
||||
// cross-stream event handshake that the copy-engine path needs, and
|
||||
// skips the cross-stream scheduling overhead that was hurting the
|
||||
// small-tensor (tg) latency on the AR-stream variant. Only ev.ker is
|
||||
// still recorded at end-of-AR for acquire_slot's pool-wraparound check.
|
||||
for (int64_t chunk_start = 0; chunk_start < ne; chunk_start += (int64_t) max_chunk_elems) {
|
||||
const size_t remaining_elems = (size_t) (ne - chunk_start);
|
||||
const size_t chunk_elems = remaining_elems < max_chunk_elems ? remaining_elems : max_chunk_elems;
|
||||
const size_t chunk_dst_bytes = chunk_elems * input_type_size;
|
||||
|
||||
const auto [slot, token] = ggml_cuda_ar_acquire_slot(p);
|
||||
const bool last_chunk = chunk_start + (int64_t) chunk_elems == ne;
|
||||
|
||||
for (int i = 0; i < n; ++i) {
|
||||
const int peer = 1 - i; // valid for n == 2 only
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
|
||||
GGML_ASSERT(cuda_ctx->device == p->devices[i]);
|
||||
cudaStream_t stream = cuda_ctx->stream();
|
||||
|
||||
char * data = static_cast<char *>(tensors[i]->data) + chunk_start * (int64_t) input_type_size;
|
||||
|
||||
// Match NCCL/meta-backend semantics: inactive shards contribute
|
||||
// zeros. On the BF16 path the F32 tensor data was already
|
||||
// zeroed up-front (above), so per-chunk zeroing isn't needed.
|
||||
if (!compute_flag[i] && !use_bf16) {
|
||||
CUDA_CHECK(cudaMemsetAsync(data, 0, chunk_dst_bytes, stream));
|
||||
}
|
||||
|
||||
#define LAUNCH_AR_KERNEL(T_dst, T_wire) \
|
||||
ggml_cuda_ar_kernel<T_dst, T_wire><<<dim3(GGML_CUDA_AR_KERNEL_BLOCKS), dim3(256), 0, stream>>>( \
|
||||
reinterpret_cast<const T_dst *>(data), \
|
||||
reinterpret_cast<T_dst *>(data), \
|
||||
reinterpret_cast<T_wire *>(p->host_buf[i].dev + (size_t) slot * p->buf_bytes), \
|
||||
reinterpret_cast<const T_wire *>(p->host_buf[peer].dev + (size_t) slot * p->buf_bytes), \
|
||||
static_cast<int>(chunk_elems), \
|
||||
ggml_cuda_ar_arrival_ptr(p, slot, i), \
|
||||
ggml_cuda_ar_arrival_ptr(p, slot, peer), \
|
||||
token)
|
||||
|
||||
if (use_bf16) {
|
||||
GGML_ASSERT(input_type == GGML_TYPE_F32);
|
||||
LAUNCH_AR_KERNEL(float, nv_bfloat16);
|
||||
} else {
|
||||
switch (input_type) {
|
||||
case GGML_TYPE_F32: LAUNCH_AR_KERNEL(float, float); break;
|
||||
case GGML_TYPE_F16: LAUNCH_AR_KERNEL(half, half); break;
|
||||
case GGML_TYPE_BF16: LAUNCH_AR_KERNEL(nv_bfloat16, nv_bfloat16); break;
|
||||
default: GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
#undef LAUNCH_AR_KERNEL
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
if (last_chunk) {
|
||||
CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].ker, stream));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ok;
|
||||
}
|
||||
|
||||
#else // defined(GGML_USE_HIP) || defined(GGML_USE_MUSA)
|
||||
|
||||
// HIP and MUSA lack the host-mapped pinned-memory APIs (cudaHostAllocPortable
|
||||
// / cudaHostAllocMapped / cudaHostGetDevicePointer) and __nanosleep that this
|
||||
// implementation relies on, so the internal AllReduce is a CUDA-only feature.
|
||||
// The dispatcher in ggml-cuda.cu treats a nullptr pipeline as "init failed"
|
||||
// and silently falls back to the meta backend's generic AllReduce.
|
||||
ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(const int *, size_t) {
|
||||
return nullptr;
|
||||
}
|
||||
void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline *) {
|
||||
}
|
||||
bool ggml_cuda_ar_allreduce(ggml_cuda_ar_pipeline *, ggml_backend_t *, ggml_tensor **) {
|
||||
return false;
|
||||
}
|
||||
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
29
ggml/src/ggml-cuda/allreduce.cuh
Normal file
29
ggml/src/ggml-cuda/allreduce.cuh
Normal file
@@ -0,0 +1,29 @@
|
||||
#pragma once
|
||||
|
||||
#include "common.cuh"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
// Opaque pipeline context -- owns all pinned buffers, streams, and events.
|
||||
struct ggml_cuda_ar_pipeline;
|
||||
|
||||
// Allocate a pipeline for n_devices GPUs.
|
||||
// devices[] holds the CUDA device IDs in rank order.
|
||||
// Returns nullptr on allocation failure.
|
||||
ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(
|
||||
const int * devices, size_t n_devices);
|
||||
|
||||
// Release all resources owned by the pipeline.
|
||||
void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * pipeline);
|
||||
|
||||
// Execute an in-place AllReduce (sum) across tensors[0..n_devices-1].
|
||||
// tensors[i] must live on the device managed by backends[i] and be
|
||||
// contiguous F32, F16, or BF16.
|
||||
// Preconditions are checked by the CUDA comm dispatcher before calling this.
|
||||
// Returns true once the reduction work has been enqueued successfully.
|
||||
bool ggml_cuda_ar_allreduce(
|
||||
ggml_cuda_ar_pipeline * pipeline,
|
||||
ggml_backend_t * backends,
|
||||
ggml_tensor ** tensors);
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
# include <cub/cub.cuh>
|
||||
# if (CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 1)
|
||||
# define STRIDED_ITERATOR_AVAILABLE
|
||||
# include <cuda/iterator>
|
||||
# endif
|
||||
using namespace cub;
|
||||
#endif // GGML_CUDA_USE_CUB
|
||||
|
||||
@@ -61,6 +61,11 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 32, 128, 2, 64, 64, 64, 64, 2, true);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 64, 128, 2, 64, 64, 64, 64, 2, true);
|
||||
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 8, 64, 4, 64, 96, 64, 64, 2, true);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 16, 64, 4, 32, 96, 64, 64, 2, true);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 32, 128, 2, 32, 96, 64, 64, 2, true);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 64, 128, 2, 32, 96, 64, 64, 2, true);
|
||||
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 8, 64, 4, 64, 128, 128, 128, 2, true);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 16, 64, 4, 32, 128, 128, 128, 2, true);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2, 32, 128, 128, 128, 2, true);
|
||||
@@ -1561,6 +1566,10 @@ static __global__ void flash_attn_ext_f16(
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
if (DKQ == 192 && ncols2 != 8 && ncols2 != 16) {
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
#ifdef VOLTA_MMA_AVAILABLE
|
||||
if (ncols1*ncols2 < 32) {
|
||||
NO_DEVICE_CODE;
|
||||
|
||||
@@ -34,6 +34,10 @@ void ggml_cuda_flash_attn_ext_tile(ggml_backend_cuda_context & ctx, ggml_tensor
|
||||
GGML_ASSERT(V->ne[0] == K->ne[0]);
|
||||
ggml_cuda_flash_attn_ext_tile_case<128, 128>(ctx, dst);
|
||||
} break;
|
||||
case 192: {
|
||||
GGML_ASSERT(V->ne[0] == 128);
|
||||
ggml_cuda_flash_attn_ext_tile_case<192, 128>(ctx, dst);
|
||||
} break;
|
||||
case 256: {
|
||||
GGML_ASSERT(V->ne[0] == K->ne[0]);
|
||||
ggml_cuda_flash_attn_ext_tile_case<256, 256>(ctx, dst);
|
||||
|
||||
@@ -62,6 +62,12 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 16, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 2, 64, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 2, 64, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 4, 128, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 8, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 16, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 32, 256, 2, 64, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 2, 64, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 4, 128, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 8, 256, 2, 64, 64)
|
||||
@@ -124,6 +130,12 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 16, 128, 3, 32, 128)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 2, 64, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 2, 128, 3, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 4, 128, 3, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 8, 256, 2, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 16, 256, 2, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 32, 256, 2, 32, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 2, 128, 3, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 4, 128, 3, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 8, 256, 2, 32, 256)
|
||||
@@ -193,6 +205,12 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 64, 256, 2, 64, 32)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 2, 256, 2, 128, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 4, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 8, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 16, 256, 2, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 32, 256, 2, 32, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 2, 256, 2, 128, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 4, 256, 2, 64, 128)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 8, 256, 2, 64, 128)
|
||||
@@ -264,6 +282,12 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 3, 128, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 64, 256, 3, 64, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 2, 64, 8, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 4, 128, 6, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 8, 128, 6, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 16, 256, 5, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 32, 256, 3, 64, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 2, 64, 8, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 4, 128, 6, 32, 256)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 8, 128, 6, 32, 256)
|
||||
@@ -1250,7 +1274,20 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr (DKQ <= 512 && DKQ != 320) {
|
||||
if constexpr (DKQ == 192) {
|
||||
// MiMo-V2.5 / V2.5-Pro / V2-Flash: gqa_ratio is 8 (SWA) or 16 (full attn)
|
||||
if (use_gqa_opt && gqa_ratio % 16 == 0) {
|
||||
launch_fattn_tile_switch_ncols1<DKQ, DV, 16, use_logit_softcap>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
if (use_gqa_opt && gqa_ratio % 8 == 0) {
|
||||
launch_fattn_tile_switch_ncols1<DKQ, DV, 8, use_logit_softcap>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
GGML_ABORT("flash-attn tile (192/128): expected GQA ratio multiple of 8");
|
||||
}
|
||||
|
||||
if constexpr (DKQ <= 512 && DKQ != 320 && DKQ != 192) {
|
||||
if (use_gqa_opt && gqa_ratio % 8 == 0) {
|
||||
launch_fattn_tile_switch_ncols1<DKQ, DV, 8, use_logit_softcap>(ctx, dst);
|
||||
return;
|
||||
@@ -1303,6 +1340,7 @@ extern DECL_FATTN_TILE_CASE( 80, 80);
|
||||
extern DECL_FATTN_TILE_CASE( 96, 96);
|
||||
extern DECL_FATTN_TILE_CASE(112, 112);
|
||||
extern DECL_FATTN_TILE_CASE(128, 128);
|
||||
extern DECL_FATTN_TILE_CASE(192, 128);
|
||||
extern DECL_FATTN_TILE_CASE(256, 256);
|
||||
extern DECL_FATTN_TILE_CASE(320, 256);
|
||||
extern DECL_FATTN_TILE_CASE(512, 512);
|
||||
|
||||
@@ -139,6 +139,22 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
|
||||
GGML_ASSERT(V->ne[0] == 128);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<128, 128>(ctx, dst);
|
||||
break;
|
||||
case 192: {
|
||||
// MiMo-V2.5 / V2.5-Pro / V2-Flash: gqa_ratio is 8 (SWA) or 16 (full attn)
|
||||
GGML_ASSERT(V->ne[0] == 128);
|
||||
float max_bias = 0.0f;
|
||||
memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));
|
||||
const bool use_gqa_opt = mask && max_bias == 0.0f;
|
||||
GGML_ASSERT(use_gqa_opt);
|
||||
GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
|
||||
const int gqa_ratio = Q->ne[2] / K->ne[2];
|
||||
if (gqa_ratio % 16 == 0) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<192, 128, 16>(ctx, dst);
|
||||
} else {
|
||||
GGML_ASSERT(gqa_ratio % 8 == 0);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<192, 128, 8>(ctx, dst);
|
||||
}
|
||||
} break;
|
||||
case 256:
|
||||
GGML_ASSERT(V->ne[0] == 256);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<256, 256>(ctx, dst);
|
||||
@@ -368,6 +384,14 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
}
|
||||
break;
|
||||
case 192:
|
||||
if (V->ne[0] != 128 || !gqa_opt_applies) {
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
}
|
||||
if (gqa_ratio % 8 != 0) {
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
}
|
||||
break;
|
||||
case 320:
|
||||
if (V->ne[0] != 256 || !gqa_opt_applies) {
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
@@ -425,7 +449,8 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
|
||||
}
|
||||
|
||||
// For small batch sizes the vector kernel may be preferable over the kernels optimized for large batch sizes:
|
||||
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && K->ne[1] % FATTN_KQ_STRIDE == 0;
|
||||
// 192 satisfies % 64 == 0 but has no vec instance (DKQ != DV); force it onto the MMA path.
|
||||
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && Q->ne[0] != 192 && K->ne[1] % FATTN_KQ_STRIDE == 0;
|
||||
|
||||
// If Turing tensor cores are available, use them:
|
||||
if (turing_mma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72) {
|
||||
@@ -454,7 +479,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
|
||||
|
||||
if (volta_mma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72) {
|
||||
int gqa_ratio_eff = 1;
|
||||
const int ncols2_max = Q->ne[0] == 576 ? 16 : 8;
|
||||
const int ncols2_max = (Q->ne[0] == 576 || Q->ne[0] == 192) ? 16 : 8;
|
||||
while (gqa_ratio % (2*gqa_ratio_eff) == 0 && gqa_ratio_eff < ncols2_max) {
|
||||
gqa_ratio_eff *= 2;
|
||||
}
|
||||
@@ -468,7 +493,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
|
||||
}
|
||||
|
||||
// Use the WMMA kernel if possible:
|
||||
if (ggml_cuda_should_use_wmma_fattn(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 512 && Q->ne[0] != 576) {
|
||||
if (ggml_cuda_should_use_wmma_fattn(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 192 && Q->ne[0] != 512 && Q->ne[0] != 576) {
|
||||
if (can_use_vector_kernel && Q->ne[1] <= 2) {
|
||||
return BEST_FATTN_KERNEL_VEC;
|
||||
}
|
||||
@@ -501,7 +526,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
|
||||
}
|
||||
|
||||
// Use MFMA flash attention for CDNA (MI100+):
|
||||
if (amd_mfma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 256 && Q->ne[0] != 512 && Q->ne[0] != 576) {
|
||||
if (amd_mfma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 192 && Q->ne[0] != 256 && Q->ne[0] != 512 && Q->ne[0] != 576) {
|
||||
const int64_t eff_nq = Q->ne[1] * (gqa_opt_applies ? gqa_ratio : 1);
|
||||
// MMA vs tile crossover benchmarked on MI300X @ d32768:
|
||||
// hsk=64 (gqa=4): MMA wins at eff >= 128 (+11%)
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include "ggml-cuda/allreduce.cuh"
|
||||
#include "ggml-cuda/common.cuh"
|
||||
#include "ggml-cuda/acc.cuh"
|
||||
#include "ggml-cuda/add-id.cuh"
|
||||
@@ -86,6 +87,9 @@
|
||||
|
||||
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
||||
|
||||
#define GGML_LOG_WARN_ONCE(str) \
|
||||
{ static std::once_flag warn_flag; std::call_once(warn_flag, []() { GGML_LOG_WARN(str); }); }
|
||||
|
||||
[[noreturn]]
|
||||
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
|
||||
int id = -1; // in case cudaGetDevice fails
|
||||
@@ -1139,70 +1143,46 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
|
||||
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
||||
};
|
||||
|
||||
#ifdef GGML_USE_NCCL
|
||||
// Communication context for multi-GPU AllReduce during tensor parallelism.
|
||||
//
|
||||
// Created once per meta backend instance. Resources for the selected mode
|
||||
// (NCCL communicators or the internal AllReduce pipeline) are initialised
|
||||
// eagerly during comm_init so any init failure surfaces at startup rather
|
||||
// than mid-run.
|
||||
struct ggml_backend_cuda_comm_context {
|
||||
using try_allreduce_fn = bool(*)(ggml_backend_cuda_comm_context *, struct ggml_tensor **);
|
||||
|
||||
std::vector<ggml_backend_t> backends;
|
||||
std::vector<ncclComm_t> comms;
|
||||
std::vector<int> dev_ids;
|
||||
|
||||
// Set by the init chain (comm_init_{nccl, internal, none}) to one of
|
||||
// try_allreduce_{nccl, internal, butterfly}. nccl needs `comms`,
|
||||
// internal needs `ar_pipeline`, butterfly needs nothing. Per-call
|
||||
// failures return false; the meta backend's generic implementation then
|
||||
// handles that call.
|
||||
try_allreduce_fn try_allreduce = nullptr;
|
||||
|
||||
ggml_cuda_ar_pipeline * ar_pipeline = nullptr;
|
||||
|
||||
#ifdef GGML_USE_NCCL
|
||||
std::vector<ncclComm_t> comms;
|
||||
#endif // GGML_USE_NCCL
|
||||
|
||||
~ggml_backend_cuda_comm_context() {
|
||||
#ifdef GGML_USE_NCCL
|
||||
for (ncclComm_t comm : comms) {
|
||||
NCCL_CHECK(ncclCommDestroy(comm));
|
||||
}
|
||||
#endif // GGML_USE_NCCL
|
||||
ggml_cuda_ar_pipeline_free(ar_pipeline);
|
||||
}
|
||||
};
|
||||
#endif // GGML_USE_NCCL
|
||||
|
||||
static void ggml_backend_cuda_comm_free(void * comm_ctx_v) {
|
||||
#ifdef GGML_USE_NCCL
|
||||
if (comm_ctx_v == nullptr) {
|
||||
return;
|
||||
}
|
||||
ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
|
||||
delete comm_ctx;
|
||||
#else
|
||||
GGML_UNUSED(comm_ctx_v);
|
||||
#endif // GGML_USE_NCCL
|
||||
}
|
||||
|
||||
static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) {
|
||||
#ifdef GGML_USE_NCCL
|
||||
for (size_t i = 0; i < n_backends; i++) {
|
||||
if (!ggml_backend_is_cuda(backends[i])) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
ggml_backend_cuda_comm_context * ret = new ggml_backend_cuda_comm_context;
|
||||
std::vector<int> dev_ids;
|
||||
ret->backends.reserve(n_backends);
|
||||
dev_ids.reserve(n_backends);
|
||||
for (size_t i = 0; i < n_backends; i++) {
|
||||
ret->backends.push_back(backends[i]);
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
|
||||
dev_ids.push_back(cuda_ctx->device);
|
||||
}
|
||||
|
||||
ret->comms.resize(n_backends);
|
||||
NCCL_CHECK(ncclCommInitAll(ret->comms.data(), n_backends, dev_ids.data()));
|
||||
return ret;
|
||||
#else
|
||||
// If NCCL is installed it is used by default for optimal performance.
|
||||
// However, NVIDIA does not distribute NCCL with CUDA so users may be unwittingly missing this package.
|
||||
// RCCL is disabled by default, users are explicitly opting in.
|
||||
// Therefore print no warning for RCCL.
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
static bool warning_printed = false;
|
||||
if (!warning_printed) {
|
||||
GGML_LOG_WARN("%s: NVIDIA Collective Communications Library (NCCL) is unavailable, multi GPU performance will be suboptimal\n", __func__);
|
||||
warning_printed = true;
|
||||
}
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
GGML_UNUSED_VARS(backends, n_backends);
|
||||
return nullptr;
|
||||
#endif // GGML_USE_NCCL
|
||||
}
|
||||
|
||||
static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) {
|
||||
#ifdef GGML_USE_NCCL
|
||||
// AllReduce via NCCL. Reduces as FP32 for small tensors and BF16 for large
|
||||
// tensors (bandwidth-bound), then converts back to FP32.
|
||||
static bool ggml_backend_cuda_comm_allreduce_nccl(
|
||||
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
|
||||
const int64_t ne = ggml_nelements(tensors[0]);
|
||||
// FIXME the input of llm_graph_context::build_in_out_ids can produce a tensor with 0 elements if n_outputs == 0
|
||||
// This then causes a crash in this function
|
||||
@@ -1210,8 +1190,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
|
||||
return true;
|
||||
}
|
||||
|
||||
GGML_ASSERT(comm_ctx_v != nullptr);
|
||||
ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
|
||||
const size_t n_backends = comm_ctx->backends.size();
|
||||
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
@@ -1236,7 +1214,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
|
||||
NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, comm_ctx->comms[i], cuda_ctx->stream()));
|
||||
}
|
||||
NCCL_CHECK(ncclGroupEnd());
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1275,10 +1252,184 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
|
||||
}
|
||||
|
||||
return true;
|
||||
#else
|
||||
GGML_UNUSED_VARS(comm_ctx_v, tensors);
|
||||
return false;
|
||||
}
|
||||
#endif // GGML_USE_NCCL
|
||||
|
||||
// Run the internal AR pipeline. Returns false on unsupported / failed input
|
||||
// -- the caller decides whether to abort (env-forced) or fall back silently.
|
||||
static bool ggml_backend_cuda_comm_allreduce_internal(
|
||||
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
|
||||
GGML_ASSERT(comm_ctx->ar_pipeline != nullptr);
|
||||
|
||||
const size_t n_backends = comm_ctx->backends.size();
|
||||
GGML_ASSERT(n_backends == 2);
|
||||
GGML_ASSERT(tensors[0] != nullptr);
|
||||
|
||||
const int64_t ne = ggml_nelements(tensors[0]);
|
||||
const ggml_type type = tensors[0]->type;
|
||||
|
||||
if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16 && type != GGML_TYPE_BF16) {
|
||||
GGML_LOG_DEBUG("%s: internal unsupported: type=%d\n", __func__, (int) type);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (ne == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
if (tensors[i] == nullptr) {
|
||||
GGML_LOG_ERROR("%s: internal failed: tensor[%zu] is null\n", __func__, i);
|
||||
return false;
|
||||
}
|
||||
if (ggml_nelements(tensors[i]) != ne || tensors[i]->type != type) {
|
||||
GGML_LOG_ERROR("%s: internal failed: tensor[%zu] ne=%" PRId64 " type=%d expected ne=%" PRId64 " type=%d\n",
|
||||
__func__, i, ggml_nelements(tensors[i]), (int) tensors[i]->type, ne, (int) type);
|
||||
return false;
|
||||
}
|
||||
if (!ggml_is_contiguously_allocated(tensors[i])) {
|
||||
GGML_LOG_DEBUG("%s: internal unsupported: tensor[%zu] is not contiguously allocated: ne=%" PRId64 " nbytes=%zu packed=%zu type=%d\n",
|
||||
__func__, i, ne, ggml_nbytes(tensors[i]),
|
||||
(size_t) ne * ggml_type_size(type) / ggml_blck_size(type), (int) type);
|
||||
return false;
|
||||
}
|
||||
if (((uintptr_t) tensors[i]->data & 0xF) != 0) {
|
||||
GGML_LOG_DEBUG("%s: internal unsupported: tensor[%zu] data pointer is not 16-byte aligned: %p type=%d ne=%" PRId64 "\n",
|
||||
__func__, i, tensors[i]->data, (int) type, ne);
|
||||
return false;
|
||||
}
|
||||
GGML_ASSERT((ggml_nbytes(tensors[i]) & 0xF) == 0);
|
||||
}
|
||||
|
||||
return ggml_cuda_ar_allreduce(comm_ctx->ar_pipeline, comm_ctx->backends.data(), tensors);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Per-call dispatch -- three variants, one per backend. Each is set as
|
||||
// comm_ctx->try_allreduce by the matching init step. Per-call failure
|
||||
// returns false; the meta backend's generic implementation handles that call.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#ifdef GGML_USE_NCCL
|
||||
static bool ggml_backend_cuda_comm_try_allreduce_nccl(
|
||||
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
|
||||
return ggml_backend_cuda_comm_allreduce_nccl(comm_ctx, tensors);
|
||||
}
|
||||
#endif // GGML_USE_NCCL
|
||||
|
||||
static bool ggml_backend_cuda_comm_try_allreduce_internal(
|
||||
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
|
||||
return ggml_backend_cuda_comm_allreduce_internal(comm_ctx, tensors);
|
||||
}
|
||||
|
||||
static bool ggml_backend_cuda_comm_try_allreduce_butterfly(
|
||||
ggml_backend_cuda_comm_context *, struct ggml_tensor **) {
|
||||
return false;
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_comm_free(void * comm_ctx_v) {
|
||||
if (comm_ctx_v == nullptr) {
|
||||
return;
|
||||
}
|
||||
delete static_cast<ggml_backend_cuda_comm_context *>(comm_ctx_v);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Init -- chained nccl -> internal -> none. Each step tries to bring up its
|
||||
// resource; on failure it warns and recurses into the next step.
|
||||
// ---------------------------------------------------------------------------
|
||||
static void ggml_backend_cuda_comm_init_none(ggml_backend_cuda_comm_context * ret) {
|
||||
ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_butterfly;
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_comm_init_internal(ggml_backend_cuda_comm_context * ret) {
|
||||
ret->ar_pipeline = ggml_cuda_ar_pipeline_init(ret->dev_ids.data(), ret->dev_ids.size());
|
||||
if (ret->ar_pipeline) {
|
||||
ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_internal;
|
||||
return;
|
||||
}
|
||||
|
||||
// Clear sticky CUDA error from the failed init.
|
||||
(void) cudaGetLastError();
|
||||
GGML_LOG_WARN("internal AllReduce init failed (n_devices != 2?); "
|
||||
"falling back to meta-backend butterfly\n");
|
||||
ggml_backend_cuda_comm_init_none(ret);
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_comm_init_nccl(ggml_backend_cuda_comm_context * ret) {
|
||||
#ifdef GGML_USE_NCCL
|
||||
const size_t n = ret->dev_ids.size();
|
||||
ret->comms.resize(n);
|
||||
ncclResult_t rc = ncclCommInitAll(ret->comms.data(), (int) n, ret->dev_ids.data());
|
||||
if (rc == ncclSuccess) {
|
||||
ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_nccl;
|
||||
return;
|
||||
}
|
||||
|
||||
ret->comms.clear();
|
||||
GGML_LOG_WARN("NCCL init failed (%s); falling back to internal AllReduce\n",
|
||||
ncclGetErrorString(rc));
|
||||
#else // GGML_USE_NCCL
|
||||
#ifndef GGML_USE_HIP
|
||||
GGML_LOG_WARN("NCCL not compiled in; falling back to internal AllReduce. "
|
||||
"Recompile with -DGGML_CUDA_NCCL=ON for best multi-GPU performance.\n");
|
||||
#endif // !GGML_USE_HIP
|
||||
#endif // GGML_USE_NCCL
|
||||
|
||||
ggml_backend_cuda_comm_init_internal(ret);
|
||||
}
|
||||
|
||||
// Top-level init. Picks one of the three init paths based on
|
||||
// GGML_CUDA_ALLREDUCE (or the platform default) and lets the chain handle
|
||||
// any fallback. Unrecognised env values warn and fall through to the
|
||||
// platform default.
|
||||
static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) {
|
||||
for (size_t i = 0; i < n_backends; i++) {
|
||||
if (!ggml_backend_is_cuda(backends[i])) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
auto * ret = new ggml_backend_cuda_comm_context;
|
||||
ret->backends.assign(backends, backends + n_backends);
|
||||
ret->dev_ids.reserve(n_backends);
|
||||
for (size_t i = 0; i < n_backends; i++) {
|
||||
ret->dev_ids.push_back(static_cast<ggml_backend_cuda_context *>(backends[i]->context)->device);
|
||||
}
|
||||
|
||||
const char * env = getenv("GGML_CUDA_ALLREDUCE");
|
||||
if (!env) {
|
||||
// Platform default: Linux uses NCCL, otherwise (generally Windows) internal
|
||||
#if defined(__linux__)
|
||||
ggml_backend_cuda_comm_init_nccl(ret);
|
||||
#else
|
||||
ggml_backend_cuda_comm_init_internal(ret);
|
||||
#endif // defined(__linux__)
|
||||
} else {
|
||||
std::string env_str(env);
|
||||
if (env_str == "nccl") {
|
||||
ggml_backend_cuda_comm_init_nccl(ret);
|
||||
} else if (env_str == "internal") {
|
||||
ggml_backend_cuda_comm_init_internal(ret);
|
||||
} else if (env_str == "none") {
|
||||
ggml_backend_cuda_comm_init_none(ret);
|
||||
} else {
|
||||
GGML_LOG_WARN("unknown GGML_CUDA_ALLREDUCE value: %s\n", env);
|
||||
ggml_backend_cuda_comm_init_none(ret);
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Top-level dispatch -- calls the function pointer chosen by comm_init.
|
||||
// Returns false to let the meta-backend's butterfly run.
|
||||
static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) {
|
||||
if (comm_ctx_v == nullptr) {
|
||||
return false;
|
||||
}
|
||||
auto * comm_ctx = static_cast<ggml_backend_cuda_comm_context *>(comm_ctx_v);
|
||||
return comm_ctx->try_allreduce(comm_ctx, tensors);
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
|
||||
@@ -3778,10 +3929,25 @@ static int ggml_cuda_try_fuse(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph
|
||||
// closure check: the trailing add must read the same x as the leading mul
|
||||
const ggml_tensor * x_in_add = (add->src[0] == mul1) ? add->src[1] : add->src[0];
|
||||
|
||||
const bool type_ok = (x->type == GGML_TYPE_F32 || x->type == GGML_TYPE_F16 || x->type == GGML_TYPE_BF16);
|
||||
// Kernel iterates over total = T * C, so x and add must be 2D and
|
||||
// a / inv_b must collapse to [1, C, 1, 1]. Higher dims are not handled.
|
||||
const bool dim_ok = (x->ne[2] == 1 && x->ne[3] == 1) &&
|
||||
(add->ne[2] == 1 && add->ne[3] == 1) &&
|
||||
(a->ne[2] == 1 && a->ne[3] == 1);
|
||||
const bool shape_ok = ggml_are_same_shape(a, inv_b) && a->ne[0] == 1 && a->ne[1] == x->ne[1];
|
||||
|
||||
if (type_ok && shape_ok && x_in_add == x && add->type == x->type) {
|
||||
// x must be in the supported whitelist and every operand / intermediate
|
||||
// result must share x's type, since launch_snake casts a / inv_b as
|
||||
// float and templates the kernel on a single T. Mixed precision chains
|
||||
// fall back to the naive path.
|
||||
const ggml_tensor * sin1 = cgraph->nodes[i + 1];
|
||||
const bool types_ok = (x->type == GGML_TYPE_F32 || x->type == GGML_TYPE_F16 || x->type == GGML_TYPE_BF16) &&
|
||||
(a->type == x->type) && (inv_b->type == x->type) &&
|
||||
(mul0->type == x->type) && (sin1->type == x->type) &&
|
||||
(sqr->type == x->type) && (mul1->type == x->type) &&
|
||||
(add->type == x->type);
|
||||
|
||||
if (types_ok && shape_ok && dim_ok && x_in_add == x) {
|
||||
ggml_cuda_op_snake_fused(*cuda_ctx, x, a, inv_b, add);
|
||||
return 4;
|
||||
}
|
||||
@@ -5140,12 +5306,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_VIEW:
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_ADD_ID:
|
||||
case GGML_OP_ADD1:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SQRT:
|
||||
@@ -5154,6 +5316,13 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_LOG:
|
||||
return true;
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
|
||||
(op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16) &&
|
||||
(op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16);
|
||||
case GGML_OP_SSM_SCAN: {
|
||||
if (op->src[3]->ne[0] == 1) {
|
||||
// Mamba2
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#include "im2col.cuh"
|
||||
|
||||
#define MAX_GRIDDIM_Y 65535
|
||||
#define MAX_GRIDDIM_Z 65535
|
||||
|
||||
template <typename T>
|
||||
@@ -18,22 +19,23 @@ static __global__ void im2col_kernel(
|
||||
const int64_t ikh = rem / KW;
|
||||
const int64_t ikw = rem - ikh * KW;
|
||||
|
||||
const int64_t iow = blockIdx.y;
|
||||
for (int64_t iz = blockIdx.z; iz < N_OH; iz+=MAX_GRIDDIM_Z) {
|
||||
const int64_t in = iz / OH;
|
||||
const int64_t ioh = iz - in * OH;
|
||||
for (int64_t iow = blockIdx.y; iow < OW; iow += MAX_GRIDDIM_Y) {
|
||||
for (int64_t iz = blockIdx.z; iz < N_OH; iz += MAX_GRIDDIM_Z) {
|
||||
const int64_t in = iz / OH;
|
||||
const int64_t ioh = iz - in * OH;
|
||||
|
||||
const int64_t iiw = iow * s0 + ikw * d0 - p0;
|
||||
const int64_t iih = ioh * s1 + ikh * d1 - p1;
|
||||
const int64_t iiw = iow * s0 + ikw * d0 - p0;
|
||||
const int64_t iih = ioh * s1 + ikh * d1 - p1;
|
||||
|
||||
const int64_t offset_dst =
|
||||
((in * OH + ioh) * OW + iow) * IC_KH_KW + iic * KH_KW + ikh * KW + ikw;
|
||||
const int64_t offset_dst =
|
||||
((in * OH + ioh) * OW + iow) * IC_KH_KW + iic * KH_KW + ikh * KW + ikw;
|
||||
|
||||
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
||||
dst[offset_dst] = 0.0f;
|
||||
} else {
|
||||
const int64_t offset_src = iic * IC_IH_IW + in * IH_IW;
|
||||
dst[offset_dst] = x[offset_src + iih * IW + iiw];
|
||||
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
||||
dst[offset_dst] = 0.0f;
|
||||
} else {
|
||||
const int64_t offset_src = iic * IC_IH_IW + in * IH_IW;
|
||||
dst[offset_dst] = x[offset_src + iih * IW + iiw];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -51,7 +53,7 @@ static void im2col_cuda(const float * x, T* dst,
|
||||
const int64_t num_blocks = (IC_KH_KW + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
|
||||
const int64_t N_OH = N * OH;
|
||||
const int64_t KH_KW = KW*KH;
|
||||
dim3 block_nums(num_blocks, OW, MIN(N_OH, MAX_GRIDDIM_Z));
|
||||
dim3 block_nums(num_blocks, MIN(OW, MAX_GRIDDIM_Y), MIN(N_OH, MAX_GRIDDIM_Z));
|
||||
im2col_kernel<<<block_nums, MIN(IC_KH_KW, CUDA_IM2COL_BLOCK_SIZE) , 0, stream>>>(x, dst, IC, IW, IH, OH, OW, KW, KH,
|
||||
IC_IH_IW, IH_IW, N_OH, KH_KW, IC_KH_KW,
|
||||
s0, s1, p0, p1, d0, d1);
|
||||
@@ -136,23 +138,24 @@ static __global__ void im2col_3d_kernel(
|
||||
const int64_t ikh = (i - iic * KD_KH_KW - ikd * KH_KW) / KW;
|
||||
const int64_t ikw = i % KW;
|
||||
|
||||
const int64_t iow = blockIdx.y;
|
||||
for (int64_t iz = blockIdx.z; iz < N_OD_OH; iz+=MAX_GRIDDIM_Z) {
|
||||
const int64_t in = iz / OD_OH;
|
||||
const int64_t iod = (iz - in*OD_OH) / OH;
|
||||
const int64_t ioh = iz % OH;
|
||||
for (int64_t iow = blockIdx.y; iow < OW; iow += MAX_GRIDDIM_Y) {
|
||||
for (int64_t iz = blockIdx.z; iz < N_OD_OH; iz += MAX_GRIDDIM_Z) {
|
||||
const int64_t in = iz / OD_OH;
|
||||
const int64_t iod = (iz - in*OD_OH) / OH;
|
||||
const int64_t ioh = iz % OH;
|
||||
|
||||
const int64_t iiw = iow * s0 + ikw * d0 - p0;
|
||||
const int64_t iih = ioh * s1 + ikh * d1 - p1;
|
||||
const int64_t iid = iod * s2 + ikd * d2 - p2;
|
||||
const int64_t iiw = iow * s0 + ikw * d0 - p0;
|
||||
const int64_t iih = ioh * s1 + ikh * d1 - p1;
|
||||
const int64_t iid = iod * s2 + ikd * d2 - p2;
|
||||
|
||||
const int64_t offset_dst = in*OD_OH_OW_IC_KD_KH_KW + iod*OH_OW_IC_KD_KH_KW + ioh*OW_IC_KD_KH_KW + iow*IC_KD_KH_KW + iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw;
|
||||
const int64_t offset_dst = in*OD_OH_OW_IC_KD_KH_KW + iod*OH_OW_IC_KD_KH_KW + ioh*OW_IC_KD_KH_KW + iow*IC_KD_KH_KW + iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw;
|
||||
|
||||
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
|
||||
dst[offset_dst] = 0.0f;
|
||||
} else {
|
||||
const int64_t offset_src = ((in * IC + iic) * stride_q) + (iid * stride_z) + (iih * stride_y) + (iiw * stride_x);
|
||||
dst[offset_dst] = src[offset_src];
|
||||
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
|
||||
dst[offset_dst] = 0.0f;
|
||||
} else {
|
||||
const int64_t offset_src = ((in * IC + iic) * stride_q) + (iid * stride_z) + (iih * stride_y) + (iiw * stride_x);
|
||||
dst[offset_dst] = src[offset_src];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -178,7 +181,7 @@ static void im2col_3d_cuda(const float * src, T* dst,
|
||||
const int64_t OH_OW_IC_KD_KH_KW = OH*OW*IC*KD*KH*KW;
|
||||
const int64_t OW_IC_KD_KH_KW = OW*IC*KD*KH*KW;
|
||||
const int64_t num_blocks = (IC_KD_KH_KW + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
|
||||
dim3 block_nums(num_blocks, OW, MIN(N_OD_OH, MAX_GRIDDIM_Z));
|
||||
dim3 block_nums(num_blocks, MIN(OW, MAX_GRIDDIM_Y), MIN(N_OD_OH, MAX_GRIDDIM_Z));
|
||||
im2col_3d_kernel<<<block_nums, MIN(IC_KD_KH_KW, CUDA_IM2COL_BLOCK_SIZE) , 0, stream>>>(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW,
|
||||
OH_OW, KD_KH_KW, ID_IH_IW, KH_KW, IH_IW, IC_ID_IH_IW,
|
||||
IC_KD_KH_KW, OW_KD_KH_KW, OD_OH_OW_IC_KD_KH_KW,
|
||||
|
||||
@@ -2,4 +2,5 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(192, 128, 1, 16);
|
||||
DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);
|
||||
|
||||
@@ -7,5 +7,6 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(192, 128, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(512, 512, 1, 8);
|
||||
|
||||
@@ -2,4 +2,5 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(192, 128, 2, 16);
|
||||
DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);
|
||||
|
||||
@@ -7,5 +7,6 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(192, 128, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(512, 512, 2, 8);
|
||||
|
||||
@@ -2,4 +2,5 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(192, 128, 4, 16);
|
||||
DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);
|
||||
|
||||
@@ -7,5 +7,6 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(192, 128, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(512, 512, 4, 8);
|
||||
|
||||
@@ -7,5 +7,6 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(192, 128, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(512, 512, 8, 8);
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-tile.cuh"
|
||||
|
||||
DECL_FATTN_TILE_CASE(192, 128);
|
||||
@@ -3,7 +3,10 @@
|
||||
from glob import glob
|
||||
import os
|
||||
|
||||
HEAD_SIZES_KQ = [40, 64, 72, 80, 96, 112, 128, 256, 320, 512, 576]
|
||||
HEAD_SIZES_KQ = [40, 64, 72, 80, 96, 112, 128, 192, 256, 320, 512, 576]
|
||||
|
||||
# DKQ -> DV override for asymmetric head dims.
|
||||
HEAD_SIZES_V_OVERRIDE = {576: 512, 320: 256, 192: 128}
|
||||
|
||||
TYPES_KV = ["GGML_TYPE_F16", "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", "GGML_TYPE_BF16"]
|
||||
|
||||
@@ -62,7 +65,7 @@ for filename in glob("*.cu"):
|
||||
os.remove(filename)
|
||||
|
||||
for head_size_kq in HEAD_SIZES_KQ:
|
||||
head_size_v = 256 if head_size_kq == 320 else (head_size_kq if head_size_kq != 576 else 512)
|
||||
head_size_v = HEAD_SIZES_V_OVERRIDE.get(head_size_kq, head_size_kq)
|
||||
with open(f"fattn-tile-instance-dkq{head_size_kq}-dv{head_size_v}.cu", "w") as f:
|
||||
f.write(SOURCE_FATTN_TILE.format(head_size_kq=head_size_kq, head_size_v=head_size_v))
|
||||
|
||||
@@ -85,15 +88,17 @@ for ncols in [8, 16, 32, 64]:
|
||||
if head_size_kq == 72:
|
||||
continue
|
||||
# Skip compilation of unused ncols2 values for niche head sizes:
|
||||
if head_size_kq == 192 and ncols2 not in (8, 16): # MiMo-V2.5
|
||||
continue
|
||||
if head_size_kq == 320 and ncols2 != 32: # Mistral Small 4
|
||||
continue
|
||||
if head_size_kq == 512 and ncols2 not in (4, 8): # Gemma 4
|
||||
continue
|
||||
if head_size_kq == 576 and ncols2 not in (4, 16, 32): # Deepseek, GLM 4.7 Flash
|
||||
continue
|
||||
if head_size_kq not in (320, 576) and ncols2 in (16, 32):
|
||||
if head_size_kq not in (192, 320, 576) and ncols2 in (16, 32):
|
||||
continue
|
||||
head_size_v = 256 if head_size_kq == 320 else (head_size_kq if head_size_kq != 576 else 512)
|
||||
head_size_v = HEAD_SIZES_V_OVERRIDE.get(head_size_kq, head_size_kq)
|
||||
f.write(SOURCE_FATTN_MMA_CASE.format(ncols1=ncols1, ncols2=ncols2, head_size_kq=head_size_kq, head_size_v=head_size_v))
|
||||
|
||||
for type in TYPES_MMQ:
|
||||
|
||||
@@ -2261,6 +2261,58 @@ static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_sess
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_gated_delta_net(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
const struct ggml_tensor * q = op->src[0];
|
||||
const struct ggml_tensor * k = op->src[1];
|
||||
const struct ggml_tensor * v = op->src[2];
|
||||
const struct ggml_tensor * g = op->src[3];
|
||||
const struct ggml_tensor * beta = op->src[4];
|
||||
const struct ggml_tensor * state = op->src[5];
|
||||
const struct ggml_tensor * dst = op;
|
||||
|
||||
if (!q || !k || !v || !g || !beta || !state) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (q->type != GGML_TYPE_F32 || k->type != GGML_TYPE_F32 || v->type != GGML_TYPE_F32 ||
|
||||
g->type != GGML_TYPE_F32 || beta->type != GGML_TYPE_F32 || state->type != GGML_TYPE_F32 ||
|
||||
dst->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!ggml_is_contiguous_rows(q) || !ggml_is_contiguous_rows(k) || !ggml_is_contiguous_rows(v) ||
|
||||
!ggml_is_contiguous(g) || !ggml_is_contiguous(beta) || !ggml_is_contiguous(state) ||
|
||||
!ggml_is_contiguous(dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const int64_t S_v = v->ne[0];
|
||||
const int64_t H = v->ne[1];
|
||||
const int64_t n_tokens = v->ne[2];
|
||||
const int64_t n_seqs = v->ne[3];
|
||||
|
||||
if (S_v <= 0 || S_v > 128 || H <= 0 || n_tokens <= 0 || n_seqs <= 0) {
|
||||
return false;
|
||||
}
|
||||
if (q->ne[0] != S_v || k->ne[0] != S_v || q->ne[1] <= 0 || k->ne[1] <= 0 ||
|
||||
q->ne[2] != n_tokens || k->ne[2] != n_tokens || q->ne[3] <= 0 || k->ne[3] <= 0 ||
|
||||
(n_seqs % q->ne[3]) != 0 || (n_seqs % k->ne[3]) != 0) {
|
||||
return false;
|
||||
}
|
||||
if ((g->ne[0] != 1 && g->ne[0] != S_v) || beta->ne[0] != 1) {
|
||||
return false;
|
||||
}
|
||||
if (ggml_nelements(state) != S_v * S_v * H * n_seqs) {
|
||||
return false;
|
||||
}
|
||||
if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs) {
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_UNUSED(sess);
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
|
||||
const struct ggml_tensor * src0 = dst->src[0];
|
||||
const struct ggml_tensor * src1 = dst->src[1];
|
||||
@@ -2420,8 +2472,8 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
|
||||
return false;
|
||||
}
|
||||
|
||||
// TODO: add support for non-contiguous elements within a row
|
||||
if (!ggml_is_contiguous_rows(src0) || !ggml_is_contiguous_rows(dst)) {
|
||||
// dst must be contiguous; src0 may be non-contiguous
|
||||
if (!ggml_is_contiguous(dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -2777,32 +2829,34 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) {
|
||||
|
||||
static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
|
||||
switch (t->op) {
|
||||
case GGML_OP_FLASH_ATTN_EXT: return HTP_OP_FLASH_ATTN_EXT;
|
||||
case GGML_OP_MUL_MAT: return HTP_OP_MUL_MAT;
|
||||
case GGML_OP_MUL_MAT_ID: return HTP_OP_MUL_MAT_ID;
|
||||
case GGML_OP_MUL: return HTP_OP_MUL;
|
||||
case GGML_OP_ADD: return HTP_OP_ADD;
|
||||
case GGML_OP_ADD_ID: return HTP_OP_ADD_ID;
|
||||
case GGML_OP_SUB: return HTP_OP_SUB;
|
||||
case GGML_OP_DIV: return HTP_OP_DIV;
|
||||
case GGML_OP_CPY: return HTP_OP_CPY;
|
||||
case GGML_OP_CONT: return HTP_OP_CPY;
|
||||
case GGML_OP_GET_ROWS: return HTP_OP_GET_ROWS;
|
||||
case GGML_OP_SET_ROWS: return HTP_OP_SET_ROWS;
|
||||
case GGML_OP_SUM_ROWS: return HTP_OP_SUM_ROWS;
|
||||
case GGML_OP_ARGSORT: return HTP_OP_ARGSORT;
|
||||
case GGML_OP_RMS_NORM: return HTP_OP_RMS_NORM;
|
||||
case GGML_OP_SCALE: return HTP_OP_SCALE;
|
||||
case GGML_OP_SQR: return HTP_OP_SQR;
|
||||
case GGML_OP_SQRT: return HTP_OP_SQRT;
|
||||
case GGML_OP_SOFT_MAX: return HTP_OP_SOFTMAX;
|
||||
case GGML_OP_SSM_CONV: return HTP_OP_SSM_CONV;
|
||||
case GGML_OP_ROPE: return HTP_OP_ROPE;
|
||||
case GGML_OP_REPEAT: return HTP_OP_REPEAT;
|
||||
case GGML_OP_CUMSUM: return HTP_OP_CUMSUM;
|
||||
case GGML_OP_FILL: return HTP_OP_FILL;
|
||||
case GGML_OP_DIAG: return HTP_OP_DIAG;
|
||||
case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI;
|
||||
case GGML_OP_FLASH_ATTN_EXT: return HTP_OP_FLASH_ATTN_EXT;
|
||||
case GGML_OP_MUL_MAT: return HTP_OP_MUL_MAT;
|
||||
case GGML_OP_MUL_MAT_ID: return HTP_OP_MUL_MAT_ID;
|
||||
case GGML_OP_MUL: return HTP_OP_MUL;
|
||||
case GGML_OP_ADD: return HTP_OP_ADD;
|
||||
case GGML_OP_ADD_ID: return HTP_OP_ADD_ID;
|
||||
case GGML_OP_SUB: return HTP_OP_SUB;
|
||||
case GGML_OP_DIV: return HTP_OP_DIV;
|
||||
case GGML_OP_CPY: return HTP_OP_CPY;
|
||||
case GGML_OP_CONT: return HTP_OP_CPY;
|
||||
case GGML_OP_GET_ROWS: return HTP_OP_GET_ROWS;
|
||||
case GGML_OP_SET_ROWS: return HTP_OP_SET_ROWS;
|
||||
case GGML_OP_SUM_ROWS: return HTP_OP_SUM_ROWS;
|
||||
case GGML_OP_ARGSORT: return HTP_OP_ARGSORT;
|
||||
case GGML_OP_L2_NORM: return HTP_OP_L2_NORM;
|
||||
case GGML_OP_RMS_NORM: return HTP_OP_RMS_NORM;
|
||||
case GGML_OP_SCALE: return HTP_OP_SCALE;
|
||||
case GGML_OP_SQR: return HTP_OP_SQR;
|
||||
case GGML_OP_SQRT: return HTP_OP_SQRT;
|
||||
case GGML_OP_SOFT_MAX: return HTP_OP_SOFTMAX;
|
||||
case GGML_OP_SSM_CONV: return HTP_OP_SSM_CONV;
|
||||
case GGML_OP_GATED_DELTA_NET: return HTP_OP_GATED_DELTA_NET;
|
||||
case GGML_OP_ROPE: return HTP_OP_ROPE;
|
||||
case GGML_OP_REPEAT: return HTP_OP_REPEAT;
|
||||
case GGML_OP_CUMSUM: return HTP_OP_CUMSUM;
|
||||
case GGML_OP_FILL: return HTP_OP_FILL;
|
||||
case GGML_OP_DIAG: return HTP_OP_DIAG;
|
||||
case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI;
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(t)) {
|
||||
case GGML_UNARY_OP_SILU: return HTP_OP_UNARY_SILU;
|
||||
@@ -2811,6 +2865,7 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
|
||||
case GGML_UNARY_OP_NEG: return HTP_OP_UNARY_NEG;
|
||||
case GGML_UNARY_OP_EXP: return HTP_OP_UNARY_EXP;
|
||||
case GGML_UNARY_OP_SOFTPLUS: return HTP_OP_UNARY_SOFTPLUS;
|
||||
case GGML_UNARY_OP_TANH: return HTP_OP_UNARY_TANH;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -3253,6 +3308,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
||||
supp = ggml_hexagon_supported_add_id(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_L2_NORM:
|
||||
supp = ggml_hexagon_supported_unary(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_SCALE:
|
||||
supp = ggml_hexagon_supported_unary(sess, op);
|
||||
@@ -3277,6 +3336,7 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
||||
case GGML_UNARY_OP_EXP:
|
||||
case GGML_UNARY_OP_SIGMOID:
|
||||
case GGML_UNARY_OP_SOFTPLUS:
|
||||
case GGML_UNARY_OP_TANH:
|
||||
supp = ggml_hexagon_supported_unary(sess, op);
|
||||
break;
|
||||
case GGML_UNARY_OP_SILU:
|
||||
@@ -3336,6 +3396,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
||||
supp = ggml_hexagon_supported_ssm_conv(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_GATED_DELTA_NET:
|
||||
supp = ggml_hexagon_supported_gated_delta_net(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_CUMSUM:
|
||||
supp = ggml_hexagon_supported_cumsum(sess, op);
|
||||
break;
|
||||
|
||||
@@ -37,6 +37,7 @@ add_library(${HTP_LIB} SHARED
|
||||
fill-ops.c
|
||||
diag-ops.c
|
||||
solve-tri-ops.c
|
||||
gated-delta-net-ops.c
|
||||
)
|
||||
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE
|
||||
|
||||
955
ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c
Normal file
955
ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c
Normal file
@@ -0,0 +1,955 @@
|
||||
#include <math.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "hvx-utils.h"
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
#include "htp-ctx.h"
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
#define HTP_GDN_MAX_SV 128
|
||||
|
||||
struct htp_gdn_context {
|
||||
struct htp_ops_context * octx;
|
||||
uint32_t rows_per_thread;
|
||||
size_t state_bytes;
|
||||
bool use_vtcm;
|
||||
uint8_t * vtcm_state_base;
|
||||
size_t vtcm_state_per_thread;
|
||||
};
|
||||
|
||||
static inline float gdn_mul_dot_f32(float * restrict dst, const float * restrict mul,
|
||||
const float * restrict dot, uint32_t n) {
|
||||
HVX_Vector acc = Q6_V_vzero();
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vd = hvx_vmemu(dst + i * epv);
|
||||
HVX_Vector vm = hvx_vmem(mul + i * epv);
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
HVX_Vector out = hvx_vec_mul_f32_f32(vd, vm);
|
||||
hvx_vmemu(dst + i * epv) = out;
|
||||
acc = hvx_vec_add_f32_f32(acc, hvx_vec_mul_f32_f32(out, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vd = hvx_vmemu(dst + off);
|
||||
HVX_Vector vm = hvx_vmem(mul + off);
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_Vector out = hvx_vec_mul_f32_f32(vd, vm);
|
||||
hvx_vec_store_u(dst + off, tail * sizeof(float), out);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_Vector prod = hvx_vec_mul_f32_f32(out, vdot);
|
||||
acc = hvx_vec_add_f32_f32(acc, Q6_V_vmux_QVV(mask, prod, Q6_V_vzero()));
|
||||
}
|
||||
|
||||
return hvx_vec_get_f32(hvx_vec_reduce_sum_f32(acc));
|
||||
}
|
||||
|
||||
static inline float gdn_mul_scalar_dot_f32(float * restrict dst, float mul,
|
||||
const float * restrict dot, uint32_t n) {
|
||||
HVX_Vector acc = Q6_V_vzero();
|
||||
const HVX_Vector vmul = hvx_vec_splat_f32(mul);
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vd = hvx_vmemu(dst + i * epv);
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
HVX_Vector out = hvx_vec_mul_f32_f32(vd, vmul);
|
||||
hvx_vmemu(dst + i * epv) = out;
|
||||
acc = hvx_vec_add_f32_f32(acc, hvx_vec_mul_f32_f32(out, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vd = hvx_vmemu(dst + off);
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_Vector out = hvx_vec_mul_f32_f32(vd, vmul);
|
||||
hvx_vec_store_u(dst + off, tail * sizeof(float), out);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_Vector prod = hvx_vec_mul_f32_f32(out, vdot);
|
||||
acc = hvx_vec_add_f32_f32(acc, Q6_V_vmux_QVV(mask, prod, Q6_V_vzero()));
|
||||
}
|
||||
|
||||
return hvx_vec_get_f32(hvx_vec_reduce_sum_f32(acc));
|
||||
}
|
||||
|
||||
static inline float gdn_add_scaled_dot_f32(float * restrict dst, const float * restrict src,
|
||||
float scale, const float * restrict dot, uint32_t n) {
|
||||
HVX_Vector acc = Q6_V_vzero();
|
||||
const HVX_Vector vscale = hvx_vec_splat_f32(scale);
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vd = hvx_vmemu(dst + i * epv);
|
||||
HVX_Vector vs = hvx_vmem(src + i * epv);
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
HVX_Vector out = hvx_vec_add_f32_f32(vd, hvx_vec_mul_f32_f32(vs, vscale));
|
||||
hvx_vmemu(dst + i * epv) = out;
|
||||
acc = hvx_vec_add_f32_f32(acc, hvx_vec_mul_f32_f32(out, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vd = hvx_vmemu(dst + off);
|
||||
HVX_Vector vs = hvx_vmem(src + off);
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_Vector out = hvx_vec_add_f32_f32(vd, hvx_vec_mul_f32_f32(vs, vscale));
|
||||
hvx_vec_store_u(dst + off, tail * sizeof(float), out);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_Vector prod = hvx_vec_mul_f32_f32(out, vdot);
|
||||
acc = hvx_vec_add_f32_f32(acc, Q6_V_vmux_QVV(mask, prod, Q6_V_vzero()));
|
||||
}
|
||||
|
||||
return hvx_vec_get_f32(hvx_vec_reduce_sum_f32(acc));
|
||||
}
|
||||
|
||||
static inline void gdn_mul_dot4_f32(float * restrict dst0, float * restrict dst1,
|
||||
float * restrict dst2, float * restrict dst3, const float * restrict mul,
|
||||
const float * restrict dot, uint32_t n, float * restrict sums) {
|
||||
HVX_Vector acc0 = Q6_V_vzero();
|
||||
HVX_Vector acc1 = Q6_V_vzero();
|
||||
HVX_Vector acc2 = Q6_V_vzero();
|
||||
HVX_Vector acc3 = Q6_V_vzero();
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vm = hvx_vmem(mul + i * epv);
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
|
||||
HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + i * epv), vm);
|
||||
HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + i * epv), vm);
|
||||
HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + i * epv), vm);
|
||||
HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + i * epv), vm);
|
||||
|
||||
hvx_vmemu(dst0 + i * epv) = out0;
|
||||
hvx_vmemu(dst1 + i * epv) = out1;
|
||||
hvx_vmemu(dst2 + i * epv) = out2;
|
||||
hvx_vmemu(dst3 + i * epv) = out3;
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vm = hvx_vmem(mul + off);
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_Vector zero = Q6_V_vzero();
|
||||
|
||||
HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vm);
|
||||
HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + off), vm);
|
||||
HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + off), vm);
|
||||
HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + off), vm);
|
||||
|
||||
hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
|
||||
}
|
||||
|
||||
HVX_Vector_x4 acc = { .v = { acc0, acc1, acc2, acc3 } };
|
||||
hvx_vec_store_u(sums, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(acc));
|
||||
}
|
||||
|
||||
static inline void gdn_mul_scalar_dot4_f32(float * restrict dst0, float * restrict dst1,
|
||||
float * restrict dst2, float * restrict dst3, float mul,
|
||||
const float * restrict dot, uint32_t n, float * restrict sums) {
|
||||
HVX_Vector acc0 = Q6_V_vzero();
|
||||
HVX_Vector acc1 = Q6_V_vzero();
|
||||
HVX_Vector acc2 = Q6_V_vzero();
|
||||
HVX_Vector acc3 = Q6_V_vzero();
|
||||
const HVX_Vector vmul = hvx_vec_splat_f32(mul);
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
|
||||
HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + i * epv), vmul);
|
||||
HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + i * epv), vmul);
|
||||
HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + i * epv), vmul);
|
||||
HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + i * epv), vmul);
|
||||
|
||||
hvx_vmemu(dst0 + i * epv) = out0;
|
||||
hvx_vmemu(dst1 + i * epv) = out1;
|
||||
hvx_vmemu(dst2 + i * epv) = out2;
|
||||
hvx_vmemu(dst3 + i * epv) = out3;
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_Vector zero = Q6_V_vzero();
|
||||
|
||||
HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vmul);
|
||||
HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + off), vmul);
|
||||
HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + off), vmul);
|
||||
HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + off), vmul);
|
||||
|
||||
hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
|
||||
}
|
||||
|
||||
HVX_Vector_x4 acc = { .v = { acc0, acc1, acc2, acc3 } };
|
||||
hvx_vec_store_u(sums, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(acc));
|
||||
}
|
||||
|
||||
static inline void gdn_add_scaled_dot4_f32(float * restrict dst0, float * restrict dst1,
|
||||
float * restrict dst2, float * restrict dst3, const float * restrict src,
|
||||
const float * restrict scale, const float * restrict dot, uint32_t n,
|
||||
float * restrict sums) {
|
||||
HVX_Vector acc0 = Q6_V_vzero();
|
||||
HVX_Vector acc1 = Q6_V_vzero();
|
||||
HVX_Vector acc2 = Q6_V_vzero();
|
||||
HVX_Vector acc3 = Q6_V_vzero();
|
||||
const HVX_Vector scale0 = hvx_vec_splat_f32(scale[0]);
|
||||
const HVX_Vector scale1 = hvx_vec_splat_f32(scale[1]);
|
||||
const HVX_Vector scale2 = hvx_vec_splat_f32(scale[2]);
|
||||
const HVX_Vector scale3 = hvx_vec_splat_f32(scale[3]);
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vs = hvx_vmem(src + i * epv);
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
|
||||
HVX_Vector out0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + i * epv), hvx_vec_mul_f32_f32(vs, scale0));
|
||||
HVX_Vector out1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + i * epv), hvx_vec_mul_f32_f32(vs, scale1));
|
||||
HVX_Vector out2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + i * epv), hvx_vec_mul_f32_f32(vs, scale2));
|
||||
HVX_Vector out3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + i * epv), hvx_vec_mul_f32_f32(vs, scale3));
|
||||
|
||||
hvx_vmemu(dst0 + i * epv) = out0;
|
||||
hvx_vmemu(dst1 + i * epv) = out1;
|
||||
hvx_vmemu(dst2 + i * epv) = out2;
|
||||
hvx_vmemu(dst3 + i * epv) = out3;
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vs = hvx_vmem(src + off);
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_Vector zero = Q6_V_vzero();
|
||||
|
||||
HVX_Vector out0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + off), hvx_vec_mul_f32_f32(vs, scale0));
|
||||
HVX_Vector out1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + off), hvx_vec_mul_f32_f32(vs, scale1));
|
||||
HVX_Vector out2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + off), hvx_vec_mul_f32_f32(vs, scale2));
|
||||
HVX_Vector out3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + off), hvx_vec_mul_f32_f32(vs, scale3));
|
||||
|
||||
hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
|
||||
}
|
||||
|
||||
HVX_Vector_x4 acc = { .v = { acc0, acc1, acc2, acc3 } };
|
||||
hvx_vec_store_u(sums, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(acc));
|
||||
}
|
||||
|
||||
static inline void gdn_mul_dot8_f32(float * restrict dst0, float * restrict dst1,
|
||||
float * restrict dst2, float * restrict dst3, float * restrict dst4,
|
||||
float * restrict dst5, float * restrict dst6, float * restrict dst7,
|
||||
const float * restrict mul, const float * restrict dot, uint32_t n,
|
||||
float * restrict sums) {
|
||||
HVX_Vector acc0 = Q6_V_vzero();
|
||||
HVX_Vector acc1 = Q6_V_vzero();
|
||||
HVX_Vector acc2 = Q6_V_vzero();
|
||||
HVX_Vector acc3 = Q6_V_vzero();
|
||||
HVX_Vector acc4 = Q6_V_vzero();
|
||||
HVX_Vector acc5 = Q6_V_vzero();
|
||||
HVX_Vector acc6 = Q6_V_vzero();
|
||||
HVX_Vector acc7 = Q6_V_vzero();
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vm = hvx_vmem(mul + i * epv);
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
|
||||
HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + i * epv), vm);
|
||||
HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + i * epv), vm);
|
||||
HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + i * epv), vm);
|
||||
HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + i * epv), vm);
|
||||
HVX_Vector out4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + i * epv), vm);
|
||||
HVX_Vector out5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + i * epv), vm);
|
||||
HVX_Vector out6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + i * epv), vm);
|
||||
HVX_Vector out7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + i * epv), vm);
|
||||
|
||||
hvx_vmemu(dst0 + i * epv) = out0;
|
||||
hvx_vmemu(dst1 + i * epv) = out1;
|
||||
hvx_vmemu(dst2 + i * epv) = out2;
|
||||
hvx_vmemu(dst3 + i * epv) = out3;
|
||||
hvx_vmemu(dst4 + i * epv) = out4;
|
||||
hvx_vmemu(dst5 + i * epv) = out5;
|
||||
hvx_vmemu(dst6 + i * epv) = out6;
|
||||
hvx_vmemu(dst7 + i * epv) = out7;
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
|
||||
acc4 = hvx_vec_add_f32_f32(acc4, hvx_vec_mul_f32_f32(out4, vdot));
|
||||
acc5 = hvx_vec_add_f32_f32(acc5, hvx_vec_mul_f32_f32(out5, vdot));
|
||||
acc6 = hvx_vec_add_f32_f32(acc6, hvx_vec_mul_f32_f32(out6, vdot));
|
||||
acc7 = hvx_vec_add_f32_f32(acc7, hvx_vec_mul_f32_f32(out7, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vm = hvx_vmem(mul + off);
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_Vector zero = Q6_V_vzero();
|
||||
|
||||
HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vm);
|
||||
HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + off), vm);
|
||||
HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + off), vm);
|
||||
HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + off), vm);
|
||||
HVX_Vector out4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + off), vm);
|
||||
HVX_Vector out5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + off), vm);
|
||||
HVX_Vector out6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + off), vm);
|
||||
HVX_Vector out7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + off), vm);
|
||||
|
||||
hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
|
||||
hvx_vec_store_u(dst4 + off, tail * sizeof(float), out4);
|
||||
hvx_vec_store_u(dst5 + off, tail * sizeof(float), out5);
|
||||
hvx_vec_store_u(dst6 + off, tail * sizeof(float), out6);
|
||||
hvx_vec_store_u(dst7 + off, tail * sizeof(float), out7);
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
|
||||
acc4 = hvx_vec_add_f32_f32(acc4, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out4, vdot), zero));
|
||||
acc5 = hvx_vec_add_f32_f32(acc5, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out5, vdot), zero));
|
||||
acc6 = hvx_vec_add_f32_f32(acc6, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out6, vdot), zero));
|
||||
acc7 = hvx_vec_add_f32_f32(acc7, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out7, vdot), zero));
|
||||
}
|
||||
|
||||
HVX_Vector_x4 accA = { .v = { acc0, acc1, acc2, acc3 } };
|
||||
HVX_Vector_x4 accB = { .v = { acc4, acc5, acc6, acc7 } };
|
||||
hvx_vec_store_u(sums + 0, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accA));
|
||||
hvx_vec_store_u(sums + 4, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accB));
|
||||
}
|
||||
|
||||
static inline void gdn_mul_scalar_dot8_f32(float * restrict dst0, float * restrict dst1,
|
||||
float * restrict dst2, float * restrict dst3, float * restrict dst4,
|
||||
float * restrict dst5, float * restrict dst6, float * restrict dst7,
|
||||
float mul, const float * restrict dot, uint32_t n, float * restrict sums) {
|
||||
HVX_Vector acc0 = Q6_V_vzero();
|
||||
HVX_Vector acc1 = Q6_V_vzero();
|
||||
HVX_Vector acc2 = Q6_V_vzero();
|
||||
HVX_Vector acc3 = Q6_V_vzero();
|
||||
HVX_Vector acc4 = Q6_V_vzero();
|
||||
HVX_Vector acc5 = Q6_V_vzero();
|
||||
HVX_Vector acc6 = Q6_V_vzero();
|
||||
HVX_Vector acc7 = Q6_V_vzero();
|
||||
const HVX_Vector vmul = hvx_vec_splat_f32(mul);
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
|
||||
HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + i * epv), vmul);
|
||||
HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + i * epv), vmul);
|
||||
HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + i * epv), vmul);
|
||||
HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + i * epv), vmul);
|
||||
HVX_Vector out4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + i * epv), vmul);
|
||||
HVX_Vector out5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + i * epv), vmul);
|
||||
HVX_Vector out6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + i * epv), vmul);
|
||||
HVX_Vector out7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + i * epv), vmul);
|
||||
|
||||
hvx_vmemu(dst0 + i * epv) = out0;
|
||||
hvx_vmemu(dst1 + i * epv) = out1;
|
||||
hvx_vmemu(dst2 + i * epv) = out2;
|
||||
hvx_vmemu(dst3 + i * epv) = out3;
|
||||
hvx_vmemu(dst4 + i * epv) = out4;
|
||||
hvx_vmemu(dst5 + i * epv) = out5;
|
||||
hvx_vmemu(dst6 + i * epv) = out6;
|
||||
hvx_vmemu(dst7 + i * epv) = out7;
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
|
||||
acc4 = hvx_vec_add_f32_f32(acc4, hvx_vec_mul_f32_f32(out4, vdot));
|
||||
acc5 = hvx_vec_add_f32_f32(acc5, hvx_vec_mul_f32_f32(out5, vdot));
|
||||
acc6 = hvx_vec_add_f32_f32(acc6, hvx_vec_mul_f32_f32(out6, vdot));
|
||||
acc7 = hvx_vec_add_f32_f32(acc7, hvx_vec_mul_f32_f32(out7, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_Vector zero = Q6_V_vzero();
|
||||
|
||||
HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vmul);
|
||||
HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + off), vmul);
|
||||
HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + off), vmul);
|
||||
HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + off), vmul);
|
||||
HVX_Vector out4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + off), vmul);
|
||||
HVX_Vector out5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + off), vmul);
|
||||
HVX_Vector out6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + off), vmul);
|
||||
HVX_Vector out7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + off), vmul);
|
||||
|
||||
hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
|
||||
hvx_vec_store_u(dst4 + off, tail * sizeof(float), out4);
|
||||
hvx_vec_store_u(dst5 + off, tail * sizeof(float), out5);
|
||||
hvx_vec_store_u(dst6 + off, tail * sizeof(float), out6);
|
||||
hvx_vec_store_u(dst7 + off, tail * sizeof(float), out7);
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
|
||||
acc4 = hvx_vec_add_f32_f32(acc4, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out4, vdot), zero));
|
||||
acc5 = hvx_vec_add_f32_f32(acc5, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out5, vdot), zero));
|
||||
acc6 = hvx_vec_add_f32_f32(acc6, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out6, vdot), zero));
|
||||
acc7 = hvx_vec_add_f32_f32(acc7, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out7, vdot), zero));
|
||||
}
|
||||
|
||||
HVX_Vector_x4 accA = { .v = { acc0, acc1, acc2, acc3 } };
|
||||
HVX_Vector_x4 accB = { .v = { acc4, acc5, acc6, acc7 } };
|
||||
hvx_vec_store_u(sums + 0, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accA));
|
||||
hvx_vec_store_u(sums + 4, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accB));
|
||||
}
|
||||
|
||||
static inline void gdn_add_scaled_dot8_f32(float * restrict dst0, float * restrict dst1,
|
||||
float * restrict dst2, float * restrict dst3, float * restrict dst4,
|
||||
float * restrict dst5, float * restrict dst6, float * restrict dst7,
|
||||
const float * restrict src, const float * restrict scale,
|
||||
const float * restrict dot, uint32_t n, float * restrict sums) {
|
||||
HVX_Vector acc0 = Q6_V_vzero();
|
||||
HVX_Vector acc1 = Q6_V_vzero();
|
||||
HVX_Vector acc2 = Q6_V_vzero();
|
||||
HVX_Vector acc3 = Q6_V_vzero();
|
||||
HVX_Vector acc4 = Q6_V_vzero();
|
||||
HVX_Vector acc5 = Q6_V_vzero();
|
||||
HVX_Vector acc6 = Q6_V_vzero();
|
||||
HVX_Vector acc7 = Q6_V_vzero();
|
||||
const HVX_Vector scale0 = hvx_vec_splat_f32(scale[0]);
|
||||
const HVX_Vector scale1 = hvx_vec_splat_f32(scale[1]);
|
||||
const HVX_Vector scale2 = hvx_vec_splat_f32(scale[2]);
|
||||
const HVX_Vector scale3 = hvx_vec_splat_f32(scale[3]);
|
||||
const HVX_Vector scale4 = hvx_vec_splat_f32(scale[4]);
|
||||
const HVX_Vector scale5 = hvx_vec_splat_f32(scale[5]);
|
||||
const HVX_Vector scale6 = hvx_vec_splat_f32(scale[6]);
|
||||
const HVX_Vector scale7 = hvx_vec_splat_f32(scale[7]);
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vs = hvx_vmem(src + i * epv);
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
|
||||
HVX_Vector out0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + i * epv), hvx_vec_mul_f32_f32(vs, scale0));
|
||||
HVX_Vector out1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + i * epv), hvx_vec_mul_f32_f32(vs, scale1));
|
||||
HVX_Vector out2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + i * epv), hvx_vec_mul_f32_f32(vs, scale2));
|
||||
HVX_Vector out3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + i * epv), hvx_vec_mul_f32_f32(vs, scale3));
|
||||
HVX_Vector out4 = hvx_vec_add_f32_f32(hvx_vmemu(dst4 + i * epv), hvx_vec_mul_f32_f32(vs, scale4));
|
||||
HVX_Vector out5 = hvx_vec_add_f32_f32(hvx_vmemu(dst5 + i * epv), hvx_vec_mul_f32_f32(vs, scale5));
|
||||
HVX_Vector out6 = hvx_vec_add_f32_f32(hvx_vmemu(dst6 + i * epv), hvx_vec_mul_f32_f32(vs, scale6));
|
||||
HVX_Vector out7 = hvx_vec_add_f32_f32(hvx_vmemu(dst7 + i * epv), hvx_vec_mul_f32_f32(vs, scale7));
|
||||
|
||||
hvx_vmemu(dst0 + i * epv) = out0;
|
||||
hvx_vmemu(dst1 + i * epv) = out1;
|
||||
hvx_vmemu(dst2 + i * epv) = out2;
|
||||
hvx_vmemu(dst3 + i * epv) = out3;
|
||||
hvx_vmemu(dst4 + i * epv) = out4;
|
||||
hvx_vmemu(dst5 + i * epv) = out5;
|
||||
hvx_vmemu(dst6 + i * epv) = out6;
|
||||
hvx_vmemu(dst7 + i * epv) = out7;
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
|
||||
acc4 = hvx_vec_add_f32_f32(acc4, hvx_vec_mul_f32_f32(out4, vdot));
|
||||
acc5 = hvx_vec_add_f32_f32(acc5, hvx_vec_mul_f32_f32(out5, vdot));
|
||||
acc6 = hvx_vec_add_f32_f32(acc6, hvx_vec_mul_f32_f32(out6, vdot));
|
||||
acc7 = hvx_vec_add_f32_f32(acc7, hvx_vec_mul_f32_f32(out7, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vs = hvx_vmem(src + off);
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_Vector zero = Q6_V_vzero();
|
||||
|
||||
HVX_Vector out0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + off), hvx_vec_mul_f32_f32(vs, scale0));
|
||||
HVX_Vector out1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + off), hvx_vec_mul_f32_f32(vs, scale1));
|
||||
HVX_Vector out2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + off), hvx_vec_mul_f32_f32(vs, scale2));
|
||||
HVX_Vector out3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + off), hvx_vec_mul_f32_f32(vs, scale3));
|
||||
HVX_Vector out4 = hvx_vec_add_f32_f32(hvx_vmemu(dst4 + off), hvx_vec_mul_f32_f32(vs, scale4));
|
||||
HVX_Vector out5 = hvx_vec_add_f32_f32(hvx_vmemu(dst5 + off), hvx_vec_mul_f32_f32(vs, scale5));
|
||||
HVX_Vector out6 = hvx_vec_add_f32_f32(hvx_vmemu(dst6 + off), hvx_vec_mul_f32_f32(vs, scale6));
|
||||
HVX_Vector out7 = hvx_vec_add_f32_f32(hvx_vmemu(dst7 + off), hvx_vec_mul_f32_f32(vs, scale7));
|
||||
|
||||
hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
|
||||
hvx_vec_store_u(dst4 + off, tail * sizeof(float), out4);
|
||||
hvx_vec_store_u(dst5 + off, tail * sizeof(float), out5);
|
||||
hvx_vec_store_u(dst6 + off, tail * sizeof(float), out6);
|
||||
hvx_vec_store_u(dst7 + off, tail * sizeof(float), out7);
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
|
||||
acc4 = hvx_vec_add_f32_f32(acc4, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out4, vdot), zero));
|
||||
acc5 = hvx_vec_add_f32_f32(acc5, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out5, vdot), zero));
|
||||
acc6 = hvx_vec_add_f32_f32(acc6, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out6, vdot), zero));
|
||||
acc7 = hvx_vec_add_f32_f32(acc7, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out7, vdot), zero));
|
||||
}
|
||||
|
||||
HVX_Vector_x4 accA = { .v = { acc0, acc1, acc2, acc3 } };
|
||||
HVX_Vector_x4 accB = { .v = { acc4, acc5, acc6, acc7 } };
|
||||
hvx_vec_store_u(sums + 0, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accA));
|
||||
hvx_vec_store_u(sums + 4, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accB));
|
||||
}
|
||||
|
||||
static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, void * data) {
|
||||
struct htp_gdn_context * gctx = (struct htp_gdn_context *) data;
|
||||
struct htp_ops_context * octx = gctx->octx;
|
||||
|
||||
const struct htp_tensor * q = octx->src[0];
|
||||
const struct htp_tensor * k = octx->src[1];
|
||||
const struct htp_tensor * v = octx->src[2];
|
||||
const struct htp_tensor * g = octx->src[3];
|
||||
const struct htp_tensor * beta = octx->src[4];
|
||||
const struct htp_tensor * state = octx->src[5];
|
||||
const struct htp_tensor * dst = octx->dst;
|
||||
|
||||
const uint32_t S_v = v->ne[0];
|
||||
const uint32_t H = v->ne[1];
|
||||
const uint32_t n_tokens = v->ne[2];
|
||||
const uint32_t n_seqs = v->ne[3];
|
||||
|
||||
const uint32_t total_rows = H * n_seqs;
|
||||
if (ith >= total_rows) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint32_t rq3 = n_seqs / q->ne[3];
|
||||
const uint32_t rk3 = n_seqs / k->ne[3];
|
||||
const float scale = 1.0f / sqrtf((float) S_v);
|
||||
|
||||
float * dst_base = (float *) (uintptr_t) dst->data;
|
||||
float * state_out_base = dst_base + (uint64_t) S_v * H * n_tokens * n_seqs;
|
||||
const float * state_in_base = (const float *) (uintptr_t) state->data;
|
||||
|
||||
const bool kda = (g->ne[0] == S_v);
|
||||
float local_gate[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
|
||||
float local_q[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
|
||||
float local_k[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
|
||||
float local_sums[4] __attribute__((aligned(128)));
|
||||
|
||||
for (uint32_t ir = ith; ir < total_rows; ir += nth) {
|
||||
const uint32_t iv1 = ir % H;
|
||||
const uint32_t iv3 = ir / H;
|
||||
|
||||
const uint32_t iq1 = iv1 % q->ne[1];
|
||||
const uint32_t ik1 = iv1 % k->ne[1];
|
||||
const uint32_t iq3 = iv3 / rq3;
|
||||
const uint32_t ik3 = iv3 / rk3;
|
||||
|
||||
float * s_out = state_out_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
|
||||
const float * s_in = state_in_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
|
||||
|
||||
memcpy(s_out, s_in, gctx->state_bytes);
|
||||
float * s_work = s_out;
|
||||
|
||||
float * attn_data = dst_base + ((uint64_t) iv3 * n_tokens * H + iv1) * S_v;
|
||||
|
||||
for (uint32_t t = 0; t < n_tokens; ++t) {
|
||||
const float * q_t = (const float *) ((const uint8_t *) (uintptr_t) q->data +
|
||||
(uint64_t) iq3 * q->nb[3] + (uint64_t) t * q->nb[2] + (uint64_t) iq1 * q->nb[1]);
|
||||
const float * k_t = (const float *) ((const uint8_t *) (uintptr_t) k->data +
|
||||
(uint64_t) ik3 * k->nb[3] + (uint64_t) t * k->nb[2] + (uint64_t) ik1 * k->nb[1]);
|
||||
const float * v_t = (const float *) ((const uint8_t *) (uintptr_t) v->data +
|
||||
(uint64_t) iv3 * v->nb[3] + (uint64_t) t * v->nb[2] + (uint64_t) iv1 * v->nb[1]);
|
||||
const float * g_t = (const float *) ((const uint8_t *) (uintptr_t) g->data +
|
||||
(uint64_t) iv3 * g->nb[3] + (uint64_t) t * g->nb[2] + (uint64_t) iv1 * g->nb[1]);
|
||||
const float beta_val = *(const float *) ((const uint8_t *) (uintptr_t) beta->data +
|
||||
(uint64_t) iv3 * beta->nb[3] + (uint64_t) t * beta->nb[2] + (uint64_t) iv1 * beta->nb[1]);
|
||||
|
||||
memcpy(local_q, q_t, (size_t) S_v * sizeof(float));
|
||||
memcpy(local_k, k_t, (size_t) S_v * sizeof(float));
|
||||
|
||||
if (kda) {
|
||||
hvx_exp_f32((uint8_t *) local_gate, (const uint8_t *) g_t, S_v, false);
|
||||
|
||||
uint32_t j = 0;
|
||||
for (; j + 4 <= S_v; j += 4) {
|
||||
float * row0 = s_work + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work + (uint64_t) (j + 3) * S_v;
|
||||
gdn_mul_dot4_f32(row0, row1, row2, row3, local_gate, local_k, S_v, local_sums);
|
||||
float local_delta_b[4] __attribute__((aligned(128)));
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
|
||||
}
|
||||
gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
attn_data[j + r] = local_sums[r] * scale;
|
||||
}
|
||||
}
|
||||
for (; j < S_v; ++j) {
|
||||
float * row = s_work + (uint64_t) j * S_v;
|
||||
const float sum = gdn_mul_dot_f32(row, local_gate, local_k, S_v);
|
||||
const float dj = (v_t[j] - sum) * beta_val;
|
||||
attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale;
|
||||
}
|
||||
} else {
|
||||
const float gate = expf(g_t[0]);
|
||||
uint32_t j = 0;
|
||||
for (; j + 4 <= S_v; j += 4) {
|
||||
float * row0 = s_work + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work + (uint64_t) (j + 3) * S_v;
|
||||
gdn_mul_scalar_dot4_f32(row0, row1, row2, row3, gate, local_k, S_v, local_sums);
|
||||
float local_delta_b[4] __attribute__((aligned(128)));
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
|
||||
}
|
||||
gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
attn_data[j + r] = local_sums[r] * scale;
|
||||
}
|
||||
}
|
||||
for (; j < S_v; ++j) {
|
||||
float * row = s_work + (uint64_t) j * S_v;
|
||||
const float sum = gdn_mul_scalar_dot_f32(row, gate, local_k, S_v);
|
||||
const float dj = (v_t[j] - sum) * beta_val;
|
||||
attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale;
|
||||
}
|
||||
}
|
||||
|
||||
attn_data += (uint64_t) S_v * H;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, void * data) {
|
||||
struct htp_gdn_context * gctx = (struct htp_gdn_context *) data;
|
||||
struct htp_ops_context * octx = gctx->octx;
|
||||
|
||||
const struct htp_tensor * q = octx->src[0];
|
||||
const struct htp_tensor * k = octx->src[1];
|
||||
const struct htp_tensor * v = octx->src[2];
|
||||
const struct htp_tensor * g = octx->src[3];
|
||||
const struct htp_tensor * beta = octx->src[4];
|
||||
const struct htp_tensor * state = octx->src[5];
|
||||
const struct htp_tensor * dst = octx->dst;
|
||||
|
||||
const uint32_t S_v = v->ne[0];
|
||||
const uint32_t H = v->ne[1];
|
||||
const uint32_t n_seqs = v->ne[3];
|
||||
|
||||
const uint32_t total_rows = H * n_seqs;
|
||||
if (ith >= total_rows) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint32_t rq3 = n_seqs / q->ne[3];
|
||||
const uint32_t rk3 = n_seqs / k->ne[3];
|
||||
const float scale = 1.0f / sqrtf((float) S_v);
|
||||
|
||||
float * dst_base = (float *) (uintptr_t) dst->data;
|
||||
float * state_out_base = dst_base + (uint64_t) S_v * H * n_seqs;
|
||||
const float * state_in_base = (const float *) (uintptr_t) state->data;
|
||||
|
||||
const bool kda = (g->ne[0] == S_v);
|
||||
float local_gate[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
|
||||
float local_q[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
|
||||
float local_k[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
|
||||
float local_sums[8] __attribute__((aligned(128)));
|
||||
|
||||
dma_queue * dma = octx->ctx->dma[ith];
|
||||
|
||||
uint8_t * spad = NULL;
|
||||
if (gctx->use_vtcm) {
|
||||
spad = gctx->vtcm_state_base + gctx->vtcm_state_per_thread * ith;
|
||||
}
|
||||
|
||||
for (uint32_t ir = ith; ir < total_rows; ir += nth) {
|
||||
const uint32_t iv1 = ir % H;
|
||||
const uint32_t iv3 = ir / H;
|
||||
|
||||
const uint32_t iq1 = iv1 % q->ne[1];
|
||||
const uint32_t ik1 = iv1 % k->ne[1];
|
||||
const uint32_t iq3 = iv3 / rq3;
|
||||
const uint32_t ik3 = iv3 / rk3;
|
||||
|
||||
float * s_out = state_out_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
|
||||
const float * s_in = state_in_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
|
||||
float * s_work;
|
||||
|
||||
if (spad) {
|
||||
dma_queue_push(dma, dma_make_ptr(spad, s_in),
|
||||
S_v * sizeof(float), S_v * sizeof(float),
|
||||
S_v * sizeof(float), S_v);
|
||||
dma_queue_pop(dma);
|
||||
s_work = (float *) spad;
|
||||
} else {
|
||||
s_work = s_out;
|
||||
memcpy(s_work, s_in, gctx->state_bytes);
|
||||
}
|
||||
|
||||
float * attn_data = dst_base + ((uint64_t) iv3 * H + iv1) * S_v;
|
||||
|
||||
const float * q_t = (const float *) ((const uint8_t *) (uintptr_t) q->data +
|
||||
(uint64_t) iq3 * q->nb[3] + (uint64_t) iq1 * q->nb[1]);
|
||||
const float * k_t = (const float *) ((const uint8_t *) (uintptr_t) k->data +
|
||||
(uint64_t) ik3 * k->nb[3] + (uint64_t) ik1 * k->nb[1]);
|
||||
const float * v_t = (const float *) ((const uint8_t *) (uintptr_t) v->data +
|
||||
(uint64_t) iv3 * v->nb[3] + (uint64_t) iv1 * v->nb[1]);
|
||||
const float * g_t = (const float *) ((const uint8_t *) (uintptr_t) g->data +
|
||||
(uint64_t) iv3 * g->nb[3] + (uint64_t) iv1 * g->nb[1]);
|
||||
const float beta_val = *(const float *) ((const uint8_t *) (uintptr_t) beta->data +
|
||||
(uint64_t) iv3 * beta->nb[3] + (uint64_t) iv1 * beta->nb[1]);
|
||||
|
||||
memcpy(local_q, q_t, (size_t) S_v * sizeof(float));
|
||||
memcpy(local_k, k_t, (size_t) S_v * sizeof(float));
|
||||
|
||||
if (kda) {
|
||||
hvx_exp_f32((uint8_t *) local_gate, (const uint8_t *) g_t, S_v, false);
|
||||
|
||||
uint32_t j = 0;
|
||||
for (; j + 8 <= S_v; j += 8) {
|
||||
float * row0 = s_work + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work + (uint64_t) (j + 3) * S_v;
|
||||
float * row4 = s_work + (uint64_t) (j + 4) * S_v;
|
||||
float * row5 = s_work + (uint64_t) (j + 5) * S_v;
|
||||
float * row6 = s_work + (uint64_t) (j + 6) * S_v;
|
||||
float * row7 = s_work + (uint64_t) (j + 7) * S_v;
|
||||
gdn_mul_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
|
||||
local_gate, local_k, S_v, local_sums);
|
||||
float local_delta_b[8] __attribute__((aligned(128)));
|
||||
for (uint32_t r = 0; r < 8; ++r) {
|
||||
local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
|
||||
}
|
||||
gdn_add_scaled_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
|
||||
local_k, local_delta_b, local_q, S_v, local_sums);
|
||||
for (uint32_t r = 0; r < 8; ++r) {
|
||||
attn_data[j + r] = local_sums[r] * scale;
|
||||
}
|
||||
}
|
||||
for (; j + 4 <= S_v; j += 4) {
|
||||
float * row0 = s_work + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work + (uint64_t) (j + 3) * S_v;
|
||||
gdn_mul_dot4_f32(row0, row1, row2, row3, local_gate, local_k, S_v, local_sums);
|
||||
float local_delta_b[4] __attribute__((aligned(128)));
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
|
||||
}
|
||||
gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
attn_data[j + r] = local_sums[r] * scale;
|
||||
}
|
||||
}
|
||||
for (; j < S_v; ++j) {
|
||||
float * row = s_work + (uint64_t) j * S_v;
|
||||
const float sum = gdn_mul_dot_f32(row, local_gate, local_k, S_v);
|
||||
const float dj = (v_t[j] - sum) * beta_val;
|
||||
attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale;
|
||||
}
|
||||
} else {
|
||||
const float gate = expf(g_t[0]);
|
||||
uint32_t j = 0;
|
||||
for (; j + 8 <= S_v; j += 8) {
|
||||
float * row0 = s_work + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work + (uint64_t) (j + 3) * S_v;
|
||||
float * row4 = s_work + (uint64_t) (j + 4) * S_v;
|
||||
float * row5 = s_work + (uint64_t) (j + 5) * S_v;
|
||||
float * row6 = s_work + (uint64_t) (j + 6) * S_v;
|
||||
float * row7 = s_work + (uint64_t) (j + 7) * S_v;
|
||||
gdn_mul_scalar_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
|
||||
gate, local_k, S_v, local_sums);
|
||||
float local_delta_b[8] __attribute__((aligned(128)));
|
||||
for (uint32_t r = 0; r < 8; ++r) {
|
||||
local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
|
||||
}
|
||||
gdn_add_scaled_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
|
||||
local_k, local_delta_b, local_q, S_v, local_sums);
|
||||
for (uint32_t r = 0; r < 8; ++r) {
|
||||
attn_data[j + r] = local_sums[r] * scale;
|
||||
}
|
||||
}
|
||||
for (; j + 4 <= S_v; j += 4) {
|
||||
float * row0 = s_work + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work + (uint64_t) (j + 3) * S_v;
|
||||
gdn_mul_scalar_dot4_f32(row0, row1, row2, row3, gate, local_k, S_v, local_sums);
|
||||
float local_delta_b[4] __attribute__((aligned(128)));
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
|
||||
}
|
||||
gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
attn_data[j + r] = local_sums[r] * scale;
|
||||
}
|
||||
}
|
||||
for (; j < S_v; ++j) {
|
||||
float * row = s_work + (uint64_t) j * S_v;
|
||||
const float sum = gdn_mul_scalar_dot_f32(row, gate, local_k, S_v);
|
||||
const float dj = (v_t[j] - sum) * beta_val;
|
||||
attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale;
|
||||
}
|
||||
}
|
||||
|
||||
if (spad) {
|
||||
dma_queue_push(dma, dma_make_ptr(s_out, spad),
|
||||
S_v * sizeof(float), S_v * sizeof(float),
|
||||
S_v * sizeof(float), S_v);
|
||||
dma_queue_pop(dma);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int op_gated_delta_net(struct htp_ops_context * octx) {
|
||||
const struct htp_tensor * q = octx->src[0];
|
||||
const struct htp_tensor * k = octx->src[1];
|
||||
const struct htp_tensor * v = octx->src[2];
|
||||
const struct htp_tensor * g = octx->src[3];
|
||||
const struct htp_tensor * beta = octx->src[4];
|
||||
const struct htp_tensor * state = octx->src[5];
|
||||
const struct htp_tensor * dst = octx->dst;
|
||||
|
||||
if (!q || !k || !v || !g || !beta || !state || !dst) {
|
||||
return HTP_STATUS_INVAL_PARAMS;
|
||||
}
|
||||
|
||||
if (q->type != HTP_TYPE_F32 || k->type != HTP_TYPE_F32 || v->type != HTP_TYPE_F32 ||
|
||||
g->type != HTP_TYPE_F32 || beta->type != HTP_TYPE_F32 || state->type != HTP_TYPE_F32 ||
|
||||
dst->type != HTP_TYPE_F32) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
const uint32_t S_v = v->ne[0];
|
||||
const uint32_t H = v->ne[1];
|
||||
const uint32_t n_tokens = v->ne[2];
|
||||
const uint32_t n_seqs = v->ne[3];
|
||||
|
||||
if (S_v == 0 || S_v > HTP_GDN_MAX_SV || H == 0 || n_tokens == 0 || n_seqs == 0) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
if ((g->ne[0] != 1 && g->ne[0] != S_v) || beta->ne[0] != 1) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
if (q->ne[0] != S_v || k->ne[0] != S_v || q->ne[1] == 0 || k->ne[1] == 0 ||
|
||||
q->ne[2] != n_tokens || k->ne[2] != n_tokens || q->ne[3] == 0 || k->ne[3] == 0 ||
|
||||
(n_seqs % q->ne[3]) != 0 || (n_seqs % k->ne[3]) != 0) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
if (state->ne[0] * state->ne[1] * state->ne[2] * state->ne[3] != S_v * S_v * H * n_seqs) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
struct htp_gdn_context gctx;
|
||||
gctx.octx = octx;
|
||||
gctx.rows_per_thread = (H * n_seqs + octx->n_threads - 1) / octx->n_threads;
|
||||
gctx.state_bytes = (size_t) S_v * S_v * sizeof(float);
|
||||
|
||||
size_t state_aligned = (size_t) S_v * S_v * sizeof(float);
|
||||
state_aligned = (state_aligned + 127) & ~(size_t)127;
|
||||
|
||||
gctx.use_vtcm = false;
|
||||
gctx.vtcm_state_base = NULL;
|
||||
gctx.vtcm_state_per_thread = 0;
|
||||
|
||||
if (n_tokens == 1 && octx->ctx->vtcm_base) {
|
||||
size_t vtcm_total = state_aligned * octx->n_threads;
|
||||
if (octx->ctx->vtcm_size >= vtcm_total) {
|
||||
gctx.use_vtcm = true;
|
||||
gctx.vtcm_state_base = octx->ctx->vtcm_base;
|
||||
gctx.vtcm_state_per_thread = state_aligned;
|
||||
}
|
||||
}
|
||||
|
||||
if (n_tokens == 1) {
|
||||
worker_pool_run_func(octx->ctx->worker_pool, gated_delta_net_f32_tg_thread, &gctx, octx->n_threads);
|
||||
} else {
|
||||
worker_pool_run_func(octx->ctx->worker_pool, gated_delta_net_f32_pp_thread, &gctx, octx->n_threads);
|
||||
}
|
||||
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
@@ -760,8 +760,9 @@ static void fa_softmax_thread(unsigned int n, unsigned int i, void * data) {
|
||||
// ALiBi slopes — only needed when has_alibi (scheme A)
|
||||
HVX_Vector v_slope0, v_slope1;
|
||||
if (args->has_alibi) {
|
||||
v_slope0 = hvx_vec_splat_f16(args->slopes[r + 0]);
|
||||
v_slope1 = (r + 1 < (int) n_rows_g) ? hvx_vec_splat_f16(args->slopes[r + 1]) : Q6_V_vzero();
|
||||
HVX_Vector v_s = hvx_vmemu(args->slopes + r);
|
||||
v_slope0 = hvx_vec_repl_f16(v_s);
|
||||
v_slope1 = (r + 1 < (int) n_rows_g) ? hvx_vec_repl_f16(Q6_V_vror_VR(v_s, 2)) : Q6_V_vzero();
|
||||
}
|
||||
|
||||
const HVX_Vector v_threshold = Q6_Vh_vsplat_R(0xcc00); // fp16 -16.0 (hoisted outside for-c)
|
||||
|
||||
@@ -180,12 +180,10 @@ next_nc:
|
||||
// Dequantize one x4x2 Q4_0 group (32 elements from 32 packed bytes) -> 32 FP16 in first 64 bytes.
|
||||
// In x4x2, sub-blocks 0..3 use lower nibbles, sub-blocks 4..7 use upper nibbles
|
||||
// of the same 32 packed bytes.
|
||||
static inline HVX_Vector dequantize_x4x2_q4_0_group_hvx(
|
||||
const uint8_t *packed_32, bool upper_nibbles,
|
||||
const __fp16 *scale, const HVX_Vector vlut_cvt) {
|
||||
static inline HVX_Vector dequantize_x4x2_q4_0_group_hvx(const uint8_t *packed_32, bool upper_nibbles, const __fp16 *scale, const HVX_Vector vlut_cvt) {
|
||||
HVX_Vector vq = hvx_vmemu(packed_32);
|
||||
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
||||
HVX_Vector v_scales = hvx_vec_splat_f16(*scale);
|
||||
HVX_Vector v_scales = hvx_vec_repl_f16(hvx_vmemu(scale));
|
||||
// q4x4x2 stores two int4 values per byte. Keep only the selected nibble.
|
||||
HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
|
||||
v_quants = Q6_V_vand_VV(v_quants, mask_h4);
|
||||
@@ -223,9 +221,10 @@ static inline void dequantize_x4x2_q4_0_x4groups_hvx(
|
||||
HVX_Vector v_hi = Q6_V_hi_W(vp); // [group2: 32 fp16 | group3: 32 fp16]
|
||||
|
||||
// Build per-group scale vectors: first 64 bytes use scale_a, last 64 use scale_b
|
||||
HVX_VectorPred q64 = Q6_Q_vsetq_R(64);
|
||||
HVX_Vector v_sc01 = Q6_V_vmux_QVV(q64, hvx_vec_splat_f16(scales_4[0]), hvx_vec_splat_f16(scales_4[1]));
|
||||
HVX_Vector v_sc23 = Q6_V_vmux_QVV(q64, hvx_vec_splat_f16(scales_4[2]), hvx_vec_splat_f16(scales_4[3]));
|
||||
volatile HVX_Vector vscale = hvx_vmemu(scales_4);
|
||||
|
||||
HVX_Vector v_sc01 = hvx_vec_repl_2x_f16(vscale);
|
||||
HVX_Vector v_sc23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vscale, 4));
|
||||
|
||||
v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01));
|
||||
v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));
|
||||
@@ -237,10 +236,10 @@ static inline void dequantize_x4x2_q4_0_x4groups_hvx(
|
||||
|
||||
// Dequantize one x4x2 Q8_0 group (32 int8 quants) -> 32 FP16 in first 64 bytes.
|
||||
static inline HVX_Vector dequantize_x4x2_q8_0_group_hvx(const int8_t *quants_32, const __fp16 *scale) {
|
||||
HVX_Vector vq = hvx_vmemu(quants_32);
|
||||
HVX_Vector v_scales = hvx_vec_splat_f16(*scale);
|
||||
HVX_Vector v0 = Q6_V_lo_W(Q6_Wh_vunpack_Vb(vq));
|
||||
HVX_Vector v_hf = Q6_Vhf_equals_Vh(v0);
|
||||
HVX_Vector vq = hvx_vmemu(quants_32);
|
||||
HVX_Vector v_scales = hvx_vec_repl_f16(hvx_vmemu(scale));
|
||||
HVX_Vector v0 = Q6_V_lo_W(Q6_Wh_vunpack_Vb(vq));
|
||||
HVX_Vector v_hf = Q6_Vhf_equals_Vh(v0);
|
||||
return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales));
|
||||
}
|
||||
|
||||
@@ -521,12 +520,8 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
|
||||
const uint8_t *r0 = vtcm_src + row0 * row_stride;
|
||||
const uint8_t *r1 = vtcm_src + row1 * row_stride;
|
||||
|
||||
HVX_Vector v0 = dequantize_x4x2_q8_0_group_hvx(
|
||||
(const int8_t *)(r0 + byte_off), (const __fp16 *)(r0 + scale_off));
|
||||
HVX_Vector v1 = (row1 < n_cols)
|
||||
? dequantize_x4x2_q8_0_group_hvx(
|
||||
(const int8_t *)(r1 + byte_off), (const __fp16 *)(r1 + scale_off))
|
||||
: Q6_V_vzero();
|
||||
HVX_Vector v0 = dequantize_x4x2_q8_0_group_hvx((const int8_t *)(r0 + byte_off), (const __fp16 *)(r0 + scale_off));
|
||||
HVX_Vector v1 = (row1 < n_cols) ? dequantize_x4x2_q8_0_group_hvx((const int8_t *)(r1 + byte_off), (const __fp16 *)(r1 + scale_off)) : Q6_V_vzero();
|
||||
|
||||
Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0);
|
||||
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
|
||||
|
||||
@@ -77,16 +77,18 @@ static inline void hmx_interleave_rows_to_tiles(__fp16 * restrict vtcm_dst,
|
||||
const HVX_Vector v_off0 = Q6_Vw_vadd_VwVw(v_scat_base, Q6_V_vsplat_R(local_r * 4));
|
||||
const HVX_Vector v_off1 = Q6_Vw_vadd_VwVw(v_off0, v_scat_step);
|
||||
|
||||
__fp16 * tile_base = vtcm_dst + (size_t) ct * n_k_tiles * HMX_FP16_TILE_N_ELMS;
|
||||
const uint8_t * p0 = (const uint8_t *) (vtcm_src + r * src_stride);
|
||||
const uint8_t * p1 = next_row_valid ? (const uint8_t *) (vtcm_src + (r + 1) * src_stride) : NULL;
|
||||
__fp16 * tile_base = vtcm_dst + (size_t) ct * n_k_tiles * HMX_FP16_TILE_N_ELMS;
|
||||
const uint8_t * p0 = (const uint8_t *) (vtcm_src + r * src_stride);
|
||||
const uint8_t * p1 = next_row_valid ? (const uint8_t *) (vtcm_src + (r + 1) * src_stride) : NULL;
|
||||
|
||||
assert(hex_is_aligned(p0, 128));
|
||||
assert(hex_is_aligned(p1, 128));
|
||||
assert(c_byte_step % 128 == 0);
|
||||
|
||||
if (p1) {
|
||||
for (int i = 0; i < n_c_iters; ++i) {
|
||||
HVX_Vector v0 = hvx_vmemu(p0);
|
||||
p0 += c_byte_step;
|
||||
HVX_Vector v1 = hvx_vmemu(p1);
|
||||
p1 += c_byte_step;
|
||||
HVX_Vector v0 = hvx_vmem(p0); p0 += c_byte_step;
|
||||
HVX_Vector v1 = hvx_vmem(p1); p1 += c_byte_step;
|
||||
Q6_vscatter_RMVwV((size_t) tile_base, pair_region, v_off0, v0);
|
||||
Q6_vscatter_RMVwV((size_t) tile_base, pair_region, v_off1, v1);
|
||||
tile_base += dst_step;
|
||||
@@ -94,8 +96,7 @@ static inline void hmx_interleave_rows_to_tiles(__fp16 * restrict vtcm_dst,
|
||||
} else {
|
||||
const HVX_Vector vzero = Q6_V_vzero();
|
||||
for (int i = 0; i < n_c_iters; ++i) {
|
||||
HVX_Vector v0 = hvx_vmemu(p0);
|
||||
p0 += c_byte_step;
|
||||
HVX_Vector v0 = hvx_vmem(p0); p0 += c_byte_step;
|
||||
Q6_vscatter_RMVwV((size_t) tile_base, pair_region, v_off0, v0);
|
||||
Q6_vscatter_RMVwV((size_t) tile_base, pair_region, v_off1, vzero);
|
||||
tile_base += dst_step;
|
||||
@@ -116,16 +117,14 @@ static inline void hmx_interleave_rows_to_tiles(__fp16 * restrict vtcm_dst,
|
||||
const HVX_Vector v_off0 = Q6_Vw_vadd_VwVw(v_scat_base, Q6_V_vsplat_R(local_r * 4));
|
||||
const HVX_Vector v_off1 = Q6_Vw_vadd_VwVw(v_off0, v_scat_step);
|
||||
|
||||
__fp16 * tile_base = vtcm_dst + (size_t) ct * n_k_tiles * HMX_FP16_TILE_N_ELMS;
|
||||
const uint8_t * p0 = (const uint8_t *) (vtcm_src + r * src_stride);
|
||||
const uint8_t * p1 = next_row_valid ? (const uint8_t *) (vtcm_src + (r + 1) * src_stride) : NULL;
|
||||
__fp16 * tile_base = vtcm_dst + (size_t) ct * n_k_tiles * HMX_FP16_TILE_N_ELMS;
|
||||
const uint8_t * p0 = (const uint8_t *) (vtcm_src + r * src_stride);
|
||||
const uint8_t * p1 = next_row_valid ? (const uint8_t *) (vtcm_src + (r + 1) * src_stride) : NULL;
|
||||
|
||||
if (p1) {
|
||||
for (int i = 0; i < n_c_iters; ++i) {
|
||||
HVX_Vector v0 = hvx_vmemu(p0);
|
||||
p0 += c_byte_step;
|
||||
HVX_Vector v1 = hvx_vmemu(p1);
|
||||
p1 += c_byte_step;
|
||||
HVX_Vector v0 = hvx_vmemu(p0); p0 += c_byte_step;
|
||||
HVX_Vector v1 = hvx_vmemu(p1); p1 += c_byte_step;
|
||||
Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_base, single_region, v_off0, v0);
|
||||
Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_base, single_region, v_off1, v1);
|
||||
tile_base += dst_step;
|
||||
@@ -133,8 +132,7 @@ static inline void hmx_interleave_rows_to_tiles(__fp16 * restrict vtcm_dst,
|
||||
} else {
|
||||
const HVX_Vector vzero = Q6_V_vzero();
|
||||
for (int i = 0; i < n_c_iters; ++i) {
|
||||
HVX_Vector v0 = hvx_vmemu(p0);
|
||||
p0 += c_byte_step;
|
||||
HVX_Vector v0 = hvx_vmemu(p0); p0 += c_byte_step;
|
||||
Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_base, single_region, v_off0, v0);
|
||||
Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_base, single_region, v_off1, vzero);
|
||||
tile_base += dst_step;
|
||||
|
||||
@@ -106,5 +106,6 @@ int op_cumsum(struct htp_ops_context * octx);
|
||||
int op_fill(struct htp_ops_context * octx);
|
||||
int op_diag(struct htp_ops_context * octx);
|
||||
int op_solve_tri(struct htp_ops_context * octx);
|
||||
int op_gated_delta_net(struct htp_ops_context * octx);
|
||||
|
||||
#endif /* HTP_CTX_H */
|
||||
|
||||
@@ -62,6 +62,7 @@ enum htp_op_code {
|
||||
HTP_OP_UNARY_EXP,
|
||||
HTP_OP_UNARY_NEG,
|
||||
HTP_OP_UNARY_SOFTPLUS,
|
||||
HTP_OP_UNARY_TANH,
|
||||
HTP_OP_GLU_SWIGLU,
|
||||
HTP_OP_GLU_SWIGLU_OAI,
|
||||
HTP_OP_GLU_GEGLU,
|
||||
@@ -83,6 +84,9 @@ enum htp_op_code {
|
||||
HTP_OP_FILL,
|
||||
HTP_OP_DIAG,
|
||||
HTP_OP_SOLVE_TRI,
|
||||
HTP_OP_L2_NORM,
|
||||
HTP_OP_GATED_DELTA_NET,
|
||||
|
||||
HTP_OP_INVALID
|
||||
};
|
||||
|
||||
|
||||
74
ggml/src/ggml-hexagon/htp/hvx-repl.h
Normal file
74
ggml/src/ggml-hexagon/htp/hvx-repl.h
Normal file
@@ -0,0 +1,74 @@
|
||||
#ifndef HVX_REPL_H
|
||||
#define HVX_REPL_H
|
||||
|
||||
#include <assert.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "hvx-base.h"
|
||||
|
||||
static inline HVX_Vector hvx_vec_repl(HVX_Vector v, const uint8_t * ctrl) {
|
||||
return Q6_V_vdelta_VV(v, hvx_vmem(ctrl));
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_vec_repl_u32(HVX_Vector v) {
|
||||
// vdelta control to replicate first 4 bytes across all lanes
|
||||
static const uint8_t __attribute__((aligned(128))) repl[128] = {
|
||||
0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||
0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||
0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||
0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||
0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||
0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||
0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||
0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||
};
|
||||
return hvx_vec_repl(v, repl);
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_vec_repl_f32(HVX_Vector v) {
|
||||
// vdelta control to replicate first 4 bytes across all lanes
|
||||
static const uint8_t __attribute__((aligned(128))) repl[128] = {
|
||||
0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||
0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||
0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||
0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||
0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||
0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||
0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||
0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||
};
|
||||
return hvx_vec_repl(v, repl);
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_vec_repl_f16(HVX_Vector v) {
|
||||
// vdelta control to replicate first two bytes across all lanes
|
||||
static const uint8_t __attribute__((aligned(128))) repl[128] = {
|
||||
0x00, 0x00, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
|
||||
0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
|
||||
0x20, 0x20, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
|
||||
0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
|
||||
0x40, 0x40, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
|
||||
0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
|
||||
0x20, 0x20, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
|
||||
0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
|
||||
};
|
||||
return hvx_vec_repl(v, repl);
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_vec_repl_2x_f16(HVX_Vector v) {
|
||||
// vdelta control to splat a pair of f16s: first half = f16[0], second half = f16[1]
|
||||
static const uint8_t __attribute__((aligned(128))) repl[128] = {
|
||||
0x00, 0x00, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
|
||||
0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
|
||||
0x20, 0x20, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
|
||||
0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
|
||||
0x02, 0x02, 0x40, 0x40, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04,
|
||||
0x02, 0x02, 0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04,
|
||||
0x02, 0x02, 0x20, 0x20, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04,
|
||||
0x02, 0x02, 0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04,
|
||||
};
|
||||
return hvx_vec_repl(v, repl);
|
||||
}
|
||||
|
||||
#endif // HVX_REPL_H
|
||||
@@ -5,6 +5,7 @@
|
||||
|
||||
#include "hvx-types.h"
|
||||
#include "hvx-copy.h"
|
||||
#include "hvx-repl.h"
|
||||
#include "hvx-scale.h"
|
||||
#include "hvx-exp.h"
|
||||
#include "hvx-inverse.h"
|
||||
|
||||
@@ -542,6 +542,8 @@ static int execute_op(struct htp_ops_context * octx) {
|
||||
case HTP_OP_UNARY_SIGMOID:
|
||||
case HTP_OP_UNARY_NEG:
|
||||
case HTP_OP_UNARY_EXP:
|
||||
case HTP_OP_UNARY_TANH:
|
||||
case HTP_OP_L2_NORM:
|
||||
return op_unary(octx);
|
||||
|
||||
case HTP_OP_UNARY_SILU:
|
||||
@@ -593,6 +595,9 @@ static int execute_op(struct htp_ops_context * octx) {
|
||||
case HTP_OP_SOLVE_TRI:
|
||||
return op_solve_tri(octx);
|
||||
|
||||
case HTP_OP_GATED_DELTA_NET:
|
||||
return op_gated_delta_net(octx);
|
||||
|
||||
case HTP_OP_INVALID:
|
||||
break;
|
||||
|
||||
|
||||
@@ -298,6 +298,96 @@ static void softplus_f32(const float * restrict src,
|
||||
}
|
||||
}
|
||||
|
||||
// --- L2_NORM HVX kernel ---
|
||||
// Computes y[i] = x[i] / fmax(sqrt(sum(x[j]^2)), epsilon) for each row.
|
||||
// scale = 1/fmax(sqrt(sum), epsilon) is computed entirely in HVX registers
|
||||
// using rsqrt + inverse to avoid scalar extraction.
|
||||
static void hvx_fast_l2_norm_f32(const uint8_t * restrict src,
|
||||
uint8_t * restrict dst,
|
||||
uint8_t * restrict pad,
|
||||
const int num_elems,
|
||||
float epsilon) {
|
||||
(void)pad;
|
||||
|
||||
const HVX_Vector * restrict v_src = (HVX_Vector *) src;
|
||||
HVX_Vector * restrict v_dst = (HVX_Vector *) dst;
|
||||
|
||||
HVX_Vector sum_v = hvx_vec_splat_f32(0.0f);
|
||||
|
||||
const int nvec = num_elems / VLEN_FP32;
|
||||
const int nloe = num_elems % VLEN_FP32;
|
||||
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < nvec; i++) {
|
||||
HVX_Vector v1 = v_src[i];
|
||||
HVX_Vector sq = Q6_Vqf32_vmpy_VsfVsf(v1, v1);
|
||||
sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, sq);
|
||||
}
|
||||
|
||||
// Include tail elements in the sum-of-squares using a predicate mask
|
||||
if (nloe > 0) {
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);
|
||||
HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]);
|
||||
HVX_Vector sq = Q6_Vqf32_vmpy_VsfVsf(v1, v1);
|
||||
sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, sq);
|
||||
}
|
||||
|
||||
// Compute scale = 1/fmax(sqrt(sum), epsilon) entirely in HVX registers.
|
||||
// hvx_vec_rsqrt_f32 + hvx_vec_inverse_f32 avoids scalar extraction.
|
||||
HVX_Vector sum_sf = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v));
|
||||
HVX_Vector rsqrt_v = hvx_vec_rsqrt_f32(sum_sf); // 1/sqrt(sum)
|
||||
HVX_Vector sqrt_v = hvx_vec_inverse_f32(rsqrt_v); // sqrt(sum)
|
||||
HVX_Vector epsilon_v = hvx_vec_splat_f32(epsilon);
|
||||
HVX_Vector denom_v = Q6_Vsf_vmax_VsfVsf(sqrt_v, epsilon_v); // fmax(sqrt(sum), epsilon)
|
||||
HVX_Vector scale_v = hvx_vec_inverse_f32(denom_v); // 1/fmax(sqrt(sum), epsilon)
|
||||
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < nvec; i++) {
|
||||
HVX_Vector v1 = v_src[i];
|
||||
v_dst[i] = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(v1, scale_v));
|
||||
}
|
||||
|
||||
if (nloe > 0) {
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);
|
||||
HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]);
|
||||
HVX_Vector result = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(v1, scale_v));
|
||||
hvx_vec_store_a(&v_dst[nvec], nloe * 4, result);
|
||||
}
|
||||
}
|
||||
|
||||
static void l2_norm_f32(const float * restrict src,
|
||||
float * restrict dst,
|
||||
uint8_t * restrict spad,
|
||||
const uint32_t num_rows,
|
||||
const uint32_t row_elems,
|
||||
const size_t row_size,
|
||||
int32_t * op_params) {
|
||||
float epsilon = 0.f;
|
||||
memcpy(&epsilon, op_params, sizeof(float));
|
||||
|
||||
for (uint32_t ir = 0; ir < num_rows; ir++) {
|
||||
const float * restrict src_f = (const float *)((const uint8_t *)src + (ir * row_size));
|
||||
float * restrict dst_f = (float *)((uint8_t *)dst + (ir * row_size));
|
||||
|
||||
hvx_fast_l2_norm_f32((const uint8_t *)src_f, (uint8_t *)dst_f, spad, row_elems, epsilon);
|
||||
}
|
||||
}
|
||||
|
||||
static void tanh_f32(const float * restrict src,
|
||||
float * restrict dst,
|
||||
uint8_t * restrict spad,
|
||||
const uint32_t num_rows,
|
||||
const uint32_t row_elems,
|
||||
const size_t row_size,
|
||||
int32_t * op_params) {
|
||||
for (uint32_t ir = 0; ir < num_rows; ir++) {
|
||||
const uint8_t * restrict src_local = (const uint8_t *)src + (ir * row_size);
|
||||
uint8_t * restrict dst_local = (uint8_t *)dst + (ir * row_size);
|
||||
|
||||
hvx_tanh_f32_aa(dst_local, src_local, row_elems);
|
||||
}
|
||||
}
|
||||
|
||||
static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * data) {
|
||||
const struct htp_unary_context * uctx = (const struct htp_unary_context *) data;
|
||||
struct htp_ops_context * octx = uctx->octx;
|
||||
@@ -402,6 +492,12 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
|
||||
case HTP_OP_UNARY_SOFTPLUS:
|
||||
softplus_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
|
||||
break;
|
||||
case HTP_OP_UNARY_TANH:
|
||||
tanh_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
|
||||
break;
|
||||
case HTP_OP_L2_NORM:
|
||||
l2_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -469,7 +565,12 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
|
||||
case HTP_OP_UNARY_SOFTPLUS:
|
||||
op_type = "softplus-f32";
|
||||
break;
|
||||
|
||||
case HTP_OP_UNARY_TANH:
|
||||
op_type = "tanh-f32";
|
||||
break;
|
||||
case HTP_OP_L2_NORM:
|
||||
op_type = "l2norm-f32";
|
||||
break;
|
||||
default:
|
||||
FARF(ERROR, "Unsupported unary Op %u\n", octx->op);
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
|
||||
@@ -647,19 +647,30 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_solve_tri(ggml_m
|
||||
return res;
|
||||
}
|
||||
|
||||
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext(ggml_metal_library_t lib, ggml_type tsrc0, ggml_type tsrc1, int nsg, int nxpsg, int r1ptg) {
|
||||
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext(ggml_metal_library_t lib, const ggml_tensor * op, int nsg, int nxpsg, int r1ptg) {
|
||||
char base[256];
|
||||
char name[256];
|
||||
|
||||
const ggml_type tsrc0 = op->src[0]->type;
|
||||
const ggml_type tsrc1 = op->src[1]->type;
|
||||
const int ne12 = op->src[1]->ne[2];
|
||||
const int r2 = ne12 / op->src[0]->ne[2];
|
||||
const int r3 = op->src[1]->ne[3] / op->src[0]->ne[3];
|
||||
|
||||
GGML_ASSERT(ne12 <= INT16_MAX && r2 <= INT16_MAX && r3 <= INT16_MAX);
|
||||
|
||||
snprintf(base, 256, "kernel_mul_mv_ext_%s_%s_r1_%d", ggml_type_name(tsrc0), ggml_type_name(tsrc1), r1ptg);
|
||||
snprintf(name, 256, "%s_nsg=%d_nxpsg=%d", base, nsg, nxpsg);
|
||||
snprintf(name, 256, "%s_nsg=%d_nxpsg=%d_ne12=%d_r2=%d_r3=%d", base, nsg, nxpsg, ne12, r2, r3);
|
||||
|
||||
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
|
||||
if (!res.pipeline) {
|
||||
ggml_metal_cv_t cv = ggml_metal_cv_init();
|
||||
|
||||
ggml_metal_cv_set_int16(cv, nsg, FC_MUL_MV + 0);
|
||||
ggml_metal_cv_set_int16(cv, nxpsg, FC_MUL_MV + 1);
|
||||
ggml_metal_cv_set_int16(cv, nsg, FC_MUL_MV + 0);
|
||||
ggml_metal_cv_set_int16(cv, nxpsg, FC_MUL_MV + 1);
|
||||
ggml_metal_cv_set_int16(cv, (int16_t) ne12, FC_MUL_MV + 2);
|
||||
ggml_metal_cv_set_int16(cv, (int16_t) r2, FC_MUL_MV + 3);
|
||||
ggml_metal_cv_set_int16(cv, (int16_t) r3, FC_MUL_MV + 4);
|
||||
|
||||
res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
|
||||
|
||||
@@ -687,8 +698,15 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm(ggml_meta
|
||||
? (op->ne[0] % NRA != 0 || op->ne[1] % NRB != 0)
|
||||
: (op->ne[0] % 64 != 0 || op->ne[1] % 32 != 0);
|
||||
|
||||
GGML_ASSERT(op->src[1]->ne[2] <= INT16_MAX && op->src[1]->ne[3] <= INT16_MAX);
|
||||
const int16_t ne12 = (int16_t) op->src[1]->ne[2];
|
||||
const int16_t ne13 = (int16_t) op->src[1]->ne[3];
|
||||
const int16_t r2 = (int16_t) (ne12 / op->src[0]->ne[2]);
|
||||
const int16_t r3 = (int16_t) (ne13 / op->src[0]->ne[3]);
|
||||
|
||||
snprintf(base, 256, "kernel_mul_mm_%s_%s", ggml_type_name(tsrc0), ggml_type_name(tsrc1));
|
||||
snprintf(name, 256, "%s_bci=%d_bco=%d", base, bc_inp, bc_out);
|
||||
snprintf(name, 256, "%s_bci=%d_bco=%d_ne12=%d_ne13=%d_r2=%d_r3=%d",
|
||||
base, bc_inp, bc_out, ne12, ne13, r2, r3);
|
||||
|
||||
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
|
||||
if (!res.pipeline) {
|
||||
@@ -696,6 +714,10 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm(ggml_meta
|
||||
|
||||
ggml_metal_cv_set_bool(cv, bc_inp, FC_MUL_MM + 0);
|
||||
ggml_metal_cv_set_bool(cv, bc_out, FC_MUL_MM + 1);
|
||||
ggml_metal_cv_set_int16(cv, ne12, FC_MUL_MM + 2);
|
||||
ggml_metal_cv_set_int16(cv, ne13, FC_MUL_MM + 3);
|
||||
ggml_metal_cv_set_int16(cv, r2, FC_MUL_MM + 4);
|
||||
ggml_metal_cv_set_int16(cv, r3, FC_MUL_MM + 5);
|
||||
|
||||
res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
|
||||
|
||||
@@ -877,14 +899,21 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv(ggml_meta
|
||||
}
|
||||
};
|
||||
|
||||
GGML_ASSERT(ne12 <= INT16_MAX && ne13 <= INT16_MAX);
|
||||
const int16_t r2 = (int16_t) (ne12 / ne02);
|
||||
const int16_t r3 = (int16_t) (ne13 / ne03);
|
||||
|
||||
snprintf(base, 256, "kernel_mul_mv_%s_%s%s", ggml_type_name(tsrc0), ggml_type_name(tsrc1), suffix);
|
||||
snprintf(name, 256, "%s_nsg=%d", base, nsg);
|
||||
snprintf(name, 256, "%s_nsg=%d_ne12=%d_r2=%d_r3=%d", base, nsg, ne12, r2, r3);
|
||||
|
||||
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
|
||||
if (!res.pipeline) {
|
||||
ggml_metal_cv_t cv = ggml_metal_cv_init();
|
||||
|
||||
ggml_metal_cv_set_int16(cv, nsg, FC_MUL_MV + 0);
|
||||
ggml_metal_cv_set_int16(cv, nsg, FC_MUL_MV + 0);
|
||||
ggml_metal_cv_set_int16(cv, (int16_t) ne12, FC_MUL_MV + 2);
|
||||
ggml_metal_cv_set_int16(cv, r2, FC_MUL_MV + 3);
|
||||
ggml_metal_cv_set_int16(cv, r3, FC_MUL_MV + 4);
|
||||
|
||||
res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
|
||||
|
||||
@@ -1102,6 +1131,9 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_id(ggml_m
|
||||
ggml_metal_cv_t cv = ggml_metal_cv_init();
|
||||
|
||||
ggml_metal_cv_set_int16(cv, nsg, FC_MUL_MV + 0);
|
||||
ggml_metal_cv_set_int16(cv, 1, FC_MUL_MV + 2);
|
||||
ggml_metal_cv_set_int16(cv, 1, FC_MUL_MV + 3);
|
||||
ggml_metal_cv_set_int16(cv, 1, FC_MUL_MV + 4);
|
||||
|
||||
res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
|
||||
|
||||
|
||||
@@ -129,7 +129,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_gated_delta_net (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_solve_tri (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext (ggml_metal_library_t lib, enum ggml_type tsrc0, enum ggml_type tsrc1, int nsg, int nxpsg, int r1ptg);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext (ggml_metal_library_t lib, const struct ggml_tensor * op, int nsg, int nxpsg, int r1ptg);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_id_map0 (ggml_metal_library_t lib, int ne02, int ne20);
|
||||
|
||||
@@ -2120,7 +2120,7 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
|
||||
GGML_ABORT("unsupported ne11");
|
||||
};
|
||||
|
||||
auto pipeline = ggml_metal_library_get_pipeline_mul_mv_ext(lib, op->src[0]->type, op->src[1]->type, nsg, nxpsg, r1ptg);
|
||||
auto pipeline = ggml_metal_library_get_pipeline_mul_mv_ext(lib, op, nsg, nxpsg, r1ptg);
|
||||
|
||||
ggml_metal_kargs_mul_mv_ext args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
|
||||
@@ -3353,6 +3353,9 @@ static inline void helper_mv_reduce_and_write(
|
||||
|
||||
constant short FC_mul_mv_nsg [[function_constant(FC_MUL_MV + 0)]];
|
||||
constant short FC_mul_mv_nxpsg [[function_constant(FC_MUL_MV + 1)]];
|
||||
constant short FC_mul_mv_ne12 [[function_constant(FC_MUL_MV + 2)]];
|
||||
constant short FC_mul_mv_r2 [[function_constant(FC_MUL_MV + 3)]];
|
||||
constant short FC_mul_mv_r3 [[function_constant(FC_MUL_MV + 4)]];
|
||||
|
||||
template<typename block_q_type, short NR0, typename args_t>
|
||||
void mul_vec_q_n_f32_impl(
|
||||
@@ -3376,10 +3379,10 @@ void mul_vec_q_n_f32_impl(
|
||||
const int r1 = tgpig.y;
|
||||
const int im = tgpig.z;
|
||||
|
||||
const uint i12 = im%args.ne12;
|
||||
const uint i13 = im/args.ne12;
|
||||
const uint i12 = im%FC_mul_mv_ne12;
|
||||
const uint i13 = im/FC_mul_mv_ne12;
|
||||
|
||||
//const uint64_t offset0 = r0*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
//const uint64_t offset0 = r0*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13;
|
||||
|
||||
//device const block_q_type * x = (device const block_q_type *) (src0 + offset0);
|
||||
@@ -3388,7 +3391,7 @@ void mul_vec_q_n_f32_impl(
|
||||
// pointers to src0 rows
|
||||
device const block_q_type * ax[NR0];
|
||||
FOR_UNROLL (int row = 0; row < NR0; ++row) {
|
||||
const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
|
||||
ax[row] = (device const block_q_type *) ((device char *) src0 + offset0);
|
||||
}
|
||||
@@ -3462,8 +3465,8 @@ void kernel_mul_mv_q1_0_f32_impl(
|
||||
|
||||
const int first_row = (r0 * NSG + sgitg) * nr0;
|
||||
|
||||
const uint i12 = im%args.ne12;
|
||||
const uint i13 = im/args.ne12;
|
||||
const uint i12 = im%FC_mul_mv_ne12;
|
||||
const uint i13 = im/FC_mul_mv_ne12;
|
||||
|
||||
const uint64_t offset1 = r1*args.nb11 + (i12)*args.nb12 + (i13)*args.nb13;
|
||||
|
||||
@@ -3471,7 +3474,7 @@ void kernel_mul_mv_q1_0_f32_impl(
|
||||
|
||||
device const block_q1_0 * ax[nr0];
|
||||
for (int row = 0; row < nr0; ++row) {
|
||||
const uint64_t offset0 = (first_row + row)*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = (first_row + row)*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
ax[row] = (device const block_q1_0 *) ((device char *) src0 + offset0);
|
||||
}
|
||||
|
||||
@@ -3590,10 +3593,10 @@ void kernel_mul_mv_q8_0_f32_impl(
|
||||
const int r1 = tgpig.y;
|
||||
const int im = tgpig.z;
|
||||
|
||||
const uint i12 = im%args.ne12;
|
||||
const uint i13 = im/args.ne12;
|
||||
const uint i12 = im%FC_mul_mv_ne12;
|
||||
const uint i13 = im/FC_mul_mv_ne12;
|
||||
|
||||
//const uint64_t offset0 = r0*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
//const uint64_t offset0 = r0*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13;
|
||||
|
||||
//device const block_q8_0 * x = (device const block_q8_0 *) (src0 + offset0);
|
||||
@@ -3602,7 +3605,7 @@ void kernel_mul_mv_q8_0_f32_impl(
|
||||
// pointers to src0 rows
|
||||
device const block_q8_0 * ax[NR0];
|
||||
FOR_UNROLL (short row = 0; row < NR0; ++row) {
|
||||
const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
|
||||
ax[row] = (device const block_q8_0 *) ((device char *) src0 + offset0);
|
||||
}
|
||||
@@ -3682,10 +3685,10 @@ void kernel_mul_mv_ext_q4_f32_impl(
|
||||
const int i11 = tgpig.y*r1ptg;
|
||||
const int i1m = tgpig.z;
|
||||
|
||||
const int i12 = i1m%args.ne12;
|
||||
const int i13 = i1m/args.ne12;
|
||||
const int i12 = i1m%FC_mul_mv_ne12;
|
||||
const int i13 = i1m/FC_mul_mv_ne12;
|
||||
|
||||
const uint64_t offset0 = i01*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = i01*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
const uint64_t offset1 = i11*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13;
|
||||
|
||||
device const q_t * xq = (i01 < args.ne01) ? (device const q_t *) (src0 + offset0) + tx/chpb : (device const q_t *) src0;
|
||||
@@ -3785,10 +3788,10 @@ void kernel_mul_mv_ext_q4x4_f32_impl(
|
||||
const int i11 = tgpig.y*r1ptg;
|
||||
const int i1m = tgpig.z;
|
||||
|
||||
const int i12 = i1m%args.ne12;
|
||||
const int i13 = i1m/args.ne12;
|
||||
const int i12 = i1m%FC_mul_mv_ne12;
|
||||
const int i13 = i1m/FC_mul_mv_ne12;
|
||||
|
||||
const uint64_t offset0 = i01*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = i01*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
const uint64_t offset1 = i11*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13;
|
||||
|
||||
device const q_t * xq = (i01 < args.ne01) ? (device const q_t *) (src0 + offset0) + tx/chpb : (device const q_t *) src0;
|
||||
@@ -4000,10 +4003,10 @@ void kernel_mul_mv_t_t_impl(
|
||||
const int r1 = tgpig.y;
|
||||
const int im = tgpig.z;
|
||||
|
||||
const uint i12 = im%args.ne12;
|
||||
const uint i13 = im/args.ne12;
|
||||
const uint i12 = im%FC_mul_mv_ne12;
|
||||
const uint i13 = im/FC_mul_mv_ne12;
|
||||
|
||||
//const uint64_t offset0 = r0*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
//const uint64_t offset0 = r0*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13;
|
||||
|
||||
//device const T0 * x = (device const T0 *) (src0 + offset0);
|
||||
@@ -4012,7 +4015,7 @@ void kernel_mul_mv_t_t_impl(
|
||||
// pointers to src0 rows
|
||||
device const T0 * ax [NR0];
|
||||
FOR_UNROLL (short row = 0; row < NR0; ++row) {
|
||||
const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
|
||||
ax[row] = (device const T0 *) ((device char *) src0 + offset0);
|
||||
}
|
||||
@@ -4122,10 +4125,10 @@ void kernel_mul_mv_t_t_4_impl(
|
||||
const int r1 = tgpig.y;
|
||||
const int im = tgpig.z;
|
||||
|
||||
const uint i12 = im%args.ne12;
|
||||
const uint i13 = im/args.ne12;
|
||||
const uint i12 = im%FC_mul_mv_ne12;
|
||||
const uint i13 = im/FC_mul_mv_ne12;
|
||||
|
||||
//const uint64_t offset0 = r0*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
//const uint64_t offset0 = r0*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13;
|
||||
|
||||
device const T1 * y = (device const T1 *) (src1 + offset1);
|
||||
@@ -4135,7 +4138,7 @@ void kernel_mul_mv_t_t_4_impl(
|
||||
device const T0 * ax [NR0];
|
||||
device const T04 * ax4[NR0];
|
||||
FOR_UNROLL (short row = 0; row < NR0; ++row) {
|
||||
const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
|
||||
ax [row] = (device const T0 *) ((device char *) src0 + offset0);
|
||||
ax4[row] = (device const T04 *) ((device char *) src0 + offset0);
|
||||
@@ -4239,10 +4242,10 @@ void kernel_mul_mv_t_t_short_impl(
|
||||
return;
|
||||
}
|
||||
|
||||
const uint i12 = im%args.ne12;
|
||||
const uint i13 = im/args.ne12;
|
||||
const uint i12 = im%FC_mul_mv_ne12;
|
||||
const uint i13 = im/FC_mul_mv_ne12;
|
||||
|
||||
const uint64_t offset0 = r0*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = r0*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
|
||||
device const T0 * x = (device const T0 *) (src0 + offset0);
|
||||
|
||||
@@ -7462,10 +7465,10 @@ void kernel_mul_mv_q2_K_f32_impl(
|
||||
|
||||
const int first_row = (r0 * NSG + sgitg) * nr0;
|
||||
|
||||
const uint i12 = im%args.ne12;
|
||||
const uint i13 = im/args.ne12;
|
||||
const uint i12 = im%FC_mul_mv_ne12;
|
||||
const uint i13 = im/FC_mul_mv_ne12;
|
||||
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13;
|
||||
|
||||
device const block_q2_K * x = (device const block_q2_K *) (src0 + offset0);
|
||||
@@ -7567,10 +7570,10 @@ void kernel_mul_mv_q3_K_f32_impl(
|
||||
|
||||
const int first_row = (r0 * NSG + sgitg) * nr0;
|
||||
|
||||
const uint i12 = im%args.ne12;
|
||||
const uint i13 = im/args.ne12;
|
||||
const uint i12 = im%FC_mul_mv_ne12;
|
||||
const uint i13 = im/FC_mul_mv_ne12;
|
||||
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13;
|
||||
|
||||
device const block_q3_K * x = (device const block_q3_K *) (src0 + offset0);
|
||||
@@ -7741,10 +7744,10 @@ void kernel_mul_mv_q4_K_f32_impl(
|
||||
|
||||
const int first_row = (r0 * NSG + sgitg) * nr0;
|
||||
|
||||
const uint i12 = im%args.ne12;
|
||||
const uint i13 = im/args.ne12;
|
||||
const uint i12 = im%FC_mul_mv_ne12;
|
||||
const uint i13 = im/FC_mul_mv_ne12;
|
||||
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13;
|
||||
|
||||
device const block_q4_K * x = (device const block_q4_K *) (src0 + offset0);
|
||||
@@ -7853,10 +7856,10 @@ void kernel_mul_mv_q5_K_f32_impl(
|
||||
|
||||
const int first_row = (r0 * NSG + sgitg) * nr0;
|
||||
|
||||
const uint i12 = im%args.ne12;
|
||||
const uint i13 = im/args.ne12;
|
||||
const uint i12 = im%FC_mul_mv_ne12;
|
||||
const uint i13 = im/FC_mul_mv_ne12;
|
||||
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13;
|
||||
|
||||
device const block_q5_K * x = (device const block_q5_K *) (src0 + offset0);
|
||||
@@ -7989,10 +7992,10 @@ void kernel_mul_mv_q6_K_f32_impl(
|
||||
|
||||
const int first_row = (r0 * NSG + sgitg) * nr0;
|
||||
|
||||
const uint i12 = im%args.ne12;
|
||||
const uint i13 = im/args.ne12;
|
||||
const uint i12 = im%FC_mul_mv_ne12;
|
||||
const uint i13 = im/FC_mul_mv_ne12;
|
||||
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13;
|
||||
|
||||
device const block_q6_K * x = (device const block_q6_K *) (src0 + offset0);
|
||||
@@ -8094,10 +8097,10 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
|
||||
|
||||
const int first_row = (r0 * NSG + sgitg) * nr0;
|
||||
|
||||
const uint i12 = im%args.ne12;
|
||||
const uint i13 = im/args.ne12;
|
||||
const uint i12 = im%FC_mul_mv_ne12;
|
||||
const uint i13 = im/FC_mul_mv_ne12;
|
||||
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13;
|
||||
|
||||
device const block_iq2_xxs * x = (device const block_iq2_xxs *) (src0 + offset0);
|
||||
@@ -8202,10 +8205,10 @@ void kernel_mul_mv_iq2_xs_f32_impl(
|
||||
|
||||
const int first_row = (r0 * NSG + sgitg) * nr0;
|
||||
|
||||
const uint i12 = im%args.ne12;
|
||||
const uint i13 = im/args.ne12;
|
||||
const uint i12 = im%FC_mul_mv_ne12;
|
||||
const uint i13 = im/FC_mul_mv_ne12;
|
||||
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13;
|
||||
|
||||
device const block_iq2_xs * x = (device const block_iq2_xs *) (src0 + offset0);
|
||||
@@ -8321,10 +8324,10 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
|
||||
|
||||
const int first_row = (r0 * NSG + sgitg) * nr0;
|
||||
|
||||
const uint i12 = im%args.ne12;
|
||||
const uint i13 = im/args.ne12;
|
||||
const uint i12 = im%FC_mul_mv_ne12;
|
||||
const uint i13 = im/FC_mul_mv_ne12;
|
||||
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13;
|
||||
|
||||
device const block_iq3_xxs * x = (device const block_iq3_xxs *) (src0 + offset0);
|
||||
@@ -8433,10 +8436,10 @@ void kernel_mul_mv_iq3_s_f32_impl(
|
||||
|
||||
const int first_row = (r0 * NSG + sgitg) * nr0;
|
||||
|
||||
const uint i12 = im%args.ne12;
|
||||
const uint i13 = im/args.ne12;
|
||||
const uint i12 = im%FC_mul_mv_ne12;
|
||||
const uint i13 = im/FC_mul_mv_ne12;
|
||||
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13;
|
||||
|
||||
device const block_iq3_s * x = (device const block_iq3_s *) (src0 + offset0);
|
||||
@@ -8545,10 +8548,10 @@ void kernel_mul_mv_iq2_s_f32_impl(
|
||||
|
||||
const int first_row = (r0 * NSG + sgitg) * nr0;
|
||||
|
||||
const uint i12 = im%args.ne12;
|
||||
const uint i13 = im/args.ne12;
|
||||
const uint i12 = im%FC_mul_mv_ne12;
|
||||
const uint i13 = im/FC_mul_mv_ne12;
|
||||
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13;
|
||||
|
||||
device const block_iq2_s * x = (device const block_iq2_s *) (src0 + offset0);
|
||||
@@ -8658,10 +8661,10 @@ void kernel_mul_mv_iq1_s_f32_impl(
|
||||
|
||||
const int first_row = (r0 * NSG + sgitg) * nr0;
|
||||
|
||||
const uint i12 = im%args.ne12;
|
||||
const uint i13 = im/args.ne12;
|
||||
const uint i12 = im%FC_mul_mv_ne12;
|
||||
const uint i13 = im/FC_mul_mv_ne12;
|
||||
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13;
|
||||
|
||||
device const block_iq1_s * x = (device const block_iq1_s *) (src0 + offset0);
|
||||
@@ -8757,10 +8760,10 @@ void kernel_mul_mv_iq1_m_f32_impl(
|
||||
|
||||
const int first_row = (r0 * NSG + sgitg) * nr0;
|
||||
|
||||
const uint i12 = im%args.ne12;
|
||||
const uint i13 = im/args.ne12;
|
||||
const uint i12 = im%FC_mul_mv_ne12;
|
||||
const uint i13 = im/FC_mul_mv_ne12;
|
||||
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13;
|
||||
|
||||
device const block_iq1_m * x = (device const block_iq1_m *) (src0 + offset0);
|
||||
@@ -8866,10 +8869,10 @@ void kernel_mul_mv_iq4_nl_f32_impl(
|
||||
|
||||
const int first_row = (r0 * NSG + sgitg) * NR0;
|
||||
|
||||
const uint i12 = im%args.ne12;
|
||||
const uint i13 = im/args.ne12;
|
||||
const uint i12 = im%FC_mul_mv_ne12;
|
||||
const uint i13 = im/FC_mul_mv_ne12;
|
||||
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13;
|
||||
|
||||
device const block_iq4_nl * x = (device const block_iq4_nl *) (src0 + offset0);
|
||||
@@ -8975,10 +8978,10 @@ void kernel_mul_mv_iq4_xs_f32_impl(
|
||||
const int im = tgpig.z;
|
||||
const int first_row = (r0 * NSG + sgitg) * NR0;
|
||||
|
||||
const uint i12 = im%args.ne12;
|
||||
const uint i13 = im/args.ne12;
|
||||
const uint i12 = im%FC_mul_mv_ne12;
|
||||
const uint i13 = im/FC_mul_mv_ne12;
|
||||
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13;
|
||||
|
||||
device const block_iq4_xs * x = (device const block_iq4_xs *) (src0 + offset0);
|
||||
@@ -9086,10 +9089,10 @@ void kernel_mul_mv_mxfp4_f32_impl(
|
||||
|
||||
const int first_row = (r0 * NSG + sgitg) * NR0;
|
||||
|
||||
const uint i12 = im%args.ne12;
|
||||
const uint i13 = im/args.ne12;
|
||||
const uint i12 = im%FC_mul_mv_ne12;
|
||||
const uint i13 = im/FC_mul_mv_ne12;
|
||||
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
|
||||
const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13;
|
||||
|
||||
device const block_mxfp4 * x = (device const block_mxfp4 *) (src0 + offset0);
|
||||
@@ -9304,6 +9307,10 @@ kernel void kernel_diag_f32(
|
||||
|
||||
constant bool FC_mul_mm_bc_inp [[function_constant(FC_MUL_MM + 0)]];
|
||||
constant bool FC_mul_mm_bc_out [[function_constant(FC_MUL_MM + 1)]];
|
||||
constant short FC_mul_mm_ne12 [[function_constant(FC_MUL_MM + 2)]];
|
||||
constant short FC_mul_mm_ne13 [[function_constant(FC_MUL_MM + 3)]];
|
||||
constant short FC_mul_mm_r2 [[function_constant(FC_MUL_MM + 4)]];
|
||||
constant short FC_mul_mm_r3 [[function_constant(FC_MUL_MM + 5)]];
|
||||
|
||||
// each block_q contains 16*nl weights
|
||||
#ifdef GGML_METAL_HAS_TENSOR
|
||||
@@ -9330,11 +9337,11 @@ kernel void kernel_mul_mm(
|
||||
|
||||
// Batch dimension handling
|
||||
const int im = tgpig.z;
|
||||
const int i12 = im % args.ne12;
|
||||
const int i13 = im / args.ne12;
|
||||
const int i12 = im % FC_mul_mm_ne12;
|
||||
const int i13 = im / FC_mul_mm_ne12;
|
||||
|
||||
// Batch offsets for srcA and srcB
|
||||
const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = (i12/FC_mul_mm_r2)*args.nb02 + (i13/FC_mul_mm_r3)*args.nb03;
|
||||
|
||||
// Tile dimensions
|
||||
constexpr int NRB = SZ_SIMDGROUP * N_MM_BLOCK_X * N_MM_SIMD_GROUP_X;
|
||||
@@ -9473,10 +9480,10 @@ kernel void kernel_mul_mm(
|
||||
|
||||
short il = il0;
|
||||
|
||||
const int i12 = im%args.ne12;
|
||||
const int i13 = im/args.ne12;
|
||||
const int i12 = im % FC_mul_mm_ne12;
|
||||
const int i13 = im / FC_mul_mm_ne12;
|
||||
|
||||
const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const uint64_t offset0 = (i12/FC_mul_mm_r2)*args.nb02 + (i13/FC_mul_mm_r3)*args.nb03;
|
||||
const short offset1 = il0/nl;
|
||||
|
||||
device const block_q * x = (device const block_q *)(src0 + args.nb01*(r0 + lr0) + offset0) + offset1;
|
||||
|
||||
@@ -104,6 +104,12 @@ set(GGML_OPENCL_KERNELS
|
||||
mul_mv_id_mxfp4_f32_flat
|
||||
gemm_moe_q4_0_f32_ns
|
||||
gemv_moe_q4_0_f32_ns
|
||||
gemm_moe_q4_1_f32_ns
|
||||
gemv_moe_q4_1_f32_ns
|
||||
gemm_moe_q5_0_f32_ns
|
||||
gemv_moe_q5_0_f32_ns
|
||||
gemm_moe_q5_1_f32_ns
|
||||
gemv_moe_q5_1_f32_ns
|
||||
gemm_moe_mxfp4_f32
|
||||
gemv_moe_mxfp4_f32
|
||||
gemm_moe_mxfp4_f32_ns
|
||||
@@ -174,6 +180,10 @@ set(GGML_OPENCL_KERNELS
|
||||
flash_attn_f32
|
||||
)
|
||||
|
||||
if (GGML_OPENCL_USE_ADRENO_KERNELS)
|
||||
list(APPEND GGML_OPENCL_KERNELS gemm_xmem_f16_f32_os8)
|
||||
endif ()
|
||||
|
||||
foreach (K ${GGML_OPENCL_KERNELS})
|
||||
ggml_opencl_add_kernel(${K})
|
||||
endforeach()
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -56,6 +56,25 @@ struct block_q4_1 {
|
||||
uchar qs[QK4_1 / 2]; // nibbles / quants
|
||||
};
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// block_q5_0
|
||||
//------------------------------------------------------------------------------
|
||||
struct block_q5_0 {
|
||||
half d; // delta
|
||||
uchar qh[4]; // 5-th bit of quants
|
||||
uchar qs[QK5_0 / 2]; // nibbles / quants
|
||||
};
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// block_q5_1
|
||||
//------------------------------------------------------------------------------
|
||||
struct block_q5_1 {
|
||||
half d; // delta
|
||||
half m; // min
|
||||
uchar qh[4]; // 5-th bit of quants
|
||||
uchar qs[QK5_1 / 2]; // nibbles / quants
|
||||
};
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// block_q4_k
|
||||
//------------------------------------------------------------------------------
|
||||
@@ -370,6 +389,281 @@ kernel void kernel_restore_block_q4_1_noshuffle(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_convert_block_q4_1_trans4_ns(
|
||||
__global struct block_q4_1 * src0,
|
||||
__global uint * dst_q,
|
||||
__global half * dst_d,
|
||||
__global half * dst_m,
|
||||
uint ne00,
|
||||
uint ne01
|
||||
) {
|
||||
uint i00 = get_global_id(1);
|
||||
uint i01 = get_global_id(0);
|
||||
uint i02 = get_global_id(2);
|
||||
|
||||
uint ne00_blk = ne00 / QK4_1;
|
||||
uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
||||
uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
||||
|
||||
global struct block_q4_1 * b = src0 + src_blk_offset;
|
||||
dst_d[dst_blk_offset] = b->d;
|
||||
dst_m[dst_blk_offset] = b->m;
|
||||
|
||||
// extract quantization and unshuffle
|
||||
ushort8 pre_block = ((global ushort8 *)(&(b->qs[0])))[0];
|
||||
|
||||
ushort8 post_block = (ushort8)(0);
|
||||
|
||||
uchar * pre_block_ptr = (uchar *)(&pre_block);
|
||||
uchar * post_block_ptr = (uchar *)(&post_block);
|
||||
|
||||
for (int i = 0; i < QK4_1 / 4; ++i) {
|
||||
uchar x0 = pre_block_ptr[2*i + 0];
|
||||
uchar x1 = pre_block_ptr[2*i + 1];
|
||||
|
||||
post_block_ptr[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
|
||||
post_block_ptr[i + QK4_1 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
|
||||
}
|
||||
|
||||
uint4 q_block = as_uint4(post_block);
|
||||
|
||||
uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
|
||||
dst_q[offset] = q_block.x;
|
||||
dst_q[offset + ne01] = q_block.y;
|
||||
dst_q[offset + ne01 * 2] = q_block.z;
|
||||
dst_q[offset + ne01 * 3] = q_block.w;
|
||||
}
|
||||
|
||||
kernel void kernel_restore_block_q4_1_trans4_ns(
|
||||
__global uint * src_q,
|
||||
__global half * src_d,
|
||||
__global half * src_m,
|
||||
__global struct block_q4_1 * dst0,
|
||||
uint ne00,
|
||||
uint ne01
|
||||
) {
|
||||
int i00 = get_global_id(1);
|
||||
uint i01 = get_global_id(0);
|
||||
uint i02 = get_global_id(2);
|
||||
|
||||
uint ne00_blk = ne00 / QK4_1;
|
||||
uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
||||
uint src_dm_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
||||
|
||||
__global struct block_q4_1 * b = dst0 + dst_blk_offset;
|
||||
b->d = src_d[src_dm_offset];
|
||||
b->m = src_m[src_dm_offset];
|
||||
|
||||
// collect transposed quantization parts for a block
|
||||
uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
|
||||
uint4 q_block;
|
||||
q_block.x = src_q[src_q_offset];
|
||||
q_block.y = src_q[src_q_offset + ne01];
|
||||
q_block.z = src_q[src_q_offset + ne01 * 2];
|
||||
q_block.w = src_q[src_q_offset + ne01 * 3];
|
||||
|
||||
ushort8 post_block = as_ushort8(q_block);
|
||||
ushort8 pre_block = (ushort8)(0);
|
||||
|
||||
uchar * pre_block_ptr = (uchar *)(&pre_block);
|
||||
uchar * post_block_ptr = (uchar *)(&post_block);
|
||||
|
||||
for (int i = 0; i < QK4_0 / 4; ++i) {
|
||||
uchar x0 = post_block_ptr[i + 0];
|
||||
uchar x1 = post_block_ptr[i + QK4_0 / 4];
|
||||
|
||||
pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
|
||||
pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
|
||||
}
|
||||
|
||||
((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
|
||||
}
|
||||
|
||||
kernel void kernel_convert_block_q5_0_trans4_ns(
|
||||
__global struct block_q5_0 * src0,
|
||||
__global uint * dst_qs,
|
||||
__global uint * dst_qh,
|
||||
__global half * dst_d,
|
||||
uint ne00,
|
||||
uint ne01
|
||||
) {
|
||||
uint i00 = get_global_id(1);
|
||||
uint i01 = get_global_id(0);
|
||||
uint i02 = get_global_id(2);
|
||||
|
||||
uint ne00_blk = ne00 / QK5_0;
|
||||
uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
||||
uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
||||
|
||||
global struct block_q5_0 * b = src0 + src_blk_offset;
|
||||
dst_d[dst_blk_offset] = b->d;
|
||||
|
||||
dst_qh[dst_blk_offset] = ((global uint *)(&(b->qh[0])))[0];
|
||||
|
||||
// extract quantization and unshuffle
|
||||
ushort8 pre_block = ((global ushort8 *)(&(b->qs[0])))[0];
|
||||
ushort8 post_block = (ushort8)(0);
|
||||
|
||||
uchar * pre_block_ptr = (uchar *)(&pre_block);
|
||||
uchar * post_block_ptr = (uchar *)(&post_block);
|
||||
|
||||
for (int i = 0; i < QK5_0 / 4; ++i) {
|
||||
uchar x0 = pre_block_ptr[2*i + 0];
|
||||
uchar x1 = pre_block_ptr[2*i + 1];
|
||||
|
||||
post_block_ptr[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
|
||||
post_block_ptr[i + QK5_0 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
|
||||
}
|
||||
|
||||
uint4 q_block = as_uint4(post_block);
|
||||
|
||||
uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
|
||||
dst_qs[offset] = q_block.x;
|
||||
dst_qs[offset + ne01] = q_block.y;
|
||||
dst_qs[offset + ne01 * 2] = q_block.z;
|
||||
dst_qs[offset + ne01 * 3] = q_block.w;
|
||||
}
|
||||
|
||||
kernel void kernel_restore_block_q5_0_trans4_ns(
|
||||
__global uint * src_qs,
|
||||
__global uint * src_qh,
|
||||
__global half * src_d,
|
||||
__global struct block_q5_0 * dst0,
|
||||
uint ne00,
|
||||
uint ne01
|
||||
) {
|
||||
int i00 = get_global_id(1);
|
||||
uint i01 = get_global_id(0);
|
||||
uint i02 = get_global_id(2);
|
||||
|
||||
uint ne00_blk = ne00 / QK5_0;
|
||||
uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
||||
uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
||||
|
||||
__global struct block_q5_0 * b = dst0 + dst_blk_offset;
|
||||
b->d = src_d[src_blk_offset];
|
||||
|
||||
((__global uint *)(&(b->qh[0])))[0] = src_qh[src_blk_offset];
|
||||
|
||||
// collect transposed quantization parts for a block
|
||||
uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
|
||||
uint4 q_block;
|
||||
q_block.x = src_qs[src_q_offset];
|
||||
q_block.y = src_qs[src_q_offset + ne01];
|
||||
q_block.z = src_qs[src_q_offset + ne01 * 2];
|
||||
q_block.w = src_qs[src_q_offset + ne01 * 3];
|
||||
|
||||
ushort8 post_block = as_ushort8(q_block);
|
||||
ushort8 pre_block = (ushort8)(0);
|
||||
|
||||
uchar * pre_block_ptr = (uchar *)(&pre_block);
|
||||
uchar * post_block_ptr = (uchar *)(&post_block);
|
||||
|
||||
for (int i = 0; i < QK5_0 / 4; ++i) {
|
||||
uchar x0 = post_block_ptr[i + 0];
|
||||
uchar x1 = post_block_ptr[i + QK5_0 / 4];
|
||||
|
||||
pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
|
||||
pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
|
||||
}
|
||||
|
||||
((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
|
||||
}
|
||||
|
||||
kernel void kernel_convert_block_q5_1_trans4_ns(
|
||||
__global struct block_q5_1 * src0,
|
||||
__global uint * dst_qs,
|
||||
__global uint * dst_qh,
|
||||
__global half * dst_d,
|
||||
__global half * dst_m,
|
||||
uint ne00,
|
||||
uint ne01
|
||||
) {
|
||||
uint i00 = get_global_id(1);
|
||||
uint i01 = get_global_id(0);
|
||||
uint i02 = get_global_id(2);
|
||||
|
||||
uint ne00_blk = ne00 / QK5_1;
|
||||
uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
||||
uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
||||
|
||||
global struct block_q5_1 * b = src0 + src_blk_offset;
|
||||
dst_d[dst_blk_offset] = b->d;
|
||||
dst_m[dst_blk_offset] = b->m;
|
||||
|
||||
dst_qh[dst_blk_offset] = ((global uint *)(&(b->qh[0])))[0];
|
||||
|
||||
// extract quantization and unshuffle
|
||||
ushort8 pre_block = ((global ushort8 *)(&(b->qs[0])))[0];
|
||||
ushort8 post_block = (ushort8)(0);
|
||||
|
||||
uchar * pre_block_ptr = (uchar *)(&pre_block);
|
||||
uchar * post_block_ptr = (uchar *)(&post_block);
|
||||
|
||||
for (int i = 0; i < QK5_1 / 4; ++i) {
|
||||
uchar x0 = pre_block_ptr[2*i + 0];
|
||||
uchar x1 = pre_block_ptr[2*i + 1];
|
||||
|
||||
post_block_ptr[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
|
||||
post_block_ptr[i + QK5_1 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
|
||||
}
|
||||
|
||||
uint4 q_block = as_uint4(post_block);
|
||||
|
||||
uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
|
||||
dst_qs[offset] = q_block.x;
|
||||
dst_qs[offset + ne01] = q_block.y;
|
||||
dst_qs[offset + ne01 * 2] = q_block.z;
|
||||
dst_qs[offset + ne01 * 3] = q_block.w;
|
||||
}
|
||||
|
||||
kernel void kernel_restore_block_q5_1_trans4_ns(
|
||||
__global uint * src_qs,
|
||||
__global uint * src_qh,
|
||||
__global half * src_d,
|
||||
__global half * src_m,
|
||||
__global struct block_q5_1 * dst0,
|
||||
uint ne00,
|
||||
uint ne01
|
||||
) {
|
||||
int i00 = get_global_id(1);
|
||||
uint i01 = get_global_id(0);
|
||||
uint i02 = get_global_id(2);
|
||||
|
||||
uint ne00_blk = ne00 / QK5_1;
|
||||
uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
||||
uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
||||
|
||||
__global struct block_q5_1 * b = dst0 + dst_blk_offset;
|
||||
b->d = src_d[src_blk_offset];
|
||||
b->m = src_m[src_blk_offset];
|
||||
|
||||
((__global uint *)(&(b->qh[0])))[0] = src_qh[src_blk_offset];
|
||||
|
||||
// collect transposed quantization parts for a block
|
||||
uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
|
||||
uint4 q_block;
|
||||
q_block.x = src_qs[src_q_offset];
|
||||
q_block.y = src_qs[src_q_offset + ne01];
|
||||
q_block.z = src_qs[src_q_offset + ne01 * 2];
|
||||
q_block.w = src_qs[src_q_offset + ne01 * 3];
|
||||
|
||||
ushort8 post_block = as_ushort8(q_block);
|
||||
ushort8 pre_block = (ushort8)(0);
|
||||
|
||||
uchar * pre_block_ptr = (uchar *)(&pre_block);
|
||||
uchar * post_block_ptr = (uchar *)(&post_block);
|
||||
|
||||
for (int i = 0; i < QK5_1 / 4; ++i) {
|
||||
uchar x0 = post_block_ptr[i + 0];
|
||||
uchar x1 = post_block_ptr[i + QK5_1 / 4];
|
||||
|
||||
pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
|
||||
pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
|
||||
}
|
||||
((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// block_mxfp4
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
254
ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl
Normal file
254
ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl
Normal file
@@ -0,0 +1,254 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
|
||||
|
||||
#define TILESIZE_K 16
|
||||
#define TILESIZE_M 64
|
||||
#define TILESIZE_N 32
|
||||
|
||||
|
||||
#define dequantize_q4_1(q4, a_f16, scale, m) \
|
||||
a_f16.s0 = (half)(q4.s0 & 0x000F) * scale + m; \
|
||||
a_f16.s1 = (half)((q4.s0 & 0x00F0) >> 4) * scale + m; \
|
||||
a_f16.s2 = (half)((q4.s0 & 0x0F00) >> 8) * scale + m; \
|
||||
a_f16.s3 = (half)((q4.s0 & 0xF000) >> 12) * scale + m; \
|
||||
a_f16.s4 = (half)(q4.s1 & 0x000F) * scale + m; \
|
||||
a_f16.s5 = (half)((q4.s1 & 0x00F0) >> 4) * scale + m; \
|
||||
a_f16.s6 = (half)((q4.s1 & 0x0F00) >> 8) * scale + m; \
|
||||
a_f16.s7 = (half)((q4.s1 & 0xF000) >> 12) * scale + m; \
|
||||
a_f16.s8 = (half)(q4.s2 & 0x000F) * scale + m; \
|
||||
a_f16.s9 = (half)((q4.s2 & 0x00F0) >> 4) * scale + m; \
|
||||
a_f16.sa = (half)((q4.s2 & 0x0F00) >> 8) * scale + m; \
|
||||
a_f16.sb = (half)((q4.s2 & 0xF000) >> 12) * scale + m; \
|
||||
a_f16.sc = (half)(q4.s3 & 0x000F) * scale + m; \
|
||||
a_f16.sd = (half)((q4.s3 & 0x00F0) >> 4) * scale + m; \
|
||||
a_f16.se = (half)((q4.s3 & 0x0F00) >> 8) * scale + m; \
|
||||
a_f16.sf = (half)((q4.s3 & 0xF000) >> 12) * scale + m; \
|
||||
|
||||
|
||||
#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \
|
||||
acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \
|
||||
acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \
|
||||
acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \
|
||||
acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \
|
||||
acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \
|
||||
acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \
|
||||
acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \
|
||||
acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \
|
||||
acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \
|
||||
acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \
|
||||
acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \
|
||||
acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \
|
||||
acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \
|
||||
acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \
|
||||
acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \
|
||||
acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \
|
||||
acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \
|
||||
acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \
|
||||
acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \
|
||||
acc.s3 += dot(a_reg.s4567, b_lm[lm_offset + 35]); \
|
||||
acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \
|
||||
acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \
|
||||
acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \
|
||||
acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \
|
||||
acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \
|
||||
acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \
|
||||
acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \
|
||||
acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \
|
||||
acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \
|
||||
acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \
|
||||
acc.se += dot(a_reg.s4567, b_lm[lm_offset + 46]); \
|
||||
acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \
|
||||
c_reg.lo += convert_float8(acc.lo); \
|
||||
c_reg.hi += convert_float8(acc.hi); \
|
||||
acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \
|
||||
acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \
|
||||
acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \
|
||||
acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \
|
||||
acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \
|
||||
acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \
|
||||
acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \
|
||||
acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \
|
||||
acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \
|
||||
acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \
|
||||
acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \
|
||||
acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \
|
||||
acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); \
|
||||
acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \
|
||||
acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \
|
||||
acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \
|
||||
acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \
|
||||
acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \
|
||||
acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \
|
||||
acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \
|
||||
acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \
|
||||
acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \
|
||||
acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); \
|
||||
acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \
|
||||
acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \
|
||||
acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \
|
||||
acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \
|
||||
acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \
|
||||
acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \
|
||||
acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \
|
||||
acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \
|
||||
acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \
|
||||
c_reg.lo += convert_float8(acc.lo); \
|
||||
c_reg.hi += convert_float8(acc.hi); \
|
||||
|
||||
|
||||
__attribute__((qcom_wave_pair_mode(1))) // 1=force single 2=force pair
|
||||
kernel void kernel_gemm_moe_q4_1_f32_ns(
|
||||
__read_only image1d_buffer_t src0_q,
|
||||
__global half * src0_d,
|
||||
__global half * src0_m,
|
||||
__read_only image1d_buffer_t src1,
|
||||
__global uint * src2,
|
||||
__global ushort * src2_emap,
|
||||
__write_only image1d_buffer_t dst,
|
||||
__global int * total_tiles,
|
||||
uint ne00,
|
||||
uint ne01
|
||||
) {
|
||||
uint block_id_m = get_global_id(1); // m_tile
|
||||
uint block_id_n = get_global_id(2); // n_tile
|
||||
|
||||
// Boundary check
|
||||
if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
|
||||
return;
|
||||
}
|
||||
|
||||
__private half16 reg_a;
|
||||
__private float32 reg_c = (float32)(0);
|
||||
__local half4 shared_b[128];
|
||||
|
||||
const ushort expert_id = src2_emap[block_id_n];
|
||||
|
||||
const uint row = block_id_m * TILESIZE_M;
|
||||
const uint col = block_id_n * TILESIZE_N;
|
||||
|
||||
uint sub_block_id_m = get_local_id(0);
|
||||
uint2 b_global_offset;
|
||||
b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00;
|
||||
b_global_offset.y = b_global_offset.x + (16 * ne00);
|
||||
uint2 b_local_offset;
|
||||
b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2);
|
||||
b_local_offset.y = b_local_offset.x + 16;
|
||||
|
||||
// Loop along K axis, 32 elements (one block) for each iteration, divided into 2 sub-blocks
|
||||
for (uint step = 0; step < ne00; step += TILESIZE_K * 2) {
|
||||
// First sub-block
|
||||
uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
|
||||
uint s_sub_offset = row + ((ne01 * step) >> 5) + ((expert_id * ne00 * ne01) >> 5);
|
||||
uint b_sub_offset = col * ne00 + step;
|
||||
|
||||
// Load scale and m for current Q4_1 block
|
||||
uint sm_offset = s_sub_offset + get_global_id(0);
|
||||
half s = src0_d[sm_offset];
|
||||
half m = src0_m[sm_offset];
|
||||
|
||||
// Load 16 q (64-bits) in transposed layout
|
||||
uint2 q4x16;
|
||||
q4x16.x = read_imageui(src0_q, q_sub_offset + sub_block_id_m).x;
|
||||
q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x;
|
||||
|
||||
// Load 16x32 floats from matrix B, each fiber out of 64 in a sub-group loads 8 elements
|
||||
float8 bx8_f32;
|
||||
bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
|
||||
bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
|
||||
// Convert to half and store to LM to share within the subgroup
|
||||
half8 bx8_f16 = convert_half8(bx8_f32);
|
||||
shared_b[b_local_offset.x] = bx8_f16.lo;
|
||||
shared_b[b_local_offset.y] = bx8_f16.hi;
|
||||
|
||||
// Dequantization
|
||||
dequantize_q4_1(as_ushort4(q4x16), reg_a, s, m);
|
||||
|
||||
sub_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// 32 16x16 fp16 dot product with 8 elements reduction for better precision
|
||||
half16 acc;
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
|
||||
|
||||
// Repeat for second sub-block
|
||||
uint half_step = step + TILESIZE_K;
|
||||
q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
|
||||
b_sub_offset = col * ne00 + half_step;
|
||||
|
||||
// Load next 16 q (64-bits) in transposed layout
|
||||
q4x16.x = read_imageui(src0_q, q_sub_offset + sub_block_id_m).x;
|
||||
q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x;
|
||||
|
||||
// Load 16x32 floats from matrix B, each fiber out of 64 in a sub-group loads 8 elements
|
||||
bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
|
||||
bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
|
||||
// Convert to half and store to LM to share within the subgroup
|
||||
bx8_f16 = convert_half8(bx8_f32);
|
||||
shared_b[b_local_offset.x] = bx8_f16.lo;
|
||||
shared_b[b_local_offset.y] = bx8_f16.hi;
|
||||
|
||||
// Dequantization
|
||||
dequantize_q4_1(as_ushort4(q4x16), reg_a, s, m);
|
||||
|
||||
sub_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// 32 16x16 fp16 dot product with 3-levels reduction for better precision
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
|
||||
}
|
||||
|
||||
// Load poster router and share in LM
|
||||
__local uint out_idx[TILESIZE_N];
|
||||
|
||||
if (get_local_id(0) < TILESIZE_N) {
|
||||
uint idx = src2[block_id_n * TILESIZE_N + get_local_id(0)];
|
||||
if (idx == 0xFFFFFFFF) {
|
||||
idx = src2[block_id_n * TILESIZE_N + 0];
|
||||
}
|
||||
out_idx[get_local_id(0)] = idx * ne01;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// Scatter results back to original position in output grid
|
||||
uint m_offset = row + get_local_id(0);
|
||||
|
||||
write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1));
|
||||
write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2));
|
||||
write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3));
|
||||
write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4));
|
||||
write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5));
|
||||
write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6));
|
||||
write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7));
|
||||
write_imagef(dst, out_idx[8] + m_offset, (reg_c.s8));
|
||||
write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9));
|
||||
write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa));
|
||||
write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb));
|
||||
write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc));
|
||||
write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd));
|
||||
write_imagef(dst, out_idx[14] + m_offset, (reg_c.se));
|
||||
write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf));
|
||||
write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg));
|
||||
write_imagef(dst, out_idx[17] + m_offset, (reg_c.sh));
|
||||
write_imagef(dst, out_idx[18] + m_offset, (reg_c.si));
|
||||
write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj));
|
||||
write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk));
|
||||
write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl));
|
||||
write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm));
|
||||
write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn));
|
||||
write_imagef(dst, out_idx[24] + m_offset, (reg_c.so));
|
||||
write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp));
|
||||
write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq));
|
||||
write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr));
|
||||
write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss));
|
||||
write_imagef(dst, out_idx[29] + m_offset, (reg_c.st));
|
||||
write_imagef(dst, out_idx[30] + m_offset, (reg_c.su));
|
||||
write_imagef(dst, out_idx[31] + m_offset, (reg_c.sv));
|
||||
|
||||
// Store zero padding parts to the index of first output in tile, override correct result in the end
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0));
|
||||
}
|
||||
256
ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl
Normal file
256
ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl
Normal file
@@ -0,0 +1,256 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
|
||||
|
||||
#define TILESIZE_K 16
|
||||
#define TILESIZE_M 64
|
||||
#define TILESIZE_N 32
|
||||
|
||||
|
||||
#define dequantize_q5_0(qs5x16, qh5x16, a_f16, scale) \
|
||||
a_f16.s0 = (half)((( qs5x16.s0 & 0x000F) | (( qh5x16.s0 & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.s1 = (half)((((qs5x16.s0 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 1) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.s2 = (half)((((qs5x16.s0 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 2) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.s3 = (half)((((qs5x16.s0 & 0xF000) >> 12) | (((qh5x16.s0 >> 3) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.s4 = (half)((( qs5x16.s1 & 0x000F) | (((qh5x16.s0 >> 4) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.s5 = (half)((((qs5x16.s1 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 5) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.s6 = (half)((((qs5x16.s1 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 6) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.s7 = (half)((((qs5x16.s1 & 0xF000) >> 12) | (((qh5x16.s0 >> 7) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.s8 = (half)((( qs5x16.s2 & 0x000F) | (( qh5x16.s1 & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.s9 = (half)((((qs5x16.s2 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 1) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.sa = (half)((((qs5x16.s2 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 2) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.sb = (half)((((qs5x16.s2 & 0xF000) >> 12) | (((qh5x16.s1 >> 3) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.sc = (half)((( qs5x16.s3 & 0x000F) | (((qh5x16.s1 >> 4) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.sd = (half)((((qs5x16.s3 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 5) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.se = (half)((((qs5x16.s3 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 6) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.sf = (half)((((qs5x16.s3 & 0xF000) >> 12) | (((qh5x16.s1 >> 7) & 0x01) << 4)) - 16) * scale; \
|
||||
|
||||
|
||||
#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \
|
||||
acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \
|
||||
acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \
|
||||
acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \
|
||||
acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \
|
||||
acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \
|
||||
acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \
|
||||
acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \
|
||||
acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \
|
||||
acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \
|
||||
acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \
|
||||
acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \
|
||||
acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \
|
||||
acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \
|
||||
acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \
|
||||
acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \
|
||||
acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \
|
||||
acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \
|
||||
acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \
|
||||
acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \
|
||||
acc.s3 += dot(a_reg.s4567, b_lm[lm_offset + 35]); \
|
||||
acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \
|
||||
acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \
|
||||
acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \
|
||||
acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \
|
||||
acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \
|
||||
acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \
|
||||
acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \
|
||||
acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \
|
||||
acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \
|
||||
acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \
|
||||
acc.se += dot(a_reg.s4567, b_lm[lm_offset + 46]); \
|
||||
acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \
|
||||
c_reg.lo += convert_float8(acc.lo); \
|
||||
c_reg.hi += convert_float8(acc.hi); \
|
||||
acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \
|
||||
acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \
|
||||
acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \
|
||||
acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \
|
||||
acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \
|
||||
acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \
|
||||
acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \
|
||||
acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \
|
||||
acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \
|
||||
acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \
|
||||
acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \
|
||||
acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \
|
||||
acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); \
|
||||
acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \
|
||||
acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \
|
||||
acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \
|
||||
acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \
|
||||
acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \
|
||||
acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \
|
||||
acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \
|
||||
acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \
|
||||
acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \
|
||||
acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); \
|
||||
acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \
|
||||
acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \
|
||||
acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \
|
||||
acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \
|
||||
acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \
|
||||
acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \
|
||||
acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \
|
||||
acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \
|
||||
acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \
|
||||
c_reg.lo += convert_float8(acc.lo); \
|
||||
c_reg.hi += convert_float8(acc.hi); \
|
||||
|
||||
|
||||
__attribute__((qcom_wave_pair_mode(1))) // 1=force single 2=force pair
|
||||
kernel void kernel_gemm_moe_q5_0_f32_ns(
|
||||
__read_only image1d_buffer_t src0_qs,
|
||||
__global uint * src0_qh,
|
||||
__global half * src0_d,
|
||||
__read_only image1d_buffer_t src1,
|
||||
__global uint * src2,
|
||||
__global ushort * src2_emap,
|
||||
__write_only image1d_buffer_t dst,
|
||||
__global int * total_tiles,
|
||||
uint ne00,
|
||||
uint ne01
|
||||
) {
|
||||
uint block_id_m = get_global_id(1); // m_tile
|
||||
uint block_id_n = get_global_id(2); // n_tile
|
||||
|
||||
// Boundary check
|
||||
if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
|
||||
return;
|
||||
}
|
||||
|
||||
__private half16 reg_a;
|
||||
__private float32 reg_c = (float32)(0);
|
||||
__local half4 shared_b[128];
|
||||
|
||||
const ushort expert_id = src2_emap[block_id_n];
|
||||
|
||||
const uint row = block_id_m * TILESIZE_M;
|
||||
const uint col = block_id_n * TILESIZE_N;
|
||||
|
||||
uint sub_block_id_m = get_local_id(0);
|
||||
uint2 b_global_offset;
|
||||
b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00;
|
||||
b_global_offset.y = b_global_offset.x + (16 * ne00);
|
||||
uint2 b_local_offset;
|
||||
b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2);
|
||||
b_local_offset.y = b_local_offset.x + 16;
|
||||
|
||||
// Loop along K axis, 32 elements (one block) for each iteration, divided into 2 sub-blocks
|
||||
for (uint step = 0; step < ne00; step += TILESIZE_K * 2) {
|
||||
// First sub-block
|
||||
uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
|
||||
uint s_sub_offset = row + ((ne01 * step) >> 5) + ((expert_id * ne00 * ne01) >> 5);
|
||||
uint b_sub_offset = col * ne00 + step;
|
||||
|
||||
// Load scale for current Q5_0 block
|
||||
uint blk_offset = s_sub_offset + get_global_id(0);
|
||||
half s = src0_d[blk_offset];
|
||||
|
||||
// Load 32 qh (5-th bit of each Q5) for the entire block
|
||||
uchar4 qhx32 = as_uchar4(src0_qh[blk_offset]);
|
||||
|
||||
// Load 16 qs (half block) in transposed layout
|
||||
uint2 qsx16;
|
||||
qsx16.x = read_imageui(src0_qs, q_sub_offset + sub_block_id_m).x;
|
||||
qsx16.y = read_imageui(src0_qs, q_sub_offset + sub_block_id_m + ne01).x;
|
||||
|
||||
// Load 16x32 floats from matrix B, each fiber out of 64 in a sub-group loads 8 elements
|
||||
float8 bx8_f32;
|
||||
bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
|
||||
bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
|
||||
// Convert to half and store to LM to share within the subgroup
|
||||
half8 bx8_f16 = convert_half8(bx8_f32);
|
||||
shared_b[b_local_offset.x] = bx8_f16.lo;
|
||||
shared_b[b_local_offset.y] = bx8_f16.hi;
|
||||
|
||||
// Dequantization
|
||||
dequantize_q5_0(as_ushort4(qsx16), qhx32.lo, reg_a, s);
|
||||
|
||||
sub_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// 32 16x16 fp16 dot product with 8 elements reduction for better precision
|
||||
half16 acc;
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
|
||||
|
||||
// Repeat for second sub-block
|
||||
uint half_step = step + TILESIZE_K;
|
||||
q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
|
||||
b_sub_offset = col * ne00 + half_step;
|
||||
|
||||
// Load next 16 qs in transposed layout
|
||||
qsx16.x = read_imageui(src0_qs, q_sub_offset + sub_block_id_m).x;
|
||||
qsx16.y = read_imageui(src0_qs, q_sub_offset + sub_block_id_m + ne01).x;
|
||||
|
||||
// Load 16x32 floats from matrix B, each fiber out of 64 in a sub-group loads 8 elements
|
||||
bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
|
||||
bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
|
||||
// Convert to half and store to LM to share within the subgroup
|
||||
bx8_f16 = convert_half8(bx8_f32);
|
||||
shared_b[b_local_offset.x] = bx8_f16.lo;
|
||||
shared_b[b_local_offset.y] = bx8_f16.hi;
|
||||
|
||||
// Dequantization
|
||||
dequantize_q5_0(as_ushort4(qsx16), qhx32.hi, reg_a, s);
|
||||
|
||||
sub_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// 32 16x16 fp16 dot product with 3-levels reduction for better precision
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
|
||||
}
|
||||
|
||||
// Load poster router and share in LM
|
||||
__local uint out_idx[TILESIZE_N];
|
||||
|
||||
if (get_local_id(0) < TILESIZE_N) {
|
||||
uint idx = src2[block_id_n * TILESIZE_N + get_local_id(0)];
|
||||
if (idx == 0xFFFFFFFF) {
|
||||
idx = src2[block_id_n * TILESIZE_N + 0];
|
||||
}
|
||||
out_idx[get_local_id(0)] = idx * ne01;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// Scatter results back to original position in output grid
|
||||
uint m_offset = row + get_local_id(0);
|
||||
|
||||
write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1));
|
||||
write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2));
|
||||
write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3));
|
||||
write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4));
|
||||
write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5));
|
||||
write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6));
|
||||
write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7));
|
||||
write_imagef(dst, out_idx[8] + m_offset, (reg_c.s8));
|
||||
write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9));
|
||||
write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa));
|
||||
write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb));
|
||||
write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc));
|
||||
write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd));
|
||||
write_imagef(dst, out_idx[14] + m_offset, (reg_c.se));
|
||||
write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf));
|
||||
write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg));
|
||||
write_imagef(dst, out_idx[17] + m_offset, (reg_c.sh));
|
||||
write_imagef(dst, out_idx[18] + m_offset, (reg_c.si));
|
||||
write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj));
|
||||
write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk));
|
||||
write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl));
|
||||
write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm));
|
||||
write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn));
|
||||
write_imagef(dst, out_idx[24] + m_offset, (reg_c.so));
|
||||
write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp));
|
||||
write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq));
|
||||
write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr));
|
||||
write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss));
|
||||
write_imagef(dst, out_idx[29] + m_offset, (reg_c.st));
|
||||
write_imagef(dst, out_idx[30] + m_offset, (reg_c.su));
|
||||
write_imagef(dst, out_idx[31] + m_offset, (reg_c.sv));
|
||||
|
||||
// Store zero padding parts to the index of first output in tile, override correct result in the end
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0));
|
||||
}
|
||||
258
ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl
Normal file
258
ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl
Normal file
@@ -0,0 +1,258 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
|
||||
|
||||
#define TILESIZE_K 16
|
||||
#define TILESIZE_M 64
|
||||
#define TILESIZE_N 32
|
||||
|
||||
|
||||
#define dequantize_q5_1(qs5x16, qh5x16, a_f16, scale, m) \
|
||||
a_f16.s0 = (half)((( qs5x16.s0 & 0x000F) | (( qh5x16.s0 & 0x01) << 4)) * scale + m); \
|
||||
a_f16.s1 = (half)((((qs5x16.s0 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 1) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.s2 = (half)((((qs5x16.s0 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 2) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.s3 = (half)((((qs5x16.s0 & 0xF000) >> 12) | (((qh5x16.s0 >> 3) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.s4 = (half)((( qs5x16.s1 & 0x000F) | (((qh5x16.s0 >> 4) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.s5 = (half)((((qs5x16.s1 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 5) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.s6 = (half)((((qs5x16.s1 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 6) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.s7 = (half)((((qs5x16.s1 & 0xF000) >> 12) | (((qh5x16.s0 >> 7) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.s8 = (half)((( qs5x16.s2 & 0x000F) | (( qh5x16.s1 & 0x01) << 4)) * scale + m); \
|
||||
a_f16.s9 = (half)((((qs5x16.s2 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 1) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.sa = (half)((((qs5x16.s2 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 2) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.sb = (half)((((qs5x16.s2 & 0xF000) >> 12) | (((qh5x16.s1 >> 3) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.sc = (half)((( qs5x16.s3 & 0x000F) | (((qh5x16.s1 >> 4) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.sd = (half)((((qs5x16.s3 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 5) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.se = (half)((((qs5x16.s3 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 6) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.sf = (half)((((qs5x16.s3 & 0xF000) >> 12) | (((qh5x16.s1 >> 7) & 0x01) << 4)) * scale + m); \
|
||||
|
||||
|
||||
#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \
|
||||
acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \
|
||||
acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \
|
||||
acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \
|
||||
acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \
|
||||
acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \
|
||||
acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \
|
||||
acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \
|
||||
acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \
|
||||
acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \
|
||||
acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \
|
||||
acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \
|
||||
acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \
|
||||
acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \
|
||||
acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \
|
||||
acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \
|
||||
acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \
|
||||
acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \
|
||||
acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \
|
||||
acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \
|
||||
acc.s3 += dot(a_reg.s4567, b_lm[lm_offset + 35]); \
|
||||
acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \
|
||||
acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \
|
||||
acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \
|
||||
acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \
|
||||
acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \
|
||||
acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \
|
||||
acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \
|
||||
acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \
|
||||
acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \
|
||||
acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \
|
||||
acc.se += dot(a_reg.s4567, b_lm[lm_offset + 46]); \
|
||||
acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \
|
||||
c_reg.lo += convert_float8(acc.lo); \
|
||||
c_reg.hi += convert_float8(acc.hi); \
|
||||
acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \
|
||||
acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \
|
||||
acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \
|
||||
acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \
|
||||
acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \
|
||||
acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \
|
||||
acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \
|
||||
acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \
|
||||
acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \
|
||||
acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \
|
||||
acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \
|
||||
acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \
|
||||
acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); \
|
||||
acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \
|
||||
acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \
|
||||
acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \
|
||||
acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \
|
||||
acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \
|
||||
acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \
|
||||
acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \
|
||||
acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \
|
||||
acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \
|
||||
acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); \
|
||||
acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \
|
||||
acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \
|
||||
acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \
|
||||
acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \
|
||||
acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \
|
||||
acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \
|
||||
acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \
|
||||
acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \
|
||||
acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \
|
||||
c_reg.lo += convert_float8(acc.lo); \
|
||||
c_reg.hi += convert_float8(acc.hi); \
|
||||
|
||||
|
||||
__attribute__((qcom_wave_pair_mode(1))) // 1=force single 2=force pair
|
||||
kernel void kernel_gemm_moe_q5_1_f32_ns(
|
||||
__read_only image1d_buffer_t src0_qs,
|
||||
__global uint * src0_qh,
|
||||
__global half * src0_d,
|
||||
__global half * src0_m,
|
||||
__read_only image1d_buffer_t src1,
|
||||
__global uint * src2,
|
||||
__global ushort * src2_emap,
|
||||
__write_only image1d_buffer_t dst,
|
||||
__global int * total_tiles,
|
||||
uint ne00,
|
||||
uint ne01
|
||||
) {
|
||||
uint block_id_m = get_global_id(1); // m_tile
|
||||
uint block_id_n = get_global_id(2); // n_tile
|
||||
|
||||
// Boundary check
|
||||
if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
|
||||
return;
|
||||
}
|
||||
|
||||
__private half16 reg_a;
|
||||
__private float32 reg_c = (float32)(0);
|
||||
__local half4 shared_b[128];
|
||||
|
||||
const ushort expert_id = src2_emap[block_id_n];
|
||||
|
||||
const uint row = block_id_m * TILESIZE_M;
|
||||
const uint col = block_id_n * TILESIZE_N;
|
||||
|
||||
uint sub_block_id_m = get_local_id(0);
|
||||
uint2 b_global_offset;
|
||||
b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00;
|
||||
b_global_offset.y = b_global_offset.x + (16 * ne00);
|
||||
uint2 b_local_offset;
|
||||
b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2);
|
||||
b_local_offset.y = b_local_offset.x + 16;
|
||||
|
||||
// Loop along K axis, 32 elements (one block) for each iteration, divided into 2 sub-blocks
|
||||
for (uint step = 0; step < ne00; step += TILESIZE_K * 2) {
|
||||
// First sub-block
|
||||
uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
|
||||
uint s_sub_offset = row + ((ne01 * step) >> 5) + ((expert_id * ne00 * ne01) >> 5);
|
||||
uint b_sub_offset = col * ne00 + step;
|
||||
|
||||
// Load scale and m for current Q5_1 block
|
||||
uint blk_offset = s_sub_offset + get_global_id(0);
|
||||
half s = src0_d[blk_offset];
|
||||
half m = src0_m[blk_offset];
|
||||
|
||||
// Load 32 qh (5-th bit of each Q5) for the entire block
|
||||
uchar4 qhx32 = as_uchar4(src0_qh[blk_offset]);
|
||||
|
||||
// Load 16 qs (half block) in transposed layout
|
||||
uint2 qsx16;
|
||||
qsx16.x = read_imageui(src0_qs, q_sub_offset + sub_block_id_m).x;
|
||||
qsx16.y = read_imageui(src0_qs, q_sub_offset + sub_block_id_m + ne01).x;
|
||||
|
||||
// Load 16x32 floats from matrix B, each fiber out of 64 in a sub-group loads 8 elements
|
||||
float8 bx8_f32;
|
||||
bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
|
||||
bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
|
||||
// Convert to half and store to LM to share within the subgroup
|
||||
half8 bx8_f16 = convert_half8(bx8_f32);
|
||||
shared_b[b_local_offset.x] = bx8_f16.lo;
|
||||
shared_b[b_local_offset.y] = bx8_f16.hi;
|
||||
|
||||
// Dequantization
|
||||
dequantize_q5_1(as_ushort4(qsx16), qhx32.lo, reg_a, s, m);
|
||||
|
||||
sub_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// 32 16x16 fp16 dot product with 8 elements reduction for better precision
|
||||
half16 acc;
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
|
||||
|
||||
// Repeat for second sub-block
|
||||
uint half_step = step + TILESIZE_K;
|
||||
q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
|
||||
b_sub_offset = col * ne00 + half_step;
|
||||
|
||||
// Load next 16 qs in transposed layout
|
||||
qsx16.x = read_imageui(src0_qs, q_sub_offset + sub_block_id_m).x;
|
||||
qsx16.y = read_imageui(src0_qs, q_sub_offset + sub_block_id_m + ne01).x;
|
||||
|
||||
// Load 16x32 floats from matrix B, each fiber out of 64 in a sub-group loads 8 elements
|
||||
bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
|
||||
bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
|
||||
// Convert to half and store to LM to share within the subgroup
|
||||
bx8_f16 = convert_half8(bx8_f32);
|
||||
shared_b[b_local_offset.x] = bx8_f16.lo;
|
||||
shared_b[b_local_offset.y] = bx8_f16.hi;
|
||||
|
||||
// Dequantization
|
||||
dequantize_q5_1(as_ushort4(qsx16), qhx32.hi, reg_a, s, m);
|
||||
|
||||
sub_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// 32 16x16 fp16 dot product with 3-levels reduction for better precision
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
|
||||
}
|
||||
|
||||
// Load poster router and share in LM
|
||||
__local uint out_idx[TILESIZE_N];
|
||||
|
||||
if (get_local_id(0) < TILESIZE_N) {
|
||||
uint idx = src2[block_id_n * TILESIZE_N + get_local_id(0)];
|
||||
if (idx == 0xFFFFFFFF) {
|
||||
idx = src2[block_id_n * TILESIZE_N + 0];
|
||||
}
|
||||
out_idx[get_local_id(0)] = idx * ne01;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// Scatter results back to original position in output grid
|
||||
uint m_offset = row + get_local_id(0);
|
||||
|
||||
write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1));
|
||||
write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2));
|
||||
write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3));
|
||||
write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4));
|
||||
write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5));
|
||||
write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6));
|
||||
write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7));
|
||||
write_imagef(dst, out_idx[8] + m_offset, (reg_c.s8));
|
||||
write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9));
|
||||
write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa));
|
||||
write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb));
|
||||
write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc));
|
||||
write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd));
|
||||
write_imagef(dst, out_idx[14] + m_offset, (reg_c.se));
|
||||
write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf));
|
||||
write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg));
|
||||
write_imagef(dst, out_idx[17] + m_offset, (reg_c.sh));
|
||||
write_imagef(dst, out_idx[18] + m_offset, (reg_c.si));
|
||||
write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj));
|
||||
write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk));
|
||||
write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl));
|
||||
write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm));
|
||||
write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn));
|
||||
write_imagef(dst, out_idx[24] + m_offset, (reg_c.so));
|
||||
write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp));
|
||||
write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq));
|
||||
write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr));
|
||||
write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss));
|
||||
write_imagef(dst, out_idx[29] + m_offset, (reg_c.st));
|
||||
write_imagef(dst, out_idx[30] + m_offset, (reg_c.su));
|
||||
write_imagef(dst, out_idx[31] + m_offset, (reg_c.sv));
|
||||
|
||||
// Store zero padding parts to the index of first output in tile, override correct result in the end
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0));
|
||||
}
|
||||
233
ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl
Normal file
233
ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl
Normal file
@@ -0,0 +1,233 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load : enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load : enable
|
||||
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
|
||||
__kernel void adreno_xmem_pack_src_f32(
|
||||
__global const void * src_void,
|
||||
ulong offset,
|
||||
__write_only image2d_t src_img,
|
||||
int K,
|
||||
int N) {
|
||||
const int x = get_global_id(0);
|
||||
const int y = get_global_id(1);
|
||||
const int kpack = K / 4;
|
||||
|
||||
if (x >= N || y >= kpack) {
|
||||
return;
|
||||
}
|
||||
|
||||
__global const float * src = (__global const float *)((__global const char *)src_void + offset);
|
||||
const int base = x*K + y*4;
|
||||
const half4 v = (half4)((half)src[base + 0], (half)src[base + 1], (half)src[base + 2], (half)src[base + 3]);
|
||||
write_imageh(src_img, (int2)(x, y), v);
|
||||
}
|
||||
|
||||
__kernel void adreno_xmem_prepack_weight_f16(
|
||||
__global half4 * dst,
|
||||
__global const void * src_void,
|
||||
ulong offset,
|
||||
int K,
|
||||
int M,
|
||||
int kpack,
|
||||
int npack,
|
||||
int os) {
|
||||
const int linear = get_global_id(0);
|
||||
const int total = kpack*npack;
|
||||
if (linear >= total) {
|
||||
return;
|
||||
}
|
||||
|
||||
__global const half * src = (__global const half *)((__global const char *)src_void + offset);
|
||||
|
||||
const int dst_ogroup = linear % os;
|
||||
const int dst_o_sp_i = linear / os;
|
||||
const int dst_i = dst_o_sp_i % kpack;
|
||||
const int dst_o = dst_o_sp_i / kpack;
|
||||
const int o_slice = dst_o*os + dst_ogroup;
|
||||
const int k_base = dst_i*4;
|
||||
|
||||
half4 w0 = (half4)(0.0h);
|
||||
half4 w1 = (half4)(0.0h);
|
||||
half4 w2 = (half4)(0.0h);
|
||||
half4 w3 = (half4)(0.0h);
|
||||
|
||||
const int o0 = o_slice*4 + 0;
|
||||
const int o1 = o_slice*4 + 1;
|
||||
const int o2 = o_slice*4 + 2;
|
||||
const int o3 = o_slice*4 + 3;
|
||||
|
||||
if (k_base + 0 < K) {
|
||||
if (o0 < M) w0.s0 = src[o0*K + k_base + 0];
|
||||
if (o1 < M) w0.s1 = src[o1*K + k_base + 0];
|
||||
if (o2 < M) w0.s2 = src[o2*K + k_base + 0];
|
||||
if (o3 < M) w0.s3 = src[o3*K + k_base + 0];
|
||||
}
|
||||
if (k_base + 1 < K) {
|
||||
if (o0 < M) w1.s0 = src[o0*K + k_base + 1];
|
||||
if (o1 < M) w1.s1 = src[o1*K + k_base + 1];
|
||||
if (o2 < M) w1.s2 = src[o2*K + k_base + 1];
|
||||
if (o3 < M) w1.s3 = src[o3*K + k_base + 1];
|
||||
}
|
||||
if (k_base + 2 < K) {
|
||||
if (o0 < M) w2.s0 = src[o0*K + k_base + 2];
|
||||
if (o1 < M) w2.s1 = src[o1*K + k_base + 2];
|
||||
if (o2 < M) w2.s2 = src[o2*K + k_base + 2];
|
||||
if (o3 < M) w2.s3 = src[o3*K + k_base + 2];
|
||||
}
|
||||
if (k_base + 3 < K) {
|
||||
if (o0 < M) w3.s0 = src[o0*K + k_base + 3];
|
||||
if (o1 < M) w3.s1 = src[o1*K + k_base + 3];
|
||||
if (o2 < M) w3.s2 = src[o2*K + k_base + 3];
|
||||
if (o3 < M) w3.s3 = src[o3*K + k_base + 3];
|
||||
}
|
||||
|
||||
dst[linear*4 + 0] = w0;
|
||||
dst[linear*4 + 1] = w1;
|
||||
dst[linear*4 + 2] = w2;
|
||||
dst[linear*4 + 3] = w3;
|
||||
}
|
||||
|
||||
__attribute__((qcom_max_concurrent_subgroups(12)))
|
||||
__kernel void kernel_gemm_xmem_f16_f32_os8(
|
||||
__constant half8 * weights_buffer __attribute__((sub_group_uniform)),
|
||||
__constant half8 * xmem_buffer __attribute__((max_constant_size((6144)))),
|
||||
__read_only image2d_t src_img,
|
||||
__write_only image2d_t dst_img,
|
||||
int N,
|
||||
int npack,
|
||||
int kpack) {
|
||||
const int X = get_group_id(1)*get_local_size(0) + get_local_id(0);
|
||||
const int Z = get_group_id(0)*get_local_size(2) + get_local_id(2);
|
||||
|
||||
if (X >= N || Z*8 >= npack) {
|
||||
return;
|
||||
}
|
||||
|
||||
half4 r0 = (half4)(0.0h);
|
||||
half4 r1 = (half4)(0.0h);
|
||||
half4 r2 = (half4)(0.0h);
|
||||
half4 r3 = (half4)(0.0h);
|
||||
half4 r4 = (half4)(0.0h);
|
||||
half4 r5 = (half4)(0.0h);
|
||||
half4 r6 = (half4)(0.0h);
|
||||
half4 r7 = (half4)(0.0h);
|
||||
|
||||
int f_offset = Z*kpack*32;
|
||||
int subgroup_id = (int)(0x1F & qcom_get_physical_sub_group_id());
|
||||
subgroup_id = subgroup_id % 12;
|
||||
const int c_offset = subgroup_id*32;
|
||||
__constant half16 * weights_cache = (__constant half16 *)&xmem_buffer[c_offset];
|
||||
|
||||
int coord_s = 0;
|
||||
do {
|
||||
const half4 src0 = read_imageh(src_img, smp_zero, (int2)(X, coord_s));
|
||||
coord_s++;
|
||||
const half4 src1 = read_imageh(src_img, smp_zero, (int2)(X, coord_s));
|
||||
coord_s++;
|
||||
|
||||
qcom_sub_group_constant_load8(xmem_buffer, weights_buffer, c_offset, f_offset >> 1, 32);
|
||||
f_offset += 64;
|
||||
qcom_sub_group_sync(QCOM_CLK_CONST_LOAD_SYNC);
|
||||
|
||||
r0 += src0.x * weights_cache[0].s0123;
|
||||
r0 += src0.y * weights_cache[0].s4567;
|
||||
r0 += src0.z * weights_cache[0].s89ab;
|
||||
r0 += src0.w * weights_cache[0].scdef;
|
||||
r1 += src0.x * weights_cache[1].s0123;
|
||||
r1 += src0.y * weights_cache[1].s4567;
|
||||
r1 += src0.z * weights_cache[1].s89ab;
|
||||
r1 += src0.w * weights_cache[1].scdef;
|
||||
r2 += src0.x * weights_cache[2].s0123;
|
||||
r2 += src0.y * weights_cache[2].s4567;
|
||||
r2 += src0.z * weights_cache[2].s89ab;
|
||||
r2 += src0.w * weights_cache[2].scdef;
|
||||
r3 += src0.x * weights_cache[3].s0123;
|
||||
r3 += src0.y * weights_cache[3].s4567;
|
||||
r3 += src0.z * weights_cache[3].s89ab;
|
||||
r3 += src0.w * weights_cache[3].scdef;
|
||||
r4 += src0.x * weights_cache[4].s0123;
|
||||
r4 += src0.y * weights_cache[4].s4567;
|
||||
r4 += src0.z * weights_cache[4].s89ab;
|
||||
r4 += src0.w * weights_cache[4].scdef;
|
||||
r5 += src0.x * weights_cache[5].s0123;
|
||||
r5 += src0.y * weights_cache[5].s4567;
|
||||
r5 += src0.z * weights_cache[5].s89ab;
|
||||
r5 += src0.w * weights_cache[5].scdef;
|
||||
r6 += src0.x * weights_cache[6].s0123;
|
||||
r6 += src0.y * weights_cache[6].s4567;
|
||||
r6 += src0.z * weights_cache[6].s89ab;
|
||||
r6 += src0.w * weights_cache[6].scdef;
|
||||
r7 += src0.x * weights_cache[7].s0123;
|
||||
r7 += src0.y * weights_cache[7].s4567;
|
||||
r7 += src0.z * weights_cache[7].s89ab;
|
||||
r7 += src0.w * weights_cache[7].scdef;
|
||||
|
||||
r0 += src1.x * weights_cache[8].s0123;
|
||||
r0 += src1.y * weights_cache[8].s4567;
|
||||
r0 += src1.z * weights_cache[8].s89ab;
|
||||
r0 += src1.w * weights_cache[8].scdef;
|
||||
r1 += src1.x * weights_cache[9].s0123;
|
||||
r1 += src1.y * weights_cache[9].s4567;
|
||||
r1 += src1.z * weights_cache[9].s89ab;
|
||||
r1 += src1.w * weights_cache[9].scdef;
|
||||
r2 += src1.x * weights_cache[10].s0123;
|
||||
r2 += src1.y * weights_cache[10].s4567;
|
||||
r2 += src1.z * weights_cache[10].s89ab;
|
||||
r2 += src1.w * weights_cache[10].scdef;
|
||||
r3 += src1.x * weights_cache[11].s0123;
|
||||
r3 += src1.y * weights_cache[11].s4567;
|
||||
r3 += src1.z * weights_cache[11].s89ab;
|
||||
r3 += src1.w * weights_cache[11].scdef;
|
||||
r4 += src1.x * weights_cache[12].s0123;
|
||||
r4 += src1.y * weights_cache[12].s4567;
|
||||
r4 += src1.z * weights_cache[12].s89ab;
|
||||
r4 += src1.w * weights_cache[12].scdef;
|
||||
r5 += src1.x * weights_cache[13].s0123;
|
||||
r5 += src1.y * weights_cache[13].s4567;
|
||||
r5 += src1.z * weights_cache[13].s89ab;
|
||||
r5 += src1.w * weights_cache[13].scdef;
|
||||
r6 += src1.x * weights_cache[14].s0123;
|
||||
r6 += src1.y * weights_cache[14].s4567;
|
||||
r6 += src1.z * weights_cache[14].s89ab;
|
||||
r6 += src1.w * weights_cache[14].scdef;
|
||||
r7 += src1.x * weights_cache[15].s0123;
|
||||
r7 += src1.y * weights_cache[15].s4567;
|
||||
r7 += src1.z * weights_cache[15].s89ab;
|
||||
r7 += src1.w * weights_cache[15].scdef;
|
||||
} while (coord_s < kpack);
|
||||
|
||||
int coord_s_out = Z*8;
|
||||
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r0); coord_s_out++; }
|
||||
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r1); coord_s_out++; }
|
||||
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r2); coord_s_out++; }
|
||||
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r3); coord_s_out++; }
|
||||
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r4); coord_s_out++; }
|
||||
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r5); coord_s_out++; }
|
||||
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r6); coord_s_out++; }
|
||||
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r7); }
|
||||
}
|
||||
|
||||
__kernel void adreno_xmem_store_dst_f32(
|
||||
__read_only image2d_t dst_img,
|
||||
__global void * dst_void,
|
||||
ulong offset,
|
||||
int M,
|
||||
int N) {
|
||||
const int x = get_global_id(0);
|
||||
const int y = get_global_id(1);
|
||||
const int npack = (M + 3) / 4;
|
||||
|
||||
if (x >= N || y >= npack) {
|
||||
return;
|
||||
}
|
||||
|
||||
__global float * dst = (__global float *)((__global char *)dst_void + offset);
|
||||
const half4 hv = read_imageh(dst_img, smp_zero, (int2)(x, y));
|
||||
const int m = y*4;
|
||||
if (m + 0 < M) dst[x*M + m + 0] = (float)hv.s0;
|
||||
if (m + 1 < M) dst[x*M + m + 1] = (float)hv.s1;
|
||||
if (m + 2 < M) dst[x*M + m + 2] = (float)hv.s2;
|
||||
if (m + 3 < M) dst[x*M + m + 3] = (float)hv.s3;
|
||||
}
|
||||
119
ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl
Normal file
119
ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl
Normal file
@@ -0,0 +1,119 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
|
||||
#define QK_Q4_1 32
|
||||
#define N_SIMDGROUP 4
|
||||
#define SIMDGROUP_WIDTH 64
|
||||
|
||||
static inline float8 q4_1_to_fp32_packed8(ushort2 q4x8, half s, half m) {
|
||||
float8 fp32x8;
|
||||
fp32x8.s0 = (float)((q4x8.s0 & 0x000F) * s + m);
|
||||
fp32x8.s1 = (float)(((q4x8.s0 & 0x00F0) >> 4) * s + m);
|
||||
fp32x8.s2 = (float)(((q4x8.s0 & 0x0F00) >> 8) * s + m);
|
||||
fp32x8.s3 = (float)(((q4x8.s0 & 0xF000) >> 12) * s + m);
|
||||
fp32x8.s4 = (float)((q4x8.s1 & 0x000F) * s + m);
|
||||
fp32x8.s5 = (float)(((q4x8.s1 & 0x00F0) >> 4) * s + m);
|
||||
fp32x8.s6 = (float)(((q4x8.s1 & 0x0F00) >> 8) * s + m);
|
||||
fp32x8.s7 = (float)(((q4x8.s1 & 0xF000) >> 12) * s + m);
|
||||
return fp32x8;
|
||||
}
|
||||
|
||||
|
||||
__attribute__((qcom_reqd_sub_group_size("half")))
|
||||
__kernel void kernel_gemv_moe_q4_1_f32_ns(
|
||||
__global uint * src0_q,
|
||||
__global half * src0_d,
|
||||
__global half * src0_m,
|
||||
__read_only image1d_buffer_t src1,
|
||||
__global uint * src2,
|
||||
__global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne11
|
||||
) {
|
||||
uint i01 = get_global_id(0);
|
||||
uint i20 = get_global_id(2);
|
||||
uint sgid = get_local_id(1);
|
||||
uint slid = get_sub_group_local_id();
|
||||
|
||||
uint i11 = i20 % ne11;
|
||||
|
||||
uint expert_id = src2[i20];
|
||||
uint expert_offset = expert_id * ne00 * ne01 / 32;
|
||||
|
||||
__private float sum = 0.0f; // each thread calculate partial sum of one output
|
||||
|
||||
// loop along ne00 in block granularity, skip 4 blocks every iter
|
||||
for (uint ib00 = sgid; ib00 < (ne00 / QK_Q4_1); ib00 += N_SIMDGROUP) {
|
||||
|
||||
// load one block of q
|
||||
uint4 regQ;
|
||||
uint block_offset = expert_offset * 4 + ib00 * ne01 * 4 + i01;
|
||||
|
||||
regQ.s0 = src0_q[block_offset];
|
||||
regQ.s1 = src0_q[block_offset + ne01];
|
||||
regQ.s2 = src0_q[block_offset + ne01 * 2];
|
||||
regQ.s3 = src0_q[block_offset + ne01 * 3];
|
||||
|
||||
uint offset = i11 * ne00 / 4 + ib00 * 8;
|
||||
|
||||
half regM = src0_m[ib00 * ne01 + i01 + expert_offset];
|
||||
half regS = src0_d[ib00 * ne01 + i01 + expert_offset];
|
||||
|
||||
float8 fp32x8 = q4_1_to_fp32_packed8(as_ushort2(regQ.s0), regS, regM);
|
||||
|
||||
float4 shared_y4;
|
||||
shared_y4 = read_imagef(src1, (offset + 0));
|
||||
float4 acc = shared_y4 * fp32x8.lo;
|
||||
|
||||
shared_y4 = read_imagef(src1, (offset + 1));
|
||||
acc += shared_y4 * fp32x8.hi;
|
||||
|
||||
fp32x8 = q4_1_to_fp32_packed8(as_ushort2(regQ.s1), regS, regM);
|
||||
|
||||
shared_y4 = read_imagef(src1, (offset + 2));
|
||||
acc += shared_y4 * fp32x8.lo;
|
||||
|
||||
shared_y4 = read_imagef(src1, (offset + 3));
|
||||
acc += shared_y4 * fp32x8.hi;
|
||||
|
||||
|
||||
fp32x8 = q4_1_to_fp32_packed8(as_ushort2(regQ.s2), regS, regM);
|
||||
|
||||
shared_y4 = read_imagef(src1, (offset + 4));
|
||||
acc += shared_y4 * fp32x8.lo;
|
||||
|
||||
shared_y4 = read_imagef(src1, (offset + 5));
|
||||
acc += shared_y4 * fp32x8.hi;
|
||||
|
||||
|
||||
fp32x8 = q4_1_to_fp32_packed8(as_ushort2(regQ.s3), regS, regM);
|
||||
|
||||
shared_y4 = read_imagef(src1, (offset + 6));
|
||||
acc += shared_y4 * fp32x8.lo;
|
||||
|
||||
shared_y4 = read_imagef(src1, (offset + 7));
|
||||
acc += shared_y4 * fp32x8.hi;
|
||||
|
||||
sum += ((acc.s0 + acc.s1) + (acc.s2 + acc.s3));
|
||||
}
|
||||
|
||||
// reduction in local memory, assumes #subgroups=4
|
||||
__local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)];
|
||||
if (sgid == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = sum;
|
||||
if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum;
|
||||
if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = sum;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
|
||||
if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
|
||||
if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
|
||||
|
||||
// 1 outputs per thread in subgroup 0
|
||||
if (sgid == 0) {
|
||||
dst = dst + (offsetd >> 2);
|
||||
dst[i01 + i20 * ne01] = sum;
|
||||
}
|
||||
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user