mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-05-28 17:27:26 +03:00
Compare commits
75 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
314e729347 | ||
|
|
d55fb97174 | ||
|
|
826539ce59 | ||
|
|
b96487645c | ||
|
|
9627d0f540 | ||
|
|
e2ef8fe42c | ||
|
|
6d57c26ef8 | ||
|
|
28123a3937 | ||
|
|
549b9d8433 | ||
|
|
5d246a792d | ||
|
|
63248fc3e3 | ||
|
|
83eebe9d08 | ||
|
|
fff63b5108 | ||
|
|
f3061116ff | ||
|
|
1c0f6db545 | ||
|
|
cec51c7a7d | ||
|
|
b22ff4b7b4 | ||
|
|
c0c7e147e7 | ||
|
|
b0df4c0cfd | ||
|
|
a497476330 | ||
|
|
95405ac65f | ||
|
|
0f3cb3fc8b | ||
|
|
1acee6bf89 | ||
|
|
ef570f6308 | ||
|
|
cc9e331213 | ||
|
|
bcfd1989e9 | ||
|
|
56f16f235c | ||
|
|
8cc67efcd4 | ||
|
|
95feeab52e | ||
|
|
99d4026b11 | ||
|
|
9c92e96a64 | ||
|
|
afcda09d15 | ||
|
|
bbce619adb | ||
|
|
4f0e43da6f | ||
|
|
bb28c1fe24 | ||
|
|
ee7c30578a | ||
|
|
47c0eda9d4 | ||
|
|
5306f4b3b5 | ||
|
|
40d5358d3c | ||
|
|
b65bb4baae | ||
|
|
a1a69f777a | ||
|
|
52fb93a2bd | ||
|
|
c9021714e8 | ||
|
|
1d7ab2b947 | ||
|
|
12e5d99078 | ||
|
|
7ea23ddf7b | ||
|
|
2fc8d1851e | ||
|
|
5e932a1c8d | ||
|
|
2754ce1b3e | ||
|
|
eeeaf6180b | ||
|
|
0be84685bd | ||
|
|
ce02093fdd | ||
|
|
6a257d4463 | ||
|
|
3a479c9132 | ||
|
|
ad27757261 | ||
|
|
3a6db741a8 | ||
|
|
510b5c2a35 | ||
|
|
a8681a0ed2 | ||
|
|
acd604fb27 | ||
|
|
6ce96713de | ||
|
|
c9872a2575 | ||
|
|
e947228222 | ||
|
|
29f1482221 | ||
|
|
e6b4acfe86 | ||
|
|
e2b129e1bf | ||
|
|
7e50ef7d79 | ||
|
|
5028447384 | ||
|
|
585080d310 | ||
|
|
57ebaf4edd | ||
|
|
871b0b70f8 | ||
|
|
b39a7bf1b0 | ||
|
|
b28a2f372a | ||
|
|
17d22a35b2 | ||
|
|
67ace021da | ||
|
|
a8078675a6 |
@@ -58,6 +58,7 @@ RUN mkdir -p /app/lib && \
|
||||
RUN mkdir -p /app/full && \
|
||||
cp build/bin/* /app/full/ && \
|
||||
cp *.py /app/full/ && \
|
||||
cp -r conversion /app/full/ && \
|
||||
cp -r gguf-py /app/full/ && \
|
||||
cp -r requirements /app/full/ && \
|
||||
cp requirements.txt /app/full/
|
||||
|
||||
@@ -30,6 +30,7 @@ RUN mkdir -p /app/lib && \
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
|
||||
@@ -36,6 +36,7 @@ RUN mkdir -p /app/lib && \
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
|
||||
@@ -36,6 +36,7 @@ RUN mkdir -p /app/lib && \
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
|
||||
@@ -41,6 +41,7 @@ RUN mkdir -p /app/lib && \
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
|
||||
@@ -81,6 +81,7 @@ RUN mkdir -p /app/lib && \
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/ReleaseOV/bin/* /app/full/ \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
|
||||
@@ -53,6 +53,7 @@ RUN mkdir -p /app/lib \
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
|
||||
@@ -37,6 +37,7 @@ RUN --mount=type=cache,target=/root/.ccache \
|
||||
|
||||
COPY *.py /opt/llama.cpp/bin
|
||||
COPY .devops/tools.sh /opt/llama.cpp/bin
|
||||
COPY conversion /opt/llama.cpp/conversion
|
||||
|
||||
COPY gguf-py /opt/llama.cpp/gguf-py
|
||||
COPY requirements.txt /opt/llama.cpp/gguf-py
|
||||
@@ -47,9 +48,10 @@ COPY requirements /opt/llama.cpp/gguf-py/requirements
|
||||
FROM scratch AS collector
|
||||
|
||||
# Copy llama.cpp binaries and libraries
|
||||
COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
|
||||
COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib
|
||||
COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
|
||||
COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
|
||||
COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib
|
||||
COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
|
||||
COPY --from=build /opt/llama.cpp/conversion /llama.cpp/conversion
|
||||
|
||||
|
||||
### Base image
|
||||
@@ -107,6 +109,7 @@ RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
|
||||
|
||||
COPY --from=collector /llama.cpp/bin /app
|
||||
COPY --from=collector /llama.cpp/gguf-py /app/gguf-py
|
||||
COPY --from=collector /llama.cpp/conversion /app/conversion
|
||||
|
||||
RUN pip install --no-cache-dir --break-system-packages \
|
||||
-r /app/gguf-py/requirements.txt
|
||||
|
||||
@@ -26,6 +26,7 @@ RUN mkdir -p /app/lib && \
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
|
||||
4
.github/ISSUE_TEMPLATE/011-bug-results.yml
vendored
4
.github/ISSUE_TEMPLATE/011-bug-results.yml
vendored
@@ -100,8 +100,8 @@ body:
|
||||
label: Relevant log output
|
||||
description: >
|
||||
Please copy and paste any relevant log output, including the command that you entered and any generated text.
|
||||
For very long logs (thousands of lines), preferably upload them as files instead.
|
||||
On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command.
|
||||
For very long logs (thousands of lines), please upload them as files instead; the `--log-file` CLI argument can be used for this purpose.
|
||||
On Linux you can alternatively redirect the console output of any command into a file by appending ` > llama.log 2>&1` to your command.
|
||||
value: |
|
||||
<details>
|
||||
<summary>Logs</summary>
|
||||
|
||||
4
.github/ISSUE_TEMPLATE/019-bug-misc.yml
vendored
4
.github/ISSUE_TEMPLATE/019-bug-misc.yml
vendored
@@ -88,8 +88,8 @@ body:
|
||||
description: >
|
||||
If applicable, please copy and paste any relevant log output, including any generated text.
|
||||
If you are encountering problems specifically with the `llama_params_fit` module, always upload `--verbose` logs as well.
|
||||
For very long logs (thousands of lines), please upload them as files instead.
|
||||
On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command.
|
||||
For very long logs (thousands of lines), please upload them as files instead; the `--log-file` CLI argument can be used for this purpose.
|
||||
On Linux you can alternatively redirect the console output of any command into a file by appending ` > llama.log 2>&1` to your command.
|
||||
value: |
|
||||
<details>
|
||||
<summary>Logs</summary>
|
||||
|
||||
@@ -31,7 +31,7 @@ jobs:
|
||||
android-ndk-snapdragon:
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.6'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
@@ -61,7 +61,7 @@ jobs:
|
||||
linux-iot-snapdragon:
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.1'
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.6'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
5
.github/workflows/build-android.yml
vendored
5
.github/workflows/build-android.yml
vendored
@@ -73,6 +73,11 @@ jobs:
|
||||
fetch-depth: 0
|
||||
lfs: false
|
||||
|
||||
- name: Dependencies
|
||||
run: |
|
||||
apt-get update
|
||||
apt-get install -y build-essential
|
||||
|
||||
- name: Build
|
||||
id: ndk_build
|
||||
run: |
|
||||
|
||||
5
.github/workflows/build-apple.yml
vendored
5
.github/workflows/build-apple.yml
vendored
@@ -59,6 +59,7 @@ jobs:
|
||||
cmake -B build -G Xcode \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_COMMON=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
@@ -89,6 +90,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
@@ -138,6 +140,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_COMMON=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
@@ -163,6 +166,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_COMMON=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
@@ -206,6 +210,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
|
||||
18
.github/workflows/build-cmake-pkg.yml
vendored
18
.github/workflows/build-cmake-pkg.yml
vendored
@@ -5,23 +5,23 @@ on:
|
||||
|
||||
jobs:
|
||||
linux:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, Linux, CPU]
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y build-essential tcl cmake
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
PREFIX="$(pwd)"/inst
|
||||
cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
|
||||
-DLLAMA_OPENSSL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_PREFIX_PATH="$PREFIX" \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
cmake --build build --config Release
|
||||
cmake --install build --prefix "$PREFIX" --config Release
|
||||
|
||||
|
||||
118
.github/workflows/build-self-hosted.yml
vendored
118
.github/workflows/build-self-hosted.yml
vendored
@@ -55,24 +55,7 @@ env:
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
determine-tag:
|
||||
name: Determine tag name
|
||||
runs-on: ubuntu-slim
|
||||
outputs:
|
||||
tag_name: ${{ steps.tag.outputs.name }}
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
ggml-ci-nvidia-cuda:
|
||||
needs: determine-tag
|
||||
runs-on: [self-hosted, Linux, NVIDIA]
|
||||
|
||||
steps:
|
||||
@@ -82,14 +65,11 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
nvidia-smi
|
||||
GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-nvidia-vulkan-cm:
|
||||
needs: determine-tag
|
||||
runs-on: [self-hosted, Linux, NVIDIA]
|
||||
|
||||
steps:
|
||||
@@ -99,14 +79,11 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-nvidia-vulkan-cm2:
|
||||
needs: determine-tag
|
||||
runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2]
|
||||
|
||||
steps:
|
||||
@@ -116,14 +93,12 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-nvidia-webgpu:
|
||||
runs-on: [self-hosted, Linux, NVIDIA]
|
||||
runs-on: [self-hosted, Linux, NVIDIA, X64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -149,7 +124,7 @@ jobs:
|
||||
GG_BUILD_WEBGPU=1 \
|
||||
GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
|
||||
GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \
|
||||
bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMX-compatible machine
|
||||
#ggml-ci-cpu-amx:
|
||||
@@ -163,7 +138,7 @@ jobs:
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
# bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMD GPU machine
|
||||
# ggml-ci-amd-vulkan:
|
||||
@@ -178,7 +153,7 @@ jobs:
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# vulkaninfo --summary
|
||||
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMD GPU machine
|
||||
# ggml-ci-amd-rocm:
|
||||
@@ -193,10 +168,9 @@ jobs:
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# amd-smi static
|
||||
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-metal:
|
||||
needs: determine-tag
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
@@ -206,13 +180,10 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-webgpu:
|
||||
needs: determine-tag
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
@@ -235,14 +206,11 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
|
||||
bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-vulkan:
|
||||
needs: determine-tag
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
@@ -252,14 +220,11 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-linux-intel-vulkan:
|
||||
needs: determine-tag
|
||||
runs-on: [self-hosted, Linux, Intel]
|
||||
|
||||
steps:
|
||||
@@ -271,14 +236,11 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-win-intel-vulkan:
|
||||
needs: determine-tag
|
||||
runs-on: [self-hosted, Windows, X64, Intel]
|
||||
|
||||
steps:
|
||||
@@ -293,7 +255,6 @@ jobs:
|
||||
MSYSTEM: UCRT64
|
||||
CHERE_INVOKING: 1
|
||||
PATH: C:\msys64\ucrt64\bin;C:\msys64\usr\bin;C:\Windows\System32;${{ env.PATH }}
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
# Skip python related tests with GG_BUILD_LOW_PERF=1 since Windows MSYS2 UCRT64 currently fails to create
|
||||
@@ -301,7 +262,6 @@ jobs:
|
||||
LLAMA_FATAL_WARNINGS=OFF GG_BUILD_NINJA=1 GG_BUILD_VULKAN=1 GG_BUILD_LOW_PERF=1 ./ci/run.sh ./results/llama.cpp ./mnt/llama.cpp
|
||||
|
||||
ggml-ci-intel-openvino-gpu-low-perf:
|
||||
needs: determine-tag
|
||||
runs-on: [self-hosted, Linux, Intel, OpenVINO]
|
||||
|
||||
concurrency:
|
||||
@@ -333,8 +293,64 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
source ./openvino_toolkit/setupvars.sh
|
||||
GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-arm64-cpu-low-perf:
|
||||
runs-on: [self-hosted, Linux, ARM64, CPU]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-arm64-cpu-high-perf:
|
||||
runs-on: [self-hosted, Linux, ARM64, CPU]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
# TODO: not sure how to detect ARM flags on DGX Spark. currently get this error during cmake:
|
||||
# CMake Warning at ggml/src/ggml-cpu/CMakeLists.txt:147 (message):
|
||||
# ARM -march/-mcpu not found, -mcpu=native will be used
|
||||
#
|
||||
# if we resolve this, we should be able to offload these jobs to the self-hosted runners
|
||||
#
|
||||
# ggml-ci-arm64-cpu-high-perf-sve:
|
||||
# runs-on: [self-hosted, Linux, ARM64, CPU]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
#
|
||||
# ggml-ci-arm64-cpu-kleidiai:
|
||||
# runs-on: [self-hosted, Linux, ARM64, CPU]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
102
.github/workflows/build.yml
vendored
102
.github/workflows/build.yml
vendored
@@ -931,31 +931,32 @@ jobs:
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
ggml-ci-arm64-cpu-low-perf:
|
||||
runs-on: ubuntu-22.04-arm
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ggml-ci-arm64-cpu-low-perf
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
# note: moved to build-self-hosted.yml - can remove from here when everything is stable
|
||||
# ggml-ci-arm64-cpu-low-perf:
|
||||
# runs-on: ubuntu-22.04-arm
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: ggml-ci-arm64-cpu-low-perf
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
#
|
||||
# - name: Dependencies
|
||||
# id: depends
|
||||
# run: |
|
||||
# sudo apt-get update
|
||||
# sudo apt-get install build-essential
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
ggml-ci-x64-cpu-high-perf:
|
||||
runs-on: ubuntu-22.04
|
||||
@@ -983,31 +984,32 @@ jobs:
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
ggml-ci-arm64-cpu-high-perf:
|
||||
runs-on: ubuntu-22.04-arm
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ggml-ci-arm64-cpu-high-perf
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
# note: moved to build-self-hosted.yml - can remove from here when everything is stable
|
||||
# ggml-ci-arm64-cpu-high-perf:
|
||||
# runs-on: ubuntu-22.04-arm
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: ggml-ci-arm64-cpu-high-perf
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
#
|
||||
# - name: Dependencies
|
||||
# id: depends
|
||||
# run: |
|
||||
# sudo apt-get update
|
||||
# sudo apt-get install build-essential
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
ggml-ci-arm64-cpu-high-perf-sve:
|
||||
runs-on: ubuntu-22.04-arm
|
||||
|
||||
2
.github/workflows/check-vendor.yml
vendored
2
.github/workflows/check-vendor.yml
vendored
@@ -19,7 +19,7 @@ on:
|
||||
|
||||
jobs:
|
||||
check-vendor:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
|
||||
2
.github/workflows/code-style.yml
vendored
2
.github/workflows/code-style.yml
vendored
@@ -15,7 +15,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
model-naming:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- name: Check model naming conventions
|
||||
|
||||
2
.github/workflows/editorconfig.yml
vendored
2
.github/workflows/editorconfig.yml
vendored
@@ -15,7 +15,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
editorconfig:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: editorconfig-checker/action-editorconfig-checker@840e866d93b8e032123c23bac69dece044d4d84c # v2.2.0
|
||||
|
||||
2
.github/workflows/pre-tokenizer-hashes.yml
vendored
2
.github/workflows/pre-tokenizer-hashes.yml
vendored
@@ -12,7 +12,7 @@ on:
|
||||
|
||||
jobs:
|
||||
pre-tokenizer-hashes:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
|
||||
@@ -20,7 +20,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
python-check-requirements:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, CPU, fast]
|
||||
name: check-requirements
|
||||
steps:
|
||||
- name: Check out source repository
|
||||
|
||||
2
.github/workflows/python-lint.yml
vendored
2
.github/workflows/python-lint.yml
vendored
@@ -21,7 +21,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
flake8-lint:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
name: Lint
|
||||
steps:
|
||||
- name: Check out source repository
|
||||
|
||||
2
.github/workflows/python-type-check.yml
vendored
2
.github/workflows/python-type-check.yml
vendored
@@ -22,7 +22,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
python-type-check:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
name: python type-check
|
||||
steps:
|
||||
- name: Check out source repository
|
||||
|
||||
20
.github/workflows/release.yml
vendored
20
.github/workflows/release.yml
vendored
@@ -1108,6 +1108,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
@@ -1233,6 +1234,9 @@ jobs:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
|
||||
name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
|
||||
|
||||
ui-build:
|
||||
uses: ./.github/workflows/ui-build.yml
|
||||
|
||||
release:
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
|
||||
@@ -1258,6 +1262,7 @@ jobs:
|
||||
- macOS-cpu
|
||||
- ios-xcode-build
|
||||
- openEuler-cann
|
||||
- ui-build
|
||||
|
||||
outputs:
|
||||
tag_name: ${{ steps.tag.outputs.name }}
|
||||
@@ -1317,6 +1322,18 @@ jobs:
|
||||
mv -v artifact/*.zip release
|
||||
mv -v artifact/*.tar.gz release
|
||||
|
||||
- name: Download UI build
|
||||
id: download_ui
|
||||
uses: actions/download-artifact@v7
|
||||
with:
|
||||
name: ui-build
|
||||
path: ./ui-dist
|
||||
|
||||
- name: Package UI
|
||||
id: package_ui
|
||||
run: |
|
||||
tar -czvf release/llama-${{ steps.tag.outputs.name }}-ui.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./ui-dist .
|
||||
|
||||
- name: Create release
|
||||
id: create_release
|
||||
uses: ggml-org/action-create-release@v1
|
||||
@@ -1366,6 +1383,9 @@ jobs:
|
||||
- [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz)
|
||||
- [openEuler aarch64 (910b, ACL Graph)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64-aclgraph.tar.gz)
|
||||
|
||||
**UI:**
|
||||
- [UI](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-ui.tar.gz)
|
||||
|
||||
- name: Upload release
|
||||
id: upload_release
|
||||
uses: actions/github-script@v8
|
||||
|
||||
77
.github/workflows/server-self-hosted.yml
vendored
77
.github/workflows/server-self-hosted.yml
vendored
@@ -91,45 +91,44 @@ jobs:
|
||||
export ${{ matrix.extra_args }}
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
# TODO: provision CUDA runner
|
||||
# server-cuda:
|
||||
# runs-on: [self-hosted, llama-server, Linux, NVIDIA]
|
||||
#
|
||||
# name: server-cuda (${{ matrix.wf_name }})
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build_type: [Release]
|
||||
# wf_name: ["GPUx1"]
|
||||
# include:
|
||||
# - build_type: Release
|
||||
# extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
# wf_name: "GPUx1, backend-sampling"
|
||||
# fail-fast: false
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
# with:
|
||||
# fetch-depth: 0
|
||||
# ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: |
|
||||
# cmake -B build -DGGML_SCHED_NO_REALLOC=ON
|
||||
# cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
|
||||
#
|
||||
# - name: Tests
|
||||
# id: server_integration_tests
|
||||
# if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
# run: |
|
||||
# cd tools/server/tests
|
||||
# python3 -m venv venv
|
||||
# source venv/bin/activate
|
||||
# pip install -r requirements.txt
|
||||
# export ${{ matrix.extra_args }}
|
||||
# pytest -v -x -m "not slow"
|
||||
server-cuda:
|
||||
runs-on: [self-hosted, llama-server, Linux, NVIDIA]
|
||||
|
||||
name: server-cuda (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
matrix:
|
||||
build_type: [Release]
|
||||
wf_name: ["GPUx1"]
|
||||
include:
|
||||
- build_type: Release
|
||||
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
wf_name: "GPUx1, backend-sampling"
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -DGGML_CUDA=ON -DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
export ${{ matrix.extra_args }}
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
server-kleidiai:
|
||||
runs-on: ah-ubuntu_22_04-c8g_8x
|
||||
|
||||
14
.github/workflows/server.yml
vendored
14
.github/workflows/server.yml
vendored
@@ -54,8 +54,13 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
ui-build:
|
||||
name: Build Web UI
|
||||
uses: ./.github/workflows/ui-build.yml
|
||||
|
||||
server:
|
||||
runs-on: ubuntu-latest
|
||||
needs: ui-build
|
||||
|
||||
name: server (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
@@ -93,12 +98,11 @@ jobs:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
- name: Download built UI
|
||||
uses: actions/download-artifact@v7
|
||||
with:
|
||||
node-version: "24"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
name: ui-build
|
||||
path: tools/ui/dist
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
|
||||
7
.github/workflows/ui-build.yml
vendored
7
.github/workflows/ui-build.yml
vendored
@@ -5,8 +5,7 @@ on:
|
||||
|
||||
jobs:
|
||||
build:
|
||||
name: Build static output
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
@@ -31,7 +30,7 @@ jobs:
|
||||
|
||||
- name: Generate checksums
|
||||
run: |
|
||||
cd build/tools/ui/dist
|
||||
cd tools/ui/dist
|
||||
for f in *; do
|
||||
sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt
|
||||
done
|
||||
@@ -40,5 +39,5 @@ jobs:
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
name: ui-build
|
||||
path: build/tools/ui/dist/
|
||||
path: tools/ui/dist/
|
||||
retention-days: 1
|
||||
|
||||
6
.github/workflows/ui-publish.yml
vendored
6
.github/workflows/ui-publish.yml
vendored
@@ -38,7 +38,7 @@ jobs:
|
||||
uses: actions/download-artifact@v7
|
||||
with:
|
||||
name: ui-build
|
||||
path: build/tools/ui/dist/
|
||||
path: tools/ui/dist/
|
||||
|
||||
- name: Install Hugging Face Hub CLI
|
||||
run: pip install -U huggingface_hub
|
||||
@@ -49,12 +49,12 @@ jobs:
|
||||
- name: Sync built files to Hugging Face bucket (version tag)
|
||||
run: |
|
||||
# Upload the built files to the Hugging Face bucket under the release version
|
||||
hf buckets sync build/tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} --delete --quiet
|
||||
hf buckets sync tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} --delete --quiet
|
||||
|
||||
- name: Sync built files to Hugging Face bucket (latest)
|
||||
run: |
|
||||
# Also upload to the 'latest' directory for fallback downloads
|
||||
hf buckets sync build/tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/latest --delete --quiet
|
||||
hf buckets sync tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/latest --delete --quiet
|
||||
|
||||
- name: Verify upload
|
||||
run: |
|
||||
|
||||
118
.github/workflows/ui-self-hosted.yml
vendored
Normal file
118
.github/workflows/ui-self-hosted.yml
vendored
Normal file
@@ -0,0 +1,118 @@
|
||||
name: UI (self-hosted)
|
||||
|
||||
# these are the same as ui.yml, but with self-hosted runners
|
||||
# the runners come with pre-installed Playwright browsers version: 1.56.1
|
||||
# the jobs are much lighter because they don't need to install node and playwright browsers
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
sha:
|
||||
description: 'Commit SHA1 to build'
|
||||
required: false
|
||||
type: string
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/ui-self-hosted.yml',
|
||||
'.github/workflows/ui-build.yml',
|
||||
'tools/ui/**.*',
|
||||
'tools/server/tests/**.*'
|
||||
]
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/ui-self-hosted.yml',
|
||||
'.github/workflows/ui-build.yml',
|
||||
'tools/ui/**.*',
|
||||
'tools/server/tests/**.*'
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
ui-build:
|
||||
name: Build static output
|
||||
uses: ./.github/workflows/ui-build.yml
|
||||
|
||||
ui-checks:
|
||||
name: Checks
|
||||
needs: ui-build
|
||||
runs-on: [self-hosted, PLAYWRIGHT]
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Install dependencies
|
||||
id: setup
|
||||
run: npm ci
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run type checking
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run check
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run linting
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run lint
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run Client tests
|
||||
if: ${{ always() }}
|
||||
run: npm run test:client
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run Unit tests
|
||||
if: ${{ always() }}
|
||||
run: npm run test:unit
|
||||
working-directory: tools/ui
|
||||
|
||||
e2e-tests:
|
||||
name: E2E Tests
|
||||
needs: ui-build
|
||||
runs-on: [self-hosted, PLAYWRIGHT]
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Install dependencies
|
||||
id: setup
|
||||
run: npm ci
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Build application
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run build
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Build Storybook
|
||||
if: ${{ always() }}
|
||||
run: npm run build-storybook
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run UI tests
|
||||
if: ${{ always() }}
|
||||
run: npm run test:ui -- --testTimeout=60000
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run E2E tests
|
||||
if: ${{ always() }}
|
||||
run: npm run test:e2e
|
||||
working-directory: tools/ui
|
||||
@@ -1,4 +1,4 @@
|
||||
name: CI (UI)
|
||||
name: UI
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
@@ -11,14 +11,16 @@ on:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/ui-ci.yml',
|
||||
'.github/workflows/ui.yml',
|
||||
'.github/workflows/ui-build.yml',
|
||||
'tools/ui/**.*',
|
||||
'tools/server/tests/**.*'
|
||||
]
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/ui-ci.yml',
|
||||
'.github/workflows/ui.yml',
|
||||
'.github/workflows/ui-build.yml',
|
||||
'tools/ui/**.*',
|
||||
'tools/server/tests/**.*'
|
||||
]
|
||||
@@ -39,7 +41,7 @@ jobs:
|
||||
uses: ./.github/workflows/ui-build.yml
|
||||
|
||||
ui-checks:
|
||||
name: UI Checks
|
||||
name: Checks
|
||||
needs: ui-build
|
||||
runs-on: ubuntu-latest
|
||||
continue-on-error: true
|
||||
4
.github/workflows/update-ops-docs.yml
vendored
4
.github/workflows/update-ops-docs.yml
vendored
@@ -3,18 +3,20 @@ name: Update Operations Documentation
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- '.github/workflows/update-ops-docs.yml'
|
||||
- 'docs/ops.md'
|
||||
- 'docs/ops/**'
|
||||
- 'scripts/create_ops_docs.py'
|
||||
pull_request:
|
||||
paths:
|
||||
- '.github/workflows/update-ops-docs.yml'
|
||||
- 'docs/ops.md'
|
||||
- 'docs/ops/**'
|
||||
- 'scripts/create_ops_docs.py'
|
||||
|
||||
jobs:
|
||||
update-ops-docs:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast, ARM64]
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
You are a coding agent. Here are some very important rules that you must follow:
|
||||
|
||||
General:
|
||||
- By very precise and concise when writing code, comments, explanations, etc.
|
||||
- Be very precise and concise when writing code, comments, explanations, etc.
|
||||
- PR and commit titles format: `<module> : <title>`. Lookup recents for examples
|
||||
- Don't try to build or run the code unless you are explicitly asked to do so
|
||||
- Use the `gh` CLI tool when querying PRs, issues, or other GitHub resources
|
||||
@@ -16,7 +16,8 @@ Pull requests (PRs):
|
||||
- New branch names are prefixed with "gg/"
|
||||
- Before opening a pull request, ask the user to confirm the description
|
||||
- When creating a pull request, look for the repository's PR template and follow it
|
||||
- For the AI usage disclosure section, write "YES. llama.cpp + pi"
|
||||
- For the AI usage disclosure section, write "YES. llama.cpp + pi + [MODEL]"
|
||||
- Ask the user to tell you what model was used and write it in place of [MODEL]
|
||||
- Always create the pull requests in draft mode
|
||||
|
||||
Commits:
|
||||
|
||||
@@ -104,24 +104,16 @@ option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
|
||||
option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})
|
||||
|
||||
# extra artifacts
|
||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_UI "llama: build the embedded Web UI for server" ON)
|
||||
option(LLAMA_USE_PREBUILT_UI "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" ON)
|
||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_APP "llama: build the unified binary" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_UI "llama: build the embedded Web UI for server" ON)
|
||||
option(LLAMA_USE_PREBUILT_UI "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" ON)
|
||||
|
||||
# Backward compat: when old var is set but new one isn't, forward the value
|
||||
if(DEFINED LLAMA_BUILD_WEBUI)
|
||||
set(LLAMA_BUILD_UI ${LLAMA_BUILD_WEBUI})
|
||||
message(DEPRECATION "LLAMA_BUILD_WEBUI is deprecated, use LLAMA_BUILD_UI instead")
|
||||
endif()
|
||||
if(DEFINED LLAMA_USE_PREBUILT_WEBUI)
|
||||
set(LLAMA_USE_PREBUILT_UI ${LLAMA_USE_PREBUILT_WEBUI})
|
||||
message(DEPRECATION "LLAMA_USE_PREBUILT_WEBUI is deprecated, use LLAMA_USE_PREBUILT_UI instead")
|
||||
endif()
|
||||
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
|
||||
option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
|
||||
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
|
||||
option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
|
||||
|
||||
# 3rd party libs
|
||||
option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" ON)
|
||||
@@ -226,6 +218,10 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
|
||||
add_subdirectory(tools)
|
||||
endif()
|
||||
|
||||
if (LLAMA_BUILD_APP)
|
||||
add_subdirectory(app)
|
||||
endif()
|
||||
|
||||
# Automatically add all files from the 'licenses' directory
|
||||
file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*")
|
||||
|
||||
|
||||
@@ -49,7 +49,6 @@
|
||||
/examples/parallel/ @ggerganov
|
||||
/examples/passkey/ @ggerganov
|
||||
/examples/retrieval/ @ggerganov
|
||||
/examples/save-load-state/ @ggerganov
|
||||
/examples/speculative-simple/ @ggerganov
|
||||
/examples/speculative/ @ggerganov
|
||||
/ggml/cmake/ @ggerganov
|
||||
|
||||
@@ -27,6 +27,7 @@ LLM inference in C/C++
|
||||
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
|
||||
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
|
||||
- Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
|
||||
- WebGPU support is now available in the browser, see a blog/demo introducing it [here](https://reeselevine.github.io/llamas-on-the-web/).
|
||||
|
||||
----
|
||||
|
||||
@@ -290,7 +291,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
| [CANN](docs/build.md#cann) | Ascend NPU |
|
||||
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
|
||||
| [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
|
||||
| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
|
||||
| [WebGPU](docs/build.md#webgpu) | All |
|
||||
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
|
||||
| [Hexagon [In Progress]](docs/backend/snapdragon/README.md) | Snapdragon |
|
||||
| [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR |
|
||||
|
||||
20
app/CMakeLists.txt
Normal file
20
app/CMakeLists.txt
Normal file
@@ -0,0 +1,20 @@
|
||||
set(TARGET llama-app)
|
||||
|
||||
add_executable(${TARGET} llama.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama)
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE
|
||||
llama-server-impl
|
||||
llama-cli-impl
|
||||
llama-completion-impl
|
||||
llama-bench-impl
|
||||
llama-batched-bench-impl
|
||||
llama-fit-params-impl
|
||||
llama-quantize-impl
|
||||
llama-perplexity-impl
|
||||
)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
endif()
|
||||
95
app/llama.cpp
Normal file
95
app/llama.cpp
Normal file
@@ -0,0 +1,95 @@
|
||||
#include "build-info.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// visible
|
||||
int llama_server(int argc, char ** argv);
|
||||
int llama_cli(int argc, char ** argv);
|
||||
|
||||
// hidden
|
||||
int llama_completion(int argc, char ** argv);
|
||||
int llama_bench(int argc, char ** argv);
|
||||
int llama_batched_bench(int argc, char ** argv);
|
||||
int llama_fit_params(int argc, char ** argv);
|
||||
int llama_quantize(int argc, char ** argv);
|
||||
int llama_perplexity(int argc, char ** argv);
|
||||
|
||||
static int help(int argc, char ** argv);
|
||||
static int version(int argc, char ** argv);
|
||||
|
||||
struct command {
|
||||
const char * name;
|
||||
const char * desc;
|
||||
std::vector<std::string> aliases;
|
||||
bool hidden;
|
||||
int (*func)(int, char **);
|
||||
};
|
||||
|
||||
static const command cmds[] = {
|
||||
{"serve", "HTTP API server", {"server"}, false, llama_server },
|
||||
{"cli", "Command-line interactive interface", {"client"}, false, llama_cli },
|
||||
{"completion", "Text completion", {"complete"}, true, llama_completion },
|
||||
{"bench", "Benchmark prompt processing and text generation", {}, true, llama_bench },
|
||||
{"batched-bench", "Benchmark batched decoding performance", {}, true, llama_batched_bench},
|
||||
{"fit-params", "Compute parameters to fit a model in device memory", {}, true, llama_fit_params },
|
||||
{"quantize", "Quantize a model", {}, true, llama_quantize },
|
||||
{"perplexity", "Compute model perplexity and KL divergence", {}, true, llama_perplexity },
|
||||
{"version", "Show version", {}, true, version },
|
||||
{"help", "Show available commands", {}, true, help },
|
||||
};
|
||||
|
||||
static int version(int argc, char ** argv) {
|
||||
printf("%s\n", llama_build_info());
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int help(int argc, char ** argv) {
|
||||
const bool show_all = argc >= 2 && std::string(argv[1]) == "all";
|
||||
|
||||
printf("Usage: llama <command> [options]\n\nAvailable commands:\n");
|
||||
|
||||
for (const auto & cmd : cmds) {
|
||||
if (show_all || !cmd.hidden) {
|
||||
printf(" %-15s %s\n", cmd.name, cmd.desc);
|
||||
}
|
||||
}
|
||||
printf("\nRun 'llama <command> --help' for command-specific usage.\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool matches(const std::string & arg, const command & cmd) {
|
||||
if (arg == cmd.name) {
|
||||
return true;
|
||||
}
|
||||
for (const auto & alias : cmd.aliases) {
|
||||
if (arg == alias) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
const std::string arg = argc >= 2 ? argv[1] : "help";
|
||||
|
||||
for (const auto & cmd : cmds) {
|
||||
if (matches(arg, cmd)) {
|
||||
|
||||
// router spawns children through this same binary, it needs the
|
||||
// subcommand to relaunch as 'llama serve' and not bare options
|
||||
#ifdef _WIN32
|
||||
_putenv_s("LLAMA_APP_CMD", cmd.name);
|
||||
#else
|
||||
setenv("LLAMA_APP_CMD", cmd.name, 1);
|
||||
#endif
|
||||
return cmd.func(argc - 1, argv + 1);
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "error: unknown command '%s'\n", arg.c_str());
|
||||
return 1;
|
||||
}
|
||||
@@ -7,6 +7,7 @@ VISIONOS_MIN_OS_VERSION=1.0
|
||||
TVOS_MIN_OS_VERSION=16.4
|
||||
|
||||
BUILD_SHARED_LIBS=OFF
|
||||
LLAMA_BUILD_APP=OFF
|
||||
LLAMA_BUILD_EXAMPLES=OFF
|
||||
LLAMA_BUILD_TOOLS=OFF
|
||||
LLAMA_BUILD_TESTS=OFF
|
||||
@@ -31,6 +32,7 @@ COMMON_CMAKE_ARGS=(
|
||||
-DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
-DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
|
||||
-DLLAMA_BUILD_APP=${LLAMA_BUILD_APP}
|
||||
-DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
|
||||
-DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
|
||||
-DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
|
||||
|
||||
10
ci/run.sh
10
ci/run.sh
@@ -238,7 +238,7 @@ function gg_run_ctest_debug {
|
||||
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time cmake --build . --config Debug -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
(time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops|test-llama-archs" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
|
||||
set +e
|
||||
}
|
||||
@@ -461,10 +461,10 @@ function gg_run_qwen3_0_6b {
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
|
||||
@@ -1334,12 +1334,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
add_opt(common_arg(
|
||||
{"-cpent", "--checkpoint-every-n-tokens"}, "N",
|
||||
string_format("create a checkpoint every n tokens during prefill (processing), -1 to disable (default: %d)", params.checkpoint_every_nt),
|
||||
{"-cms", "--checkpoint-min-step"}, "N",
|
||||
string_format("minimum spacing between context checkpoints in tokens (default: %d, 0 = no minimum)", params.checkpoint_min_step),
|
||||
[](common_params & params, int value) {
|
||||
params.checkpoint_every_nt = value;
|
||||
if (value < 0) {
|
||||
throw std::invalid_argument("checkpoint-min-step must be non-negative");
|
||||
}
|
||||
params.checkpoint_min_step = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_CHECKPOINT_EVERY_NT").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
).set_env("LLAMA_ARG_CHECKPOINT_MIN_SPACING_NT").set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"-cram", "--cache-ram"}, "N",
|
||||
string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
|
||||
@@ -3591,6 +3594,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.speculative.draft.p_min = std::stof(value);
|
||||
}
|
||||
).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_P_MIN"));
|
||||
add_opt(common_arg(
|
||||
{"--spec-draft-backend-sampling"},
|
||||
{"--no-spec-draft-backend-sampling"},
|
||||
string_format("offload draft sampling to the backend (default: %s)",
|
||||
params.speculative.draft.backend_sampling ? "enabled" : "disabled"),
|
||||
[](common_params & params, bool value) {
|
||||
params.speculative.draft.backend_sampling = value;
|
||||
}
|
||||
).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_BACKEND_SAMPLING"));
|
||||
add_opt(common_arg(
|
||||
{"--spec-draft-device", "-devd", "--device-draft"}, "<dev1,dev2,..>",
|
||||
"comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
|
||||
|
||||
@@ -310,6 +310,8 @@ std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segm
|
||||
|
||||
namespace autoparser {
|
||||
|
||||
static const std::string ERR_TMPL = "#**ERROR**#";
|
||||
|
||||
std::string apply_template(const common_chat_template & tmpl, const template_params & params) {
|
||||
generation_params tmpl_params;
|
||||
tmpl_params.messages = params.messages;
|
||||
@@ -326,7 +328,7 @@ std::string apply_template(const common_chat_template & tmpl, const template_par
|
||||
return common_chat_template_direct_apply(tmpl, tmpl_params);
|
||||
} catch (const std::exception & e) {
|
||||
LOG_DBG("Template application failed: %s\n", e.what());
|
||||
return "";
|
||||
return ERR_TMPL;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -347,7 +349,7 @@ std::optional<compare_variants_result> compare_variants(
|
||||
std::string output_B = apply_template(tmpl, params_B);
|
||||
|
||||
// Check for template application failures
|
||||
if (output_A.empty() || output_B.empty()) {
|
||||
if (output_A == ERR_TMPL || output_B == ERR_TMPL) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
|
||||
@@ -377,6 +377,8 @@ struct analyze_tools : analyze_base {
|
||||
|
||||
struct autoparser {
|
||||
jinja::caps jinja_caps;
|
||||
std::string user_start;
|
||||
std::string assistant_start;
|
||||
analyze_reasoning reasoning;
|
||||
analyze_content content;
|
||||
analyze_tools tools;
|
||||
@@ -387,6 +389,10 @@ struct autoparser {
|
||||
|
||||
autoparser() = default;
|
||||
|
||||
// Find the starting marker for the user message and assistant message
|
||||
std::string detect_user_start_marker(const common_chat_template & tmpl);
|
||||
std::string detect_assistant_start_marker(const common_chat_template & tmpl);
|
||||
|
||||
// Run full differential analysis on a template
|
||||
void analyze_template(const common_chat_template & tmpl);
|
||||
|
||||
|
||||
@@ -8,6 +8,9 @@
|
||||
#include "peg-parser.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <ostream>
|
||||
#include <sstream>
|
||||
|
||||
#define ANSI_RESET "\033[0m"
|
||||
#define ANSI_PURPLE "\033[1m\x1b[38;5;126m"
|
||||
@@ -23,6 +26,7 @@ static const std::string FUN_SECOND = "SSS_SECOND_FUN_S";
|
||||
static const std::string ARG_FIRST = "AA_ARG_FST_AA";
|
||||
static const std::string ARG_SECOND = "BB_ARG_SND_BB";
|
||||
static const std::string USER_MSG = "U_USER_MSG Hello END_U";
|
||||
static const std::string USER_MSG_TWO = "V_USER_MSG Hello END_V";
|
||||
static const std::string ASSISTANT_MSG = "A_ASST_MSG I can help END_A";
|
||||
static const std::string THINKING_CONTENT = "REASON_PART I am thinking END_R";
|
||||
static const std::string CALL_ID_001 = "call00001";
|
||||
@@ -71,6 +75,7 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
|
||||
analysis.content.end = "<|END_OF_TURN_TOKEN|>";
|
||||
analysis.preserved_tokens.push_back("<|CHATBOT_TOKEN|>");
|
||||
analysis.preserved_tokens.push_back("<|END_OF_TURN_TOKEN|>");
|
||||
analysis.user_start = "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>";
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: Cohere Command R+]\n" ANSI_RESET);
|
||||
}
|
||||
},
|
||||
@@ -108,7 +113,59 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
|
||||
analysis.tools.function.close = "```";
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: DeepSeek-R1-Distill-Qwen]\n" ANSI_RESET);
|
||||
}
|
||||
}
|
||||
},
|
||||
// Nemotron Nano v2
|
||||
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
|
||||
if (tmpl.src.find("<SPECIAL_10>") != std::string::npos && tmpl.src.find("<SPECIAL_11>") != std::string::npos &&
|
||||
tmpl.src.find("<SPECIAL_12>") != std::string::npos && tmpl.src.find("<TOOL_RESPONSE>") != std::string::npos) {
|
||||
|
||||
analysis.tools.format.mode = tool_format::JSON_NATIVE;
|
||||
analysis.tools.format.section_start = "";
|
||||
analysis.tools.format.section_end = "";
|
||||
analysis.tools.format.per_call_start = "<TOOLCALL>";
|
||||
analysis.tools.format.per_call_end = "</TOOLCALL>";
|
||||
analysis.content.mode = content_mode::PLAIN;
|
||||
analysis.content.start = "";
|
||||
analysis.content.end = "";
|
||||
analysis.reasoning.mode = reasoning_mode::TAG_BASED;
|
||||
analysis.reasoning.start = "<think>\n\n";
|
||||
analysis.reasoning.end = "</think>";
|
||||
analysis.assistant_start = "<SPECIAL_11>Assistant";
|
||||
analysis.user_start = "<SPECIAL_11>User";
|
||||
analysis.preserved_tokens.clear();
|
||||
analysis.preserved_tokens.push_back("<SPECIAL_12>");
|
||||
analysis.preserved_tokens.push_back("<SPECIAL_11>");
|
||||
analysis.preserved_tokens.push_back("</think>");
|
||||
analysis.preserved_tokens.push_back("<TOOLCALL>");
|
||||
analysis.preserved_tokens.push_back("</TOOLCALL>");
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: Nemotron Nano v2]\n" ANSI_RESET);
|
||||
}
|
||||
},
|
||||
// Fireworks
|
||||
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
|
||||
if (tmpl.src.find("{%- set system_prompt = '<|start_header_id|>' + 'system' + '<|end_header_id|>\\n\\n'"
|
||||
" + message['content'] | trim + '\\n' + system_prompt_suffix + '<|eot_id|>' -%}") != std::string::npos) {
|
||||
analysis.assistant_start = "<|start_header_id|>assistant<|end_header_id|>";
|
||||
analysis.user_start = "<|start_header_id|>user<|end_header_id|>";
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: Fireworks v2]\n" ANSI_RESET);
|
||||
}
|
||||
},
|
||||
// Solar Open
|
||||
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
|
||||
if (tmpl.src.find("<|begin|>assistant<|think|><|end|>") != std::string::npos) {
|
||||
analysis.assistant_start = "<|begin|>assistant";
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: Solar Open]\n" ANSI_RESET);
|
||||
}
|
||||
},
|
||||
// Apriel 1.6
|
||||
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
|
||||
if (tmpl.src.find("if not loop.last and '[BEGIN FINAL RESPONSE]' in asst_text") != std::string::npos) {
|
||||
analysis.user_start = "<|begin_user|>";
|
||||
analysis.assistant_start = "<|begin_assistant|>";
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: Apriel 1.6]\n" ANSI_RESET);
|
||||
}
|
||||
},
|
||||
|
||||
});
|
||||
|
||||
// Common JSON structures
|
||||
@@ -166,6 +223,8 @@ void autoparser::analyze_template(const common_chat_template & tmpl) {
|
||||
reasoning = analyze_reasoning(tmpl, jinja_caps.supports_tool_calls);
|
||||
content = analyze_content(tmpl, reasoning);
|
||||
tools = analyze_tools(jinja_caps.supports_tool_calls ? analyze_tools(tmpl, jinja_caps, reasoning) : analyze_tools());
|
||||
assistant_start = detect_assistant_start_marker(tmpl);
|
||||
user_start = detect_user_start_marker(tmpl);
|
||||
collect_preserved_tokens();
|
||||
|
||||
for (auto & workaround : workarounds) {
|
||||
@@ -173,6 +232,8 @@ void autoparser::analyze_template(const common_chat_template & tmpl) {
|
||||
}
|
||||
|
||||
LOG_DBG("\n--- Reasoning & Content Structure ---\n");
|
||||
LOG_DBG("user_msg_start: %s\n", user_start.c_str());
|
||||
LOG_DBG("assistant_msg_start: %s\n", assistant_start.c_str());
|
||||
LOG_DBG("reasoning_mode: %s\n", mode_to_str(reasoning.mode).c_str());
|
||||
LOG_DBG("reasoning_start: '%s'\n", reasoning.start.c_str());
|
||||
LOG_DBG("reasoning_end: '%s'\n", reasoning.end.c_str());
|
||||
@@ -245,6 +306,120 @@ void autoparser::collect_preserved_tokens() {
|
||||
add_token(tools.call_id.suffix);
|
||||
}
|
||||
|
||||
std::string autoparser::detect_assistant_start_marker(const common_chat_template & tmpl) {
|
||||
json user_msg = json{
|
||||
{ "role", "user" },
|
||||
{ "content", USER_MSG }
|
||||
};
|
||||
|
||||
json assistant_no_reasoning = json{
|
||||
{ "role", "assistant" },
|
||||
{ "content", ASSISTANT_MSG }
|
||||
};
|
||||
|
||||
template_params params;
|
||||
params.messages = json::array({ user_msg });
|
||||
params.add_generation_prompt = false;
|
||||
params.enable_thinking = true;
|
||||
|
||||
auto comparison = compare_variants(
|
||||
tmpl, params, [&](template_params & p) {
|
||||
p.messages = json::array({ user_msg, assistant_no_reasoning });
|
||||
}
|
||||
);
|
||||
|
||||
if (!comparison) {
|
||||
LOG_DBG(ANSI_ORANGE "%s: Template application failed, skipping assistant start detection\n" ANSI_RESET, __func__);
|
||||
return "";
|
||||
}
|
||||
|
||||
auto usermsg = comparison->diff.right;
|
||||
if (usermsg.find(ASSISTANT_MSG) == std::string::npos) {
|
||||
LOG_DBG(ANSI_ORANGE "%s: Did not find assistant message in assistant message block, skipping detection\n" ANSI_RESET, __func__);
|
||||
}
|
||||
|
||||
auto ast_prefix = usermsg.substr(0, usermsg.find(ASSISTANT_MSG));
|
||||
if (!reasoning.start.empty() && ast_prefix.find(trim_whitespace(reasoning.start)) != std::string::npos) {
|
||||
ast_prefix = ast_prefix.substr(0, ast_prefix.find(trim_whitespace(reasoning.start)));
|
||||
}
|
||||
if (!reasoning.end.empty() && ast_prefix.find(trim_whitespace(reasoning.end)) != std::string::npos) {
|
||||
ast_prefix = ast_prefix.substr(0, ast_prefix.find(trim_whitespace(reasoning.end)));
|
||||
}
|
||||
return trim_whitespace(ast_prefix);
|
||||
}
|
||||
|
||||
std::string autoparser::detect_user_start_marker(const common_chat_template & tmpl) {
|
||||
json user_msg = json{
|
||||
{ "role", "user" },
|
||||
{ "content", USER_MSG }
|
||||
};
|
||||
|
||||
json assistant = json{
|
||||
{ "role", "assistant" },
|
||||
{ "content", ASSISTANT_MSG }
|
||||
};
|
||||
|
||||
json user_msg_two = json{
|
||||
{ "role", "user" },
|
||||
{ "content", USER_MSG_TWO }
|
||||
};
|
||||
|
||||
template_params params;
|
||||
params.messages = json::array({});
|
||||
params.add_generation_prompt = false;
|
||||
params.enable_thinking = true;
|
||||
|
||||
auto comparison = compare_variants(
|
||||
tmpl, params, [&](template_params & p) {
|
||||
p.messages = json::array({ user_msg });
|
||||
}
|
||||
);
|
||||
|
||||
if (!comparison) {
|
||||
LOG_DBG(ANSI_ORANGE "%s: Template application failed, unsupported empty messages? trying complex variant\n" ANSI_RESET, __func__);
|
||||
params.messages = json::array({ user_msg_two, assistant });
|
||||
comparison = compare_variants(
|
||||
tmpl, params, [&](template_params & p) {
|
||||
p.messages = json::array({ user_msg_two, assistant, user_msg });
|
||||
}
|
||||
);
|
||||
if (!comparison) {
|
||||
LOG_DBG(ANSI_ORANGE "%s: Template application failed for reserve variant, aborting\n" ANSI_RESET, __func__);
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
auto usermsg = comparison->diff.right;
|
||||
if (usermsg.find(USER_MSG) == std::string::npos) {
|
||||
LOG_DBG(ANSI_ORANGE "%s: Did not find user message in user message block, aborting detection\n" ANSI_RESET, __func__);
|
||||
}
|
||||
|
||||
if (usermsg.find(ASSISTANT_MSG) != std::string::npos) {
|
||||
usermsg = usermsg.substr(usermsg.find(ASSISTANT_MSG) + ASSISTANT_MSG.size());
|
||||
}
|
||||
|
||||
auto candidate = usermsg.substr(0, usermsg.find(USER_MSG));
|
||||
auto candidate_split = segmentize_markers(candidate);
|
||||
std::stringstream result;
|
||||
bool encountered_marker = false;
|
||||
for (const auto & mrk : candidate_split) {
|
||||
std::string lower_mrk = std::string(mrk.value);
|
||||
std::transform(lower_mrk.begin(), lower_mrk.end(), lower_mrk.begin(),
|
||||
[](unsigned char c) { return std::tolower(c); });
|
||||
// heuristic to weed out potential end markers, but only at the start
|
||||
if (mrk.type == segment_type::MARKER && !encountered_marker &&
|
||||
(lower_mrk.find("end") != std::string::npos || lower_mrk.find("close") != std::string::npos)) {
|
||||
continue;
|
||||
}
|
||||
if (mrk.type == segment_type::TEXT && !encountered_marker && trim_whitespace(mrk.value).empty()) {
|
||||
continue;
|
||||
}
|
||||
encountered_marker |= mrk.type == segment_type::MARKER;
|
||||
result << mrk.value;
|
||||
}
|
||||
return trim_whitespace(result.str());
|
||||
}
|
||||
|
||||
analyze_reasoning::analyze_reasoning(const common_chat_template & tmpl, bool supports_tools)
|
||||
: analyze_base(tmpl) {
|
||||
LOG_DBG(ANSI_PURPLE "=== Starting differential analysis ===\n" ANSI_RESET);
|
||||
|
||||
@@ -90,6 +90,45 @@ std::string common_chat_msg::render_content(const std::string & delimiter) const
|
||||
return text;
|
||||
}
|
||||
|
||||
std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims) {
|
||||
if (delims.empty() || prompt.empty()) {
|
||||
return {};
|
||||
}
|
||||
|
||||
auto parser = build_peg_parser([&](common_peg_parser_builder & p) {
|
||||
std::vector<std::string> all_delims;
|
||||
std::vector<common_peg_parser> tagged_messages;
|
||||
|
||||
all_delims.reserve(delims.size());
|
||||
tagged_messages.reserve(delims.size());
|
||||
for (const auto & d : delims) {
|
||||
all_delims.push_back(d.delimiter);
|
||||
}
|
||||
|
||||
auto any_delim = p.until_one_of(all_delims);
|
||||
for (const auto & d : delims) {
|
||||
tagged_messages.push_back(p.tag(d.role, p.literal(d.delimiter) + any_delim));
|
||||
}
|
||||
|
||||
return any_delim + p.zero_or_more(p.choice(tagged_messages)) + p.end();
|
||||
});
|
||||
|
||||
common_peg_parse_context ctx(prompt);
|
||||
const auto result = parser.parse(ctx);
|
||||
if (!result.success()) {
|
||||
return {};
|
||||
}
|
||||
|
||||
std::vector<common_chat_msg_span> spans;
|
||||
ctx.ast.visit(result, [&](const common_peg_ast_node & node) {
|
||||
if (!node.tag.empty()) {
|
||||
spans.push_back({ node.tag, node.start, node.end - node.start });
|
||||
}
|
||||
});
|
||||
|
||||
return spans;
|
||||
}
|
||||
|
||||
json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const {
|
||||
if (!content.empty() && !content_parts.empty()) {
|
||||
throw std::runtime_error("Cannot specify both content and content_parts");
|
||||
@@ -1042,6 +1081,14 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
||||
|
||||
data.prompt = prompt;
|
||||
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, /* messages_override= */ adjusted_messages);
|
||||
data.message_spans = common_chat_split_by_role(prompt, {
|
||||
{ "assistant", "<|start|>assistant" },
|
||||
{ "user", "<|start|>user" },
|
||||
{ "system", "<|start|>developer" },
|
||||
{ "system", "<|start|>system" },
|
||||
{ "tool", "<|start|>functions" },
|
||||
});
|
||||
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.supports_thinking = true;
|
||||
|
||||
@@ -1181,6 +1228,11 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
|
||||
data.prompt += data.generation_prompt;
|
||||
}
|
||||
|
||||
data.message_spans = common_chat_split_by_role(data.prompt, {
|
||||
{ "user", "<|turn>user\n" },
|
||||
{ "assistant", "<|turn>model\n" },
|
||||
});
|
||||
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_GEMMA4;
|
||||
data.supports_thinking = true;
|
||||
data.thinking_start_tag = "<|channel>thought";
|
||||
@@ -2393,6 +2445,19 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
|
||||
struct autoparser::autoparser autoparser;
|
||||
autoparser.analyze_template(tmpl);
|
||||
auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
|
||||
|
||||
std::vector<common_chat_msg_delimiter> delimiters;
|
||||
if (!autoparser.assistant_start.empty()) {
|
||||
delimiters.push_back({ "assistant", autoparser.assistant_start });
|
||||
}
|
||||
if (!autoparser.user_start.empty()) {
|
||||
delimiters.push_back({ "user", autoparser.user_start });
|
||||
}
|
||||
|
||||
if (!delimiters.empty()) {
|
||||
auto_params.message_spans = common_chat_split_by_role(auto_params.prompt, delimiters);
|
||||
}
|
||||
|
||||
auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
|
||||
if (auto_params.supports_thinking) {
|
||||
auto_params.thinking_start_tag = trim_whitespace(autoparser.reasoning.start);
|
||||
|
||||
@@ -143,6 +143,17 @@ struct common_chat_msg_diff {
|
||||
}
|
||||
};
|
||||
|
||||
struct common_chat_msg_span {
|
||||
std::string role;
|
||||
std::size_t pos = 0;
|
||||
std::size_t len = 0;
|
||||
};
|
||||
|
||||
struct common_chat_msg_delimiter {
|
||||
std::string role;
|
||||
std::string delimiter;
|
||||
};
|
||||
|
||||
struct common_chat_tool {
|
||||
std::string name;
|
||||
std::string description;
|
||||
@@ -208,6 +219,7 @@ struct common_chat_params {
|
||||
std::vector<std::string> preserved_tokens;
|
||||
std::vector<std::string> additional_stops;
|
||||
std::string parser;
|
||||
std::vector<common_chat_msg_span> message_spans;
|
||||
};
|
||||
|
||||
// per-message parsing syntax
|
||||
@@ -219,6 +231,7 @@ struct common_chat_parser_params {
|
||||
bool reasoning_in_content = false;
|
||||
std::string generation_prompt;
|
||||
bool parse_tool_calls = true;
|
||||
bool is_continuation = false;
|
||||
bool echo = false; // Include assistant prefilled msg in output
|
||||
bool debug = false; // Enable debug output for PEG parser
|
||||
common_peg_arena parser = {};
|
||||
@@ -303,6 +316,7 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
|
||||
const std::string & src,
|
||||
autoparser::generation_params & params);
|
||||
|
||||
|
||||
// specialized per-task preset
|
||||
struct common_chat_prompt_preset {
|
||||
std::string system;
|
||||
@@ -310,3 +324,6 @@ struct common_chat_prompt_preset {
|
||||
};
|
||||
|
||||
common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);
|
||||
|
||||
std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims);
|
||||
|
||||
|
||||
@@ -445,6 +445,27 @@ std::string string_strip(const std::string & str) {
|
||||
return str.substr(start, end - start);
|
||||
}
|
||||
|
||||
std::string string_lcs(std::string_view a, std::string_view b) {
|
||||
if (a.empty() || b.empty()) return {};
|
||||
|
||||
std::vector<std::vector<size_t>> dp(a.size() + 1, std::vector<size_t>(b.size() + 1, 0));
|
||||
size_t best_len = 0;
|
||||
size_t best_end_a = 0;
|
||||
|
||||
for (size_t i = 1; i <= a.size(); ++i) {
|
||||
for (size_t j = 1; j <= b.size(); ++j) {
|
||||
if (a[i - 1] == b[j - 1]) {
|
||||
dp[i][j] = dp[i - 1][j - 1] + 1;
|
||||
if (dp[i][j] > best_len) {
|
||||
best_len = dp[i][j];
|
||||
best_end_a = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return std::string(a.substr(best_end_a - best_len, best_len));
|
||||
}
|
||||
|
||||
std::string string_get_sortable_timestamp() {
|
||||
using clock = std::chrono::system_clock;
|
||||
|
||||
|
||||
@@ -305,6 +305,8 @@ struct common_params_speculative_draft {
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
float p_min = 0.0f; // minimum speculative decoding probability (greedy)
|
||||
|
||||
bool backend_sampling = true; // offload draft sampling to the backend (default: on)
|
||||
|
||||
common_params_model mparams;
|
||||
|
||||
llama_context * ctx_tgt = nullptr;
|
||||
@@ -592,7 +594,7 @@ struct common_params {
|
||||
bool cache_prompt = true; // whether to enable prompt caching
|
||||
bool cache_idle_slots = true; // save and clear idle slots upon starting a new task
|
||||
int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot
|
||||
int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill
|
||||
int32_t checkpoint_min_step = 256; // minimum spacing between context checkpoints
|
||||
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
|
||||
|
||||
std::string hostname = "127.0.0.1";
|
||||
@@ -615,11 +617,7 @@ struct common_params {
|
||||
std::map<std::string, std::string> default_template_kwargs;
|
||||
|
||||
// UI configs
|
||||
#ifdef LLAMA_UI_DEFAULT_ENABLED
|
||||
bool ui = LLAMA_UI_DEFAULT_ENABLED != 0;
|
||||
#else
|
||||
bool ui = true; // default to enabled when not set
|
||||
#endif
|
||||
bool ui = true;
|
||||
|
||||
// Deprecated: use ui, ui_mcp_proxy, ui_config_json instead
|
||||
bool webui = ui;
|
||||
@@ -733,6 +731,7 @@ std::string string_format(const char * fmt, ...);
|
||||
|
||||
std::string string_strip(const std::string & str);
|
||||
std::string string_get_sortable_timestamp();
|
||||
std::string string_lcs(std::string_view a, std::string_view b);
|
||||
|
||||
std::string string_join(const std::vector<std::string> & values, const std::string & separator);
|
||||
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
|
||||
|
||||
@@ -26,7 +26,7 @@ class common_params_fit_exception : public std::runtime_error {
|
||||
using std::runtime_error::runtime_error;
|
||||
};
|
||||
|
||||
static std::vector<llama_device_memory_data> common_get_device_memory_data(
|
||||
std::vector<llama_device_memory_data> common_get_device_memory_data(
|
||||
const char * path_model,
|
||||
const llama_model_params * mparams,
|
||||
const llama_context_params * cparams,
|
||||
|
||||
16
common/fit.h
16
common/fit.h
@@ -1,6 +1,11 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "llama.h"
|
||||
#include "../src/llama-ext.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
enum common_params_fit_status {
|
||||
COMMON_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
|
||||
@@ -30,3 +35,14 @@ void common_fit_print(
|
||||
struct llama_context_params * cparams);
|
||||
|
||||
void common_memory_breakdown_print(const struct llama_context * ctx);
|
||||
|
||||
// Load a model + context with no_alloc and return the per-device memory breakdown.
|
||||
std::vector<llama_device_memory_data> common_get_device_memory_data(
|
||||
const char * path_model,
|
||||
const struct llama_model_params * mparams,
|
||||
const struct llama_context_params * cparams,
|
||||
std::vector<ggml_backend_dev_t> & devs,
|
||||
uint32_t & hp_ngl,
|
||||
uint32_t & hp_n_ctx_train,
|
||||
uint32_t & hp_n_expert,
|
||||
enum ggml_log_level log_level);
|
||||
|
||||
@@ -33,16 +33,15 @@ const std::map<std::string, common_speculative_type> common_speculative_type_fro
|
||||
};
|
||||
|
||||
static std::string common_speculative_get_devices_str(const std::vector<ggml_backend_dev_t> & devices) {
|
||||
if (devices.empty()) {
|
||||
return "default";
|
||||
}
|
||||
|
||||
std::string result;
|
||||
for (size_t i = 0; i < devices.size(); i++) {
|
||||
if (i > 0) result += ", ";
|
||||
if (devices[i] == nullptr) {
|
||||
continue;
|
||||
}
|
||||
if (!result.empty()) result += ", ";
|
||||
result += ggml_backend_dev_name(devices[i]);
|
||||
}
|
||||
return result;
|
||||
return result.empty() ? "default" : result;
|
||||
}
|
||||
|
||||
struct common_speculative_config {
|
||||
@@ -414,6 +413,9 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
|
||||
std::vector<common_sampler_ptr> smpls;
|
||||
|
||||
// backend sampler chain per seq, attached to ctx_dft
|
||||
std::vector<llama_sampler *> backend_chains;
|
||||
|
||||
int32_t n_embd = 0;
|
||||
|
||||
// Per-sequence cross-batch carryover: pair (h_p, x_{p+1}) at MTP pos p+1.
|
||||
@@ -445,7 +447,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
n_embd = llama_model_n_embd(llama_get_model(ctx_dft));
|
||||
|
||||
LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__);
|
||||
LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd);
|
||||
LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d, backend_sampling=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd, (int) this->params.backend_sampling);
|
||||
LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__,
|
||||
this->params.n_gpu_layers,
|
||||
ggml_type_name(this->params.cache_type_k),
|
||||
@@ -469,6 +471,22 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams));
|
||||
}
|
||||
|
||||
// offload draft sampling to the backend
|
||||
backend_chains.assign(n_seq, nullptr);
|
||||
if (this->params.backend_sampling) {
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
|
||||
llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
|
||||
llama_sampler_chain_add(chain, llama_sampler_init_top_k(10));
|
||||
|
||||
if (!llama_set_sampler(ctx_dft, seq_id, chain)) {
|
||||
LOG_WRN("%s: backend offload failed for seq_id=%d; using CPU sampler\n", __func__, (int) seq_id);
|
||||
llama_sampler_free(chain);
|
||||
chain = nullptr;
|
||||
}
|
||||
backend_chains[seq_id] = chain;
|
||||
}
|
||||
}
|
||||
|
||||
llama_set_embeddings_pre_norm(ctx_tgt, true, /*masked*/ false);
|
||||
llama_set_embeddings_pre_norm(ctx_dft, true, /*masked*/ true);
|
||||
|
||||
@@ -484,6 +502,18 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
}
|
||||
|
||||
~common_speculative_impl_draft_mtp() override {
|
||||
auto * ctx_dft = this->params.ctx_dft;
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) backend_chains.size(); ++seq_id) {
|
||||
if (backend_chains[seq_id] == nullptr) {
|
||||
continue;
|
||||
}
|
||||
if (ctx_dft) {
|
||||
llama_set_sampler(ctx_dft, seq_id, nullptr);
|
||||
}
|
||||
llama_sampler_free(backend_chains[seq_id]);
|
||||
}
|
||||
backend_chains.clear();
|
||||
|
||||
if (batch.token != nullptr) {
|
||||
free(batch.token);
|
||||
batch.token = nullptr;
|
||||
|
||||
@@ -1610,6 +1610,47 @@ class TextModel(ModelBase):
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _set_vocab_hybriddna(self):
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
|
||||
vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute]
|
||||
assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]
|
||||
|
||||
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]
|
||||
# k-mers can share text with a base-vocab BPE token (e.g. CCCCCC) and get
|
||||
# dropped by get_vocab(); a reserved marker suffix (U+E000) keeps each
|
||||
# k-mer's own id (llama.cpp strips it on detokenization)
|
||||
for kmer in tokenizer.kmers: # ty: ignore[unresolved-attribute]
|
||||
reverse_vocab[tokenizer.dna_token_to_id[kmer]] = kmer + "\ue000" # ty: ignore[unresolved-attribute]
|
||||
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
|
||||
added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]
|
||||
|
||||
tokens: list[str] = []
|
||||
toktypes: list[int] = []
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.UNUSED)
|
||||
else:
|
||||
token: str = reverse_vocab[i]
|
||||
if token in added_vocab:
|
||||
if added_tokens_decoder[i].special or self.does_token_look_special(token):
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
tokens.append(token)
|
||||
|
||||
tokpre = self.get_vocab_base_pre(tokenizer)
|
||||
self.gguf_writer.add_tokenizer_model("hybriddna")
|
||||
self.gguf_writer.add_tokenizer_pre(tokpre)
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _set_vocab_qwen(self):
|
||||
from .qwen import QwenModel
|
||||
|
||||
|
||||
@@ -189,7 +189,8 @@ class HunYuanModel(TextModel):
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
# HunyuanOCR has pad_token_id=-1 in config.json; exclude pad from SpecialVocab
|
||||
# Some HunYuanVL variants (e.g. OCR-style configs) have pad_token_id=-1;
|
||||
# guard SpecialVocab so it doesn't try to emit an invalid pad id.
|
||||
token_types = None
|
||||
if (self.hparams.get("pad_token_id") or 0) < 0:
|
||||
token_types = ('bos', 'eos', 'unk', 'sep', 'cls', 'mask')
|
||||
@@ -250,7 +251,8 @@ class HunYuanModel(TextModel):
|
||||
self._fix_special_tokens()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
# HunyuanOCR has num_experts=1 which is not MoE, prevent parent from writing it
|
||||
# Some HunYuanVL variants set num_experts=1 (not real MoE);
|
||||
# prevent the parent class from emitting expert_count metadata in that case.
|
||||
saved_num_experts = self.hparams.pop("num_experts", None)
|
||||
super().set_gguf_parameters()
|
||||
if saved_num_experts is not None and saved_num_experts > 1:
|
||||
@@ -288,51 +290,21 @@ class HunYuanModel(TextModel):
|
||||
|
||||
@ModelBase.register("HunYuanVLForConditionalGeneration")
|
||||
class HunyuanVLVisionModel(MmprojModel):
|
||||
# Handles both HunyuanOCR and HunyuanVL, which share the HF architecture name
|
||||
# "HunYuanVLForConditionalGeneration" and the `vit.perceive.*` vision layout.
|
||||
# Each variant maps to a different projector type in clip.cpp so image
|
||||
# preprocessing follows the correct code path.
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
assert self.hparams_vision is not None
|
||||
# HunyuanOCR / HunyuanVL uses max_image_size instead of image_size
|
||||
# HunyuanVL uses max_image_size instead of image_size
|
||||
if "image_size" not in self.hparams_vision:
|
||||
self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)
|
||||
|
||||
@staticmethod
|
||||
def is_ocr_variant(hparams: dict) -> bool:
|
||||
"""Return True for HunyuanOCR, False for HunyuanVL.
|
||||
|
||||
The projector's output dim must equal the text model's hidden_size by
|
||||
construction (that's what "projector" means). HunyuanOCR pairs a 1B text
|
||||
backbone (hidden=1024); HunyuanVL pairs a 4B one (hidden=3072). So the
|
||||
ViT -> LLM projection dim is a hard architectural signature, not a
|
||||
magic number.
|
||||
"""
|
||||
vision_out = int((hparams.get("vision_config") or {}).get("out_hidden_size", 0))
|
||||
return vision_out == 1024
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
assert self.hparams_vision is not None
|
||||
vcfg = self.hparams_vision
|
||||
|
||||
if self.is_ocr_variant(self.global_config):
|
||||
# --- HunyuanOCR ---
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5))
|
||||
self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2))
|
||||
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
|
||||
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
|
||||
return
|
||||
|
||||
# --- HunyuanVL ---
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL)
|
||||
self.gguf_writer.add_vision_use_gelu(str(vcfg["hidden_act"]).lower() == "gelu")
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(float(vcfg["rms_norm_eps"]))
|
||||
self.gguf_writer.add_vision_spatial_merge_size(int(vcfg["spatial_merge_size"]))
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5))
|
||||
self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2))
|
||||
self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"]))
|
||||
self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"]))
|
||||
|
||||
@@ -353,7 +325,7 @@ class HunyuanVLVisionModel(MmprojModel):
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
# force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal
|
||||
# Both HunyuanOCR and HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2.
|
||||
# HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2.
|
||||
if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"):
|
||||
return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
|
||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
@@ -361,40 +333,18 @@ class HunyuanVLVisionModel(MmprojModel):
|
||||
|
||||
@ModelBase.register("HunYuanVLForConditionalGeneration")
|
||||
class HunyuanVLTextModel(HunYuanModel):
|
||||
# The "HunYuanVLForConditionalGeneration" HF architecture covers both HunyuanOCR
|
||||
# and HunyuanVL. HunyuanOCR reuses the HunYuan-Dense text backbone (standard RoPE),
|
||||
# while HunyuanVL introduces a new LLM arch with XD-RoPE. Detect the variant from
|
||||
# the config and pick the matching GGUF architecture.
|
||||
model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
|
||||
|
||||
@staticmethod
|
||||
def _is_ocr_config(hparams: dict) -> bool:
|
||||
# OCR pairs a 1B text backbone (hidden=1024) with a ViT projector that
|
||||
# outputs 1024-d; HunyuanVL uses 3072-d. Keep in sync with
|
||||
# HunyuanVLVisionModel.is_ocr_variant.
|
||||
return int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) == 1024
|
||||
|
||||
def __init__(self, dir_model: Path, *args, **kwargs):
|
||||
raw_hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model, is_mistral_format=False)
|
||||
if self._is_ocr_config(raw_hparams):
|
||||
self.model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
|
||||
else:
|
||||
self.model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
|
||||
super().__init__(dir_model, *args, **kwargs)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
# Only emit XD-RoPE metadata for the HunyuanVL backbone; HunyuanOCR uses
|
||||
# the HunYuan-Dense arch which already handles standard rope in super().
|
||||
if self.model_arch != gguf.MODEL_ARCH.HUNYUAN_VL:
|
||||
return
|
||||
|
||||
# XD-RoPE metadata for the HunyuanVL;
|
||||
if self.rope_parameters.get("rope_type") != "xdrope":
|
||||
return
|
||||
|
||||
# defaults for HunyuanVL. The C++ side later computes:
|
||||
# freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2))
|
||||
self.gguf_writer.add_rope_freq_base(float(self.rope_parameters["rope_theta"]))
|
||||
self.gguf_writer.add_rope_scaling_alpha(float(self.rope_parameters["alpha"]))
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
|
||||
@@ -51,6 +51,15 @@ class LlamaModel(TextModel):
|
||||
if path_tekken_json.is_file() and not path_tokenizer_json.is_file():
|
||||
self._set_vocab_mistral()
|
||||
|
||||
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||
if tokenizer_config_file.is_file():
|
||||
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_config_json = json.load(f)
|
||||
if (add_prefix_space := tokenizer_config_json.get("add_prefix_space")) is not None:
|
||||
self.gguf_writer.add_add_space_prefix(add_prefix_space)
|
||||
if tokenizer_config_json.get("tokenizer_class") == "HybridDNATokenizer":
|
||||
return self._set_vocab_hybriddna()
|
||||
|
||||
try:
|
||||
self._set_vocab_sentencepiece()
|
||||
except FileNotFoundError:
|
||||
@@ -72,13 +81,6 @@ class LlamaModel(TextModel):
|
||||
special_vocab._set_special_token("eot", 32010)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||
if tokenizer_config_file.is_file():
|
||||
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_config_json = json.load(f)
|
||||
if "add_prefix_space" in tokenizer_config_json:
|
||||
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
|
||||
|
||||
# Apply to granite small models only
|
||||
if self.hparams.get("vocab_size", 32000) == 49152:
|
||||
self.gguf_writer.add_add_bos_token(False)
|
||||
|
||||
@@ -489,6 +489,7 @@ The following templates have active tests in `tests/test-chat.cpp`:
|
||||
| Qwen-QwQ-32B | Reasoning | Forced-open thinking |
|
||||
| NousResearch Hermes 2 Pro | JSON_NATIVE | `<tool_call>` wrapper |
|
||||
| IBM Granite 3.3 | JSON_NATIVE | `<think></think>` + `<response></response>` |
|
||||
| IBM Granite 4.0 | JSON_NATIVE | `<tool_call>` wrapper (same template used by 4.1) |
|
||||
| ByteDance Seed-OSS | TAG_WITH_TAGGED | Custom `<seed:think>` and `<seed:tool_call>` tags |
|
||||
| Qwen3-Coder | TAG_WITH_TAGGED | XML-style tool format |
|
||||
| DeepSeek V3.1 | JSON_NATIVE | Forced thinking mode |
|
||||
|
||||
@@ -10,8 +10,8 @@
|
||||
"ANDROID_ABI": "arm64-v8a",
|
||||
"ANDROID_PLATFORM": "android-31",
|
||||
"CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake",
|
||||
"CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
|
||||
"CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
|
||||
"CMAKE_C_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
|
||||
"CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
|
||||
"CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
||||
"CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
||||
"CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
|
||||
@@ -33,8 +33,8 @@
|
||||
"name": "arm64-windows-snapdragon",
|
||||
"inherits": [ "base", "arm64-windows-llvm" ],
|
||||
"cacheVariables": {
|
||||
"CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
|
||||
"CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
|
||||
"CMAKE_C_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
|
||||
"CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
|
||||
"CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
||||
"CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
||||
"CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
|
||||
@@ -59,8 +59,8 @@
|
||||
"toolset": { "value": "host=x86_64", "strategy": "external" },
|
||||
"cacheVariables": {
|
||||
"CMAKE_TOOLCHAIN_FILE": "cmake/arm64-linux-clang.cmake",
|
||||
"CMAKE_C_FLAGS": "-march=armv8 -fno-finite-math-only -flto -D_GNU_SOURCE",
|
||||
"CMAKE_CXX_FLAGS": "-march=armv8 -fno-finite-math-only -flto -D_GNU_SOURCE",
|
||||
"CMAKE_C_FLAGS": "-march=armv8.2a+fp16+dotprod -fvectorize -fno-finite-math-only -flto -D_GNU_SOURCE",
|
||||
"CMAKE_CXX_FLAGS": "-march=armv8.2a+fp16+dotprod -fvectorize -fno-finite-math-only -flto -D_GNU_SOURCE",
|
||||
"CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
||||
"CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
||||
"CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
|
||||
|
||||
@@ -10,7 +10,7 @@ This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc.
|
||||
This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop.
|
||||
|
||||
```
|
||||
~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.3
|
||||
~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.6
|
||||
[d]/> cd /workspace
|
||||
```
|
||||
|
||||
@@ -24,7 +24,7 @@ Native Windows 11 arm64 builds has the following tools dependencies:
|
||||
- UCRT and Driver Kit
|
||||
- LLVM core libraries and Clang compiler (winget)
|
||||
- CMake, Git, Python (winget)
|
||||
- Hexagon SDK Community Edition 6.4 or later (see windows.md)
|
||||
- Hexagon SDK Community Edition 6.6 or later (see windows.md)
|
||||
- OpenCL SDK 2.3 or later (see windows.md)
|
||||
|
||||
Note: The rest of the **Windows** build process assumes that you're running natively in Powershell.
|
||||
@@ -45,7 +45,7 @@ Preset CMake variables:
|
||||
GGML_HEXAGON="ON"
|
||||
GGML_OPENCL="ON"
|
||||
GGML_OPENMP="OFF"
|
||||
HEXAGON_SDK_ROOT="/opt/hexagon/6.4.0.2"
|
||||
HEXAGON_SDK_ROOT="/opt/hexagon/6.6.0.0"
|
||||
...
|
||||
-- Including OpenCL backend
|
||||
-- Including Hexagon backend
|
||||
|
||||
@@ -28,15 +28,15 @@ c:\Qualcomm\OpenCL_SDK\2.3.2
|
||||
|
||||
Either use the trimmed down version (optimized for CI) from
|
||||
|
||||
https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz
|
||||
https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.6.0.0/hexagon-sdk-v6.6.0.0-arm64-wos.tar.xz
|
||||
|
||||
Or download the complete official version from
|
||||
|
||||
https://softwarecenter.qualcomm.com/catalog/item/Hexagon_SDK?version=6.4.0.2
|
||||
https://softwarecenter.qualcomm.com/catalog/item/Hexagon_SDK?version=6.6.0.0
|
||||
|
||||
Unzip/untar the archive into
|
||||
```
|
||||
c:\Qualcomm\Hexagon_SDK\6.4.0.2
|
||||
c:\Qualcomm\Hexagon_SDK\6.6.0.0
|
||||
```
|
||||
|
||||
## Install the latest Adreno GPU driver
|
||||
@@ -123,10 +123,10 @@ The overall Hexagon backend build procedure for Windows on Snapdragon is the sam
|
||||
However, additional settings are required for generating and signing HTP Ops libraries.
|
||||
```
|
||||
> $env:OPENCL_SDK_ROOT="C:\Qualcomm\OpenCL_SDK\2.3.2"
|
||||
> $env:HEXAGON_SDK_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2"
|
||||
> $env:HEXAGON_TOOLS_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2\tools\HEXAGON_Tools\19.0.04"
|
||||
> $env:HEXAGON_SDK_ROOT="C:\Qualcomm\Hexagon_SDK\6.6.0.0"
|
||||
> $env:HEXAGON_TOOLS_ROOT="C:\Qualcomm\Hexagon_SDK\6.6.0.0\tools\HEXAGON_Tools\19.0.07"
|
||||
> $env:HEXAGON_HTP_CERT="c:\Users\MyUsers\Certs\ggml-htp-v1.pfx"
|
||||
> $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0\arm64"
|
||||
> $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0"
|
||||
|
||||
> cmake --preset arm64-windows-snapdragon-release -B build-wos
|
||||
...
|
||||
|
||||
@@ -735,7 +735,7 @@ ninja
|
||||
|
||||
To read documentation for how to build on Android, [click here](./android.md)
|
||||
|
||||
## WebGPU [In Progress]
|
||||
## WebGPU
|
||||
|
||||
The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `18eb229`.
|
||||
|
||||
|
||||
@@ -291,6 +291,7 @@ Here are some models known to work (w/ chat template override when needed):
|
||||
llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M
|
||||
llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q6_K_L
|
||||
llama-server --jinja -fa -hf bartowski/Llama-3.3-70B-Instruct-GGUF:Q4_K_M
|
||||
llama-server --jinja -fa -hf ibm-granite/granite-4.1-3b-GGUF:Q4_K_M
|
||||
|
||||
# Native support for DeepSeek R1 works best w/ our template override (official template is buggy, although we do work around it)
|
||||
|
||||
|
||||
@@ -247,7 +247,7 @@ Specifies a comma-separated list of speculative decoding types to use.
|
||||
|------|-------------|
|
||||
| `none` | No speculative decoding (default) |
|
||||
| `draft-simple` | Use a simple draft model for speculation |
|
||||
| `draft-mtp` | Use Masked Token Prediction (MTP) heads from the main model |
|
||||
| `draft-mtp` | Use Multi Token Prediction (MTP) heads from the main model |
|
||||
| `ngram-cache` | Use n-gram cache lookup |
|
||||
| `ngram-simple` | Use simple n-gram pattern matching |
|
||||
| `ngram-map-k` | Use n-gram pattern matching with n-gram-keys |
|
||||
|
||||
@@ -27,7 +27,6 @@ else()
|
||||
add_subdirectory(parallel)
|
||||
add_subdirectory(passkey)
|
||||
add_subdirectory(retrieval)
|
||||
add_subdirectory(save-load-state)
|
||||
add_subdirectory(simple)
|
||||
add_subdirectory(simple-chat)
|
||||
add_subdirectory(speculative)
|
||||
|
||||
@@ -1308,7 +1308,8 @@ def do_dump_model(model_plus: ModelPlus) -> None:
|
||||
|
||||
def main(args_in: list[str] | None = None) -> None:
|
||||
output_choices = ["f32", "f16"]
|
||||
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
|
||||
dummy_val = np.uint32(1)
|
||||
if dummy_val == dummy_val.view(dummy_val.dtype.newbyteorder("<")):
|
||||
# We currently only support Q8_0 output on little endian systems.
|
||||
output_choices.append("q8_0")
|
||||
parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
|
||||
|
||||
@@ -25,6 +25,7 @@ android {
|
||||
arguments += "-DCMAKE_VERBOSE_MAKEFILE=ON"
|
||||
|
||||
arguments += "-DBUILD_SHARED_LIBS=ON"
|
||||
arguments += "-DLLAMA_BUILD_APP=OFF"
|
||||
arguments += "-DLLAMA_BUILD_COMMON=ON"
|
||||
arguments += "-DLLAMA_OPENSSL=OFF"
|
||||
|
||||
|
||||
@@ -64,7 +64,7 @@ def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device
|
||||
print("Using SentenceTransformer to apply all numbered layers")
|
||||
model = SentenceTransformer(model_path)
|
||||
tokenizer = model.tokenizer
|
||||
config = model[0].auto_model.config
|
||||
config = model[0].auto_model.config # ty: ignore[unresolved-attribute]
|
||||
else:
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
||||
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
set(TARGET llama-save-load-state)
|
||||
add_executable(${TARGET} save-load-state.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
@@ -6,6 +6,7 @@
|
||||
include(CMakeFindDependencyMacro)
|
||||
find_dependency(Threads)
|
||||
if (NOT GGML_SHARED_LIB)
|
||||
set(GGML_BASE_INTERFACE_LINK_LIBRARIES "")
|
||||
set(GGML_CPU_INTERFACE_LINK_LIBRARIES "")
|
||||
set(GGML_CPU_INTERFACE_LINK_OPTIONS "")
|
||||
|
||||
@@ -20,7 +21,15 @@ if (NOT GGML_SHARED_LIB)
|
||||
|
||||
if (GGML_OPENMP_ENABLED)
|
||||
find_dependency(OpenMP)
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
set(GGML_OPENMP_INTERFACE_LINK_LIBRARIES "")
|
||||
if (TARGET OpenMP::OpenMP_C)
|
||||
list(APPEND GGML_OPENMP_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C)
|
||||
endif()
|
||||
if (TARGET OpenMP::OpenMP_CXX)
|
||||
list(APPEND GGML_OPENMP_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_CXX)
|
||||
endif()
|
||||
list(APPEND GGML_BASE_INTERFACE_LINK_LIBRARIES ${GGML_OPENMP_INTERFACE_LINK_LIBRARIES})
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${GGML_OPENMP_INTERFACE_LINK_LIBRARIES})
|
||||
endif()
|
||||
|
||||
if (GGML_CPU_HBM)
|
||||
@@ -122,7 +131,8 @@ if(NOT TARGET ggml::ggml)
|
||||
add_library(ggml::ggml-base UNKNOWN IMPORTED)
|
||||
set_target_properties(ggml::ggml-base
|
||||
PROPERTIES
|
||||
IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
|
||||
IMPORTED_LOCATION "${GGML_BASE_LIBRARY}"
|
||||
INTERFACE_LINK_LIBRARIES "${GGML_BASE_INTERFACE_LINK_LIBRARIES}")
|
||||
|
||||
set(_ggml_all_targets "")
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
|
||||
@@ -76,6 +76,7 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i
|
||||
// Utils
|
||||
// Create a buffer and allocate all the tensors in a ggml_context
|
||||
// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
|
||||
// ggml_backend_alloc_ctx_tensors_from_buft returns NULL on failure or if all tensors in ctx are already allocated or zero-sized
|
||||
GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
|
||||
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
|
||||
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
|
||||
|
||||
@@ -222,6 +222,23 @@ if (GGML_SCHED_NO_REALLOC)
|
||||
target_compile_definitions(ggml-base PUBLIC GGML_SCHED_NO_REALLOC)
|
||||
endif()
|
||||
|
||||
if (GGML_OPENMP)
|
||||
find_package(OpenMP)
|
||||
if (OpenMP_FOUND)
|
||||
set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "")
|
||||
else()
|
||||
set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
|
||||
message(WARNING "OpenMP not found")
|
||||
endif()
|
||||
else()
|
||||
set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
|
||||
endif()
|
||||
|
||||
if (GGML_OPENMP_ENABLED)
|
||||
target_compile_definitions(ggml-base PRIVATE GGML_USE_OPENMP)
|
||||
target_link_libraries(ggml-base PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
endif()
|
||||
|
||||
add_library(ggml
|
||||
ggml-backend-dl.cpp
|
||||
ggml-backend-reg.cpp)
|
||||
|
||||
@@ -1275,6 +1275,9 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
|
||||
if (chunk_size_j == 0) {
|
||||
continue;
|
||||
}
|
||||
const size_t simple_offset = i_start * chunk_size_j;
|
||||
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_j, simple_offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full);
|
||||
offset_j += chunk_size_j;
|
||||
@@ -1382,6 +1385,9 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
|
||||
for (size_t j = 0; j < n_bufs; j++){
|
||||
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
|
||||
if (chunk_size_j == 0) {
|
||||
continue;
|
||||
}
|
||||
const size_t simple_offset = i_start * chunk_size_j;
|
||||
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_j, simple_offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full);
|
||||
offset_j += chunk_size_j;
|
||||
@@ -1445,6 +1451,7 @@ static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_bac
|
||||
buf_ctx->buf_configs.reserve(n_simple_bufts);
|
||||
for (size_t i = 0; i < n_simple_bufts; i++) {
|
||||
ggml_backend_buffer_t simple_buf = ggml_backend_buft_alloc_buffer(ggml_backend_meta_buft_simple_buft(buft, i), size);
|
||||
GGML_ASSERT(simple_buf != nullptr);
|
||||
max_size = std::max(max_size, ggml_backend_buffer_get_size(simple_buf));
|
||||
buf_ctx->buf_configs.emplace_back(ggml_init(params), simple_buf);
|
||||
}
|
||||
@@ -1474,8 +1481,27 @@ struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struc
|
||||
t->data = (void *) 0x2000000000000000; // FIXME
|
||||
}
|
||||
for (size_t i = 0; i < n_simple_bufts; i++) {
|
||||
meta_buf_ctx->buf_configs[i].buf = ggml_backend_alloc_ctx_tensors_from_buft(
|
||||
meta_buf_ctx->buf_configs[i].ctx, ggml_backend_meta_buft_simple_buft(buft, i));
|
||||
ggml_context * ctx = meta_buf_ctx->buf_configs[i].ctx;
|
||||
ggml_backend_buffer_type_t simple_buft = ggml_backend_meta_buft_simple_buft(buft, i);
|
||||
|
||||
// If a ggml_context only has zero-sized tensors, ggml_backend_alloc_ctx_tensors_from_buft returns NULL.
|
||||
// For those edge cases, allocate a dummy buffer instead.
|
||||
bool any_nonzero_slice = false;
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
||||
if (ggml_nelements(t) != 0) {
|
||||
any_nonzero_slice = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (any_nonzero_slice) {
|
||||
meta_buf_ctx->buf_configs[i].buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, simple_buft);
|
||||
} else {
|
||||
meta_buf_ctx->buf_configs[i].buf = ggml_backend_buft_alloc_buffer(simple_buft, 0);
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
||||
t->buffer = meta_buf_ctx->buf_configs[i].buf;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(meta_buf_ctx->buf_configs[i].buf != nullptr);
|
||||
meta_buf->size = std::max(meta_buf->size, ggml_backend_buffer_get_size(meta_buf_ctx->buf_configs[i].buf));
|
||||
}
|
||||
return meta_buf;
|
||||
@@ -1605,6 +1631,9 @@ static void ggml_backend_meta_set_tensor_async(ggml_backend_t backend, ggml_tens
|
||||
ggml_backend_t simple_backend = ggml_backend_meta_simple_backend(backend, j);
|
||||
ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
|
||||
if (chunk_size_j == 0) {
|
||||
continue;
|
||||
}
|
||||
ggml_backend_tensor_set_2d_async(simple_backend, simple_tensor, (const char *) data + offset_j, offset, chunk_size_j,
|
||||
i_stop - i_start, chunk_size_j, chunk_size_full);
|
||||
offset_j += chunk_size_j;
|
||||
@@ -1646,6 +1675,9 @@ static void ggml_backend_meta_get_tensor_async(ggml_backend_t backend, const ggm
|
||||
ggml_backend_t simple_backend = ggml_backend_meta_simple_backend(backend, j);
|
||||
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
|
||||
if (chunk_size_j == 0) {
|
||||
continue;
|
||||
}
|
||||
ggml_backend_tensor_get_2d_async(simple_backend, simple_tensor, (char *) data + offset_j, offset, chunk_size_j,
|
||||
i_stop - i_start, chunk_size_j, chunk_size_full);
|
||||
offset_j += chunk_size_j;
|
||||
|
||||
@@ -306,7 +306,7 @@ void ggml_backend_tensor_get_2d_async(ggml_backend_t backend, const struct ggml_
|
||||
GGML_ASSERT(tensor);
|
||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||
|
||||
if (n_copies <= 1 || backend->iface.set_tensor_2d_async == NULL) {
|
||||
if (n_copies <= 1 || backend->iface.get_tensor_2d_async == NULL) {
|
||||
for (size_t i = 0; i < n_copies; i++) {
|
||||
ggml_backend_tensor_get_async(backend, tensor, (char *) data + i*stride_data, offset + i*stride_tensor, size);
|
||||
}
|
||||
@@ -317,7 +317,7 @@ void ggml_backend_tensor_get_2d_async(ggml_backend_t backend, const struct ggml_
|
||||
}
|
||||
|
||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||
GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
||||
GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
||||
backend->iface.get_tensor_2d_async(backend, tensor, data, offset, size, n_copies, stride_tensor, stride_data);
|
||||
}
|
||||
|
||||
@@ -379,7 +379,7 @@ void ggml_backend_tensor_get_2d(const struct ggml_tensor * tensor, void * data,
|
||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
||||
|
||||
if (n_copies <= 1 || buf->iface.set_tensor_2d == NULL) {
|
||||
if (n_copies <= 1 || buf->iface.get_tensor_2d == NULL) {
|
||||
for (size_t i = 0; i < n_copies; i++) {
|
||||
ggml_backend_tensor_get(tensor, (char *) data + i*stride_data, offset + i*stride_tensor, size);
|
||||
}
|
||||
|
||||
@@ -72,17 +72,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_OPENMP)
|
||||
find_package(OpenMP)
|
||||
if (OpenMP_FOUND)
|
||||
set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "")
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
|
||||
|
||||
target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
else()
|
||||
set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
|
||||
message(WARNING "OpenMP not found")
|
||||
endif()
|
||||
if (GGML_OPENMP_ENABLED)
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
|
||||
target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
endif()
|
||||
|
||||
if (GGML_LLAMAFILE)
|
||||
|
||||
@@ -15,6 +15,7 @@ if (CUDAToolkit_FOUND)
|
||||
# 80 == Ampere, asynchronous data loading, faster tensor core instructions
|
||||
# 86 == RTX 3000, needs CUDA v11.1
|
||||
# 89 == RTX 4000, needs CUDA v11.8
|
||||
# 90 == Hopper H100/200, needs CUDA v11.8
|
||||
# 120 == Blackwell, needs CUDA v12.8, FP4 tensor cores
|
||||
#
|
||||
# XX-virtual == compile CUDA code as PTX, do JIT compilation to binary code on first run
|
||||
@@ -33,7 +34,7 @@ if (CUDAToolkit_FOUND)
|
||||
list(APPEND CMAKE_CUDA_ARCHITECTURES 75-virtual 80-virtual 86-real)
|
||||
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
|
||||
list(APPEND CMAKE_CUDA_ARCHITECTURES 89-real)
|
||||
list(APPEND CMAKE_CUDA_ARCHITECTURES 89-real 90-virtual)
|
||||
endif()
|
||||
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
|
||||
|
||||
@@ -2,6 +2,9 @@
|
||||
#include <cstdint>
|
||||
#include <utility>
|
||||
|
||||
template<typename T, size_t>
|
||||
using type_for_index = T;
|
||||
|
||||
static __device__ __forceinline__ float op_repeat(const float a, const float b) {
|
||||
return b;
|
||||
GGML_UNUSED(a);
|
||||
@@ -52,6 +55,7 @@ static __global__ void k_bin_bcast(const src0_t * src0,
|
||||
const int s12,
|
||||
const int s13,
|
||||
src1_ptrs... src1s) {
|
||||
ggml_cuda_pdl_lc();
|
||||
const uint32_t i0s = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const uint32_t i1 = (blockDim.y * blockIdx.y + threadIdx.y);
|
||||
const uint32_t i2 = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3);
|
||||
@@ -72,6 +76,7 @@ static __global__ void k_bin_bcast(const src0_t * src0,
|
||||
const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
|
||||
dst_t * dst_row = dst + i_dst;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
for (int i0 = i0s; i0 < ne0; i0 += blockDim.x * gridDim.x) {
|
||||
const uint32_t i10 = fastmodulo(i0, ne10);
|
||||
|
||||
@@ -141,6 +146,7 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0,
|
||||
|
||||
const int i10 = fastmodulo(i0, ne10);
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
float result = src0_row ? (float) src0_row[i0*s00] : 0.0f;
|
||||
if constexpr (sizeof...(src1_ptrs) > 0) {
|
||||
result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10])));
|
||||
@@ -282,35 +288,24 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
|
||||
const uint3 ne1_fastdiv = init_fastdiv_values((uint32_t) ne1);
|
||||
const uint3 ne2_fastdiv = init_fastdiv_values((uint32_t) ne2);
|
||||
|
||||
if constexpr (sizeof...(I) > 0) {
|
||||
k_bin_bcast_unravel<bin_op, src0_t, src1_t, dst_t><<<block_num, block_size, 0, stream>>>(
|
||||
{
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)block_num, block_size, 0, stream);
|
||||
ggml_cuda_kernel_launch(k_bin_bcast_unravel<bin_op, src0_t, src1_t, dst_t, type_for_index<const src1_t *, I>...>, launch_params,
|
||||
src0_dd, src1_dd, dst_dd, ne0_fastdiv, ne1_fastdiv, ne2_fastdiv, ne3, prod_012, prod_01, ne10, ne11,
|
||||
ne12, ne13,
|
||||
/*s0,*/ s1, s2, s3,
|
||||
s00, s01, s02, s03,
|
||||
s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
|
||||
} else {
|
||||
k_bin_bcast_unravel<bin_op, src0_t, src1_t, dst_t>
|
||||
<<<block_num, block_size, 0, stream>>>(src0_dd, src1_dd, dst_dd, ne0_fastdiv, ne1_fastdiv,
|
||||
ne2_fastdiv, ne3, prod_012, prod_01, ne10, ne11, ne12, ne13,
|
||||
/*s0,*/ s1, s2, s3,
|
||||
s00, s01, s02, s03,
|
||||
s10, s11, s12, s13);
|
||||
}
|
||||
} else {
|
||||
const uint3 ne3_fastdiv = init_fastdiv_values((uint32_t) ne3);
|
||||
if constexpr (sizeof...(I) > 0) {
|
||||
k_bin_bcast<bin_op, src0_t, src1_t, dst_t><<<block_nums, block_dims, 0, stream>>>(
|
||||
{
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
|
||||
ggml_cuda_kernel_launch(k_bin_bcast<bin_op, src0_t, src1_t, dst_t, type_for_index<const src1_t *, I>...>, launch_params,
|
||||
src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13,
|
||||
/*s0,*/ s1, s2, s3,
|
||||
s00 ,s01, s02, s03,
|
||||
s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
|
||||
} else {
|
||||
k_bin_bcast<bin_op, src0_t, src1_t, dst_t><<<block_nums, block_dims, 0, stream>>>(
|
||||
src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13,
|
||||
/*s0,*/ s1, s2, s3,
|
||||
s00, s01, s02, s03,
|
||||
s10, s11, s12, s13);
|
||||
s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -333,6 +328,7 @@ static __global__ void k_repeat_back(
|
||||
}
|
||||
|
||||
T sum = 0;
|
||||
ggml_cuda_pdl_sync();
|
||||
for (int64_t i3 = tid3; i3 < ne03; i3 += ne3) {
|
||||
for (int64_t i2 = tid2; i2 < ne02; i2 += ne2) {
|
||||
for (int64_t i1 = tid1; i1 < ne01; i1 += ne1) {
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
#include "ggml-cuda.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <memory>
|
||||
|
||||
#if defined(GGML_USE_HIP)
|
||||
@@ -27,6 +28,7 @@
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#if defined(GGML_USE_HIP)
|
||||
@@ -50,6 +52,7 @@
|
||||
#define GGML_CUDA_CC_TURING 750
|
||||
#define GGML_CUDA_CC_AMPERE 800
|
||||
#define GGML_CUDA_CC_ADA_LOVELACE 890
|
||||
#define GGML_CUDA_CC_HOPPER 900
|
||||
// While BW spans CC 1000, 1100 & 1200, we are integrating Tensor Core instructions available to 1200 family, see
|
||||
// https://docs.nvidia.com/cutlass/media/docs/cpp/blackwell_functionality.html#blackwell-sm120-gemms
|
||||
#define GGML_CUDA_CC_BLACKWELL 1200
|
||||
@@ -107,6 +110,24 @@
|
||||
# define GGML_CUDA_USE_CUB
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
|
||||
|
||||
// PDL host-side support (cudaLaunchKernelEx) requires CUDART >= 11.8 and excludes HIP/MUSA.
|
||||
// __CUDA_ARCH__ is undefined in host passes; GPU arch check happens in device-side code.
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11080
|
||||
# define GGML_CUDA_USE_PDL
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11080
|
||||
|
||||
static __device__ __forceinline__ void ggml_cuda_pdl_sync() {
|
||||
#if defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
|
||||
cudaGridDependencySynchronize();
|
||||
#endif // defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void ggml_cuda_pdl_lc() {
|
||||
#if defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
|
||||
cudaTriggerProgrammaticLaunchCompletion();
|
||||
#endif // defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
|
||||
}
|
||||
|
||||
#ifdef __CUDA_ARCH_LIST__
|
||||
constexpr bool ggml_cuda_has_arch_impl(int) {
|
||||
return false;
|
||||
@@ -165,6 +186,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
|
||||
|
||||
#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
|
||||
|
||||
|
||||
#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
|
||||
static const char * cublas_get_error_str(const cublasStatus_t err) {
|
||||
return cublasGetStatusString(err);
|
||||
@@ -1487,3 +1509,68 @@ struct ggml_cuda_mm_fusion_args_device {
|
||||
const void * gate_bias = nullptr;
|
||||
ggml_glu_op glu_op;
|
||||
};
|
||||
|
||||
struct ggml_cuda_kernel_launch_params {
|
||||
dim3 block_nums;
|
||||
dim3 block_dims;
|
||||
size_t shmem;
|
||||
cudaStream_t stream;
|
||||
|
||||
// size_t shmem
|
||||
ggml_cuda_kernel_launch_params(const dim3& block_nums_, const dim3& block_dims_, const size_t shmem_, const cudaStream_t stream_)
|
||||
: block_nums(block_nums_), block_dims(block_dims_), shmem(shmem_), stream(stream_) {}
|
||||
|
||||
// Some call sites pass ints instead of the required size_t. This 2nd constructor casts int->size_t to avoid these -Wnarrowing warnings.
|
||||
ggml_cuda_kernel_launch_params(const dim3& block_nums_, const dim3& block_dims_, const int shmem_, const cudaStream_t stream_)
|
||||
: block_nums(block_nums_), block_dims(block_dims_), shmem((size_t)shmem_), stream(stream_) {}
|
||||
};
|
||||
|
||||
#if defined(GGML_CUDA_USE_PDL)
|
||||
struct ggml_cuda_pdl_config {
|
||||
cudaLaunchAttribute attr;
|
||||
cudaLaunchConfig_t cfg;
|
||||
|
||||
ggml_cuda_pdl_config(const ggml_cuda_kernel_launch_params & params) {
|
||||
attr.id = cudaLaunchAttributeProgrammaticStreamSerialization;
|
||||
attr.val.programmaticStreamSerializationAllowed = 1;
|
||||
|
||||
cfg = {};
|
||||
cfg.gridDim = params.block_nums;
|
||||
cfg.blockDim = params.block_dims;
|
||||
cfg.dynamicSmemBytes = params.shmem;
|
||||
cfg.stream = params.stream;
|
||||
cfg.attrs = &attr;
|
||||
cfg.numAttrs = 1;
|
||||
}
|
||||
|
||||
// Delete due to &attr
|
||||
ggml_cuda_pdl_config(const ggml_cuda_pdl_config&) = delete;
|
||||
ggml_cuda_pdl_config& operator=(const ggml_cuda_pdl_config&) = delete;
|
||||
ggml_cuda_pdl_config& operator=(ggml_cuda_pdl_config&&) = delete;
|
||||
|
||||
};
|
||||
#endif //defined(GGML_CUDA_USE_PDL)
|
||||
|
||||
|
||||
template<typename Kernel, typename... Args>
|
||||
static __inline__ void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_kernel_launch_params & launch_params, Args&&... args) {
|
||||
#if defined(GGML_CUDA_USE_PDL)
|
||||
|
||||
static const bool env_pdl_enabled = []() {
|
||||
const char * env = getenv("GGML_CUDA_PDL");
|
||||
return env == nullptr || std::atoi(env) != 0;
|
||||
}();
|
||||
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
if (env_pdl_enabled && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_HOPPER) {
|
||||
auto pdl_cfg = ggml_cuda_pdl_config(launch_params);
|
||||
|
||||
CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, kernel, std::forward<Args>(args)... ));
|
||||
return;
|
||||
}
|
||||
#endif //defined(GGML_CUDA_USE_PDL)
|
||||
|
||||
kernel<<<launch_params.block_nums, launch_params.block_dims, launch_params.shmem, launch_params.stream>>>(std::forward<Args>(args)... );
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_f32_cont
|
||||
|
||||
const int64_t n = ne0 * ne1 * ne2;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
for (int64_t i = (int64_t) blockIdx.x * blockDim.x + threadIdx.x; i < n; i += (int64_t) blockDim.x * gridDim.x) {
|
||||
if constexpr (dim == 0) {
|
||||
const int64_t row = i / ne0;
|
||||
@@ -64,8 +65,8 @@ static void concat_f32_cuda(const float * x,
|
||||
const int num_blocks = (n + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
|
||||
|
||||
if (dim == 0) {
|
||||
concat_f32_cont<0>
|
||||
<<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream);
|
||||
ggml_cuda_kernel_launch(concat_f32_cont<0>, launch_params,x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
|
||||
return;
|
||||
}
|
||||
if (dim == 1) {
|
||||
|
||||
@@ -16,6 +16,7 @@ static __global__ void cpy_scalar(const char * cx, char * cdst, const int64_t ne
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
|
||||
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
|
||||
const int64_t nb12, const int64_t nb13) {
|
||||
ggml_cuda_pdl_lc();
|
||||
const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= ne) {
|
||||
@@ -36,6 +37,7 @@ static __global__ void cpy_scalar(const char * cx, char * cdst, const int64_t ne
|
||||
const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
||||
const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
cpy_1(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
|
||||
@@ -59,6 +61,7 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const
|
||||
__shared__ float tile[2][CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+1];
|
||||
int cur_tile_buf = 0;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
#pragma unroll
|
||||
for (int i = 0; i < CUDA_CPY_BLOCK_NM; ++i) {
|
||||
|
||||
@@ -142,6 +145,7 @@ static __global__ void cpy_f32_q(const char * cx, char * cdst, const int64_t ne,
|
||||
const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
||||
const int64_t dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
cpy_blck(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
|
||||
@@ -168,6 +172,7 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst, const int64_t ne,
|
||||
const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
||||
const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
cpy_blck(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
|
||||
@@ -182,6 +187,7 @@ static __global__ void cpy_scalar_contiguous(const char * cx, char * cdst, const
|
||||
const src_t * x = (const src_t *) cx;
|
||||
dst_t * dst = (dst_t *) cdst;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
dst[i] = ggml_cuda_cast<dst_t>(x[i]);
|
||||
}
|
||||
|
||||
@@ -192,8 +198,8 @@ cudaStream_t stream) {
|
||||
|
||||
const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
cpy_scalar_contiguous<src_t, dst_t><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream);
|
||||
ggml_cuda_kernel_launch(cpy_scalar_contiguous<src_t, dst_t>, launch_params, cx, cdst, ne);
|
||||
}
|
||||
|
||||
template<typename src_t, typename dst_t, bool transposed = false>
|
||||
@@ -223,13 +229,15 @@ static void ggml_cpy_scalar_cuda(
|
||||
GGML_ASSERT(grid_z < USHRT_MAX);
|
||||
dim3 dimGrid(grid_x, grid_y, grid_z);
|
||||
dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1);
|
||||
cpy_scalar_transpose<dst_t><<<dimGrid, dimBlock, 0, stream>>>
|
||||
(cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(dimGrid, dimBlock, 0, stream);
|
||||
ggml_cuda_kernel_launch(cpy_scalar_transpose<dst_t>, launch_params,
|
||||
cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
} else {
|
||||
const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
cpy_scalar<cpy_1_scalar<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream);
|
||||
ggml_cuda_kernel_launch(cpy_scalar<cpy_1_scalar<src_t, dst_t>>, launch_params,
|
||||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -636,6 +636,7 @@ static __global__ void flash_attn_mask_to_KV_max(
|
||||
if (tid < WARP_SIZE) {
|
||||
buf_iw[tid] = 1;
|
||||
}
|
||||
ggml_cuda_pdl_sync();
|
||||
__syncthreads();
|
||||
|
||||
int KV_max_sj = (ne30 - 1) * FATTN_KQ_STRIDE;
|
||||
@@ -687,6 +688,7 @@ static __global__ void flash_attn_stream_k_fixup_uniform(
|
||||
const uint3 fd_iter_j_z,
|
||||
const uint3 fd_iter_j) {
|
||||
constexpr int ncols = ncols1*ncols2;
|
||||
ggml_cuda_pdl_lc();
|
||||
|
||||
const int tile_idx = blockIdx.x; // One block per output tile.
|
||||
const int j = blockIdx.y;
|
||||
@@ -718,6 +720,7 @@ static __global__ void flash_attn_stream_k_fixup_uniform(
|
||||
|
||||
dst += sequence*ne02*ne01*D + jt*ne02*(ncols1*D) + zt_Q*D + (j*ne02 + c)*D + tid;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
// Load the partial result that needs a fixup
|
||||
float dst_val = *dst;
|
||||
float max_val;
|
||||
@@ -809,6 +812,7 @@ static __global__ void flash_attn_stream_k_fixup_general(
|
||||
float dst_val = 0.0f;
|
||||
float max_val = 0.0f;
|
||||
float rowsum = 0.0f;
|
||||
ggml_cuda_pdl_sync();
|
||||
{
|
||||
dst_val = *dst;
|
||||
|
||||
@@ -867,6 +871,7 @@ static __global__ void flash_attn_combine_results(
|
||||
const float2 * __restrict__ VKQ_meta,
|
||||
float * __restrict__ dst,
|
||||
const int parallel_blocks) {
|
||||
ggml_cuda_pdl_lc();
|
||||
// Dimension 0: threadIdx.x
|
||||
// Dimension 1: blockIdx.x
|
||||
// Dimension 2: blockIdx.y
|
||||
@@ -890,6 +895,7 @@ static __global__ void flash_attn_combine_results(
|
||||
__builtin_assume(tid < D);
|
||||
|
||||
extern __shared__ float2 meta[];
|
||||
ggml_cuda_pdl_sync();
|
||||
for (int i = tid; i < 2*parallel_blocks; i += D) {
|
||||
((float *) meta)[i] = ((const float *)VKQ_meta) [i];
|
||||
}
|
||||
@@ -1146,7 +1152,9 @@ void launch_fattn(
|
||||
const uint3 ne01 = init_fastdiv_values(Q->ne[1]);
|
||||
|
||||
GGML_ASSERT(block_dim.x % warp_size == 0);
|
||||
fattn_kernel<<<blocks_num, block_dim, nbytes_shared, main_stream>>>(
|
||||
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks_num, block_dim, nbytes_shared, main_stream);
|
||||
ggml_cuda_kernel_launch(fattn_kernel, launch_params,
|
||||
(const char *) Q->data,
|
||||
K_data,
|
||||
V_data,
|
||||
@@ -1176,9 +1184,9 @@ void launch_fattn(
|
||||
const dim3 block_dim_combine(DV, 1, 1);
|
||||
const dim3 blocks_num_combine = {(unsigned)ntiles_dst, ncols1, ncols2};
|
||||
|
||||
flash_attn_stream_k_fixup_uniform<DV, ncols1, ncols2>
|
||||
<<<blocks_num_combine, block_dim_combine, 0, main_stream>>>
|
||||
((float *) KQV->data, dst_tmp_meta.ptr,
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks_num_combine, block_dim_combine, 0, main_stream);
|
||||
ggml_cuda_kernel_launch(flash_attn_stream_k_fixup_uniform<DV, ncols1, ncols2>, launch_params,
|
||||
(float *) KQV->data, dst_tmp_meta.ptr,
|
||||
Q->ne[1], Q->ne[2], K->ne[2], nblocks_sk,
|
||||
gqa_ratio, bpt, fd0, fd1, fd2);
|
||||
} else if (ntiles_dst % blocks_num.x != 0) {
|
||||
@@ -1193,9 +1201,9 @@ void launch_fattn(
|
||||
const dim3 block_dim_combine(DV, 1, 1);
|
||||
const dim3 blocks_num_combine = {blocks_num.x, ncols1, ncols2};
|
||||
|
||||
flash_attn_stream_k_fixup_general<DV, ncols1, ncols2>
|
||||
<<<blocks_num_combine, block_dim_combine, 0, main_stream>>>
|
||||
((float *) KQV->data, dst_tmp_meta.ptr,
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks_num_combine, block_dim_combine, 0, main_stream);
|
||||
ggml_cuda_kernel_launch(flash_attn_stream_k_fixup_general<DV, ncols1, ncols2>, launch_params,
|
||||
(float *) KQV->data, dst_tmp_meta.ptr,
|
||||
Q->ne[1], Q->ne[2], gqa_ratio, total_work,
|
||||
fd_k_j_z_ne12, fd_k_j_z, fd_k_j, fd_k);
|
||||
}
|
||||
@@ -1204,9 +1212,9 @@ void launch_fattn(
|
||||
const dim3 blocks_num_combine(Q->ne[1], Q->ne[2], Q->ne[3]);
|
||||
const size_t nbytes_shared_combine = parallel_blocks*sizeof(float2);
|
||||
|
||||
flash_attn_combine_results<DV>
|
||||
<<<blocks_num_combine, block_dim_combine, nbytes_shared_combine, main_stream>>>
|
||||
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data, parallel_blocks);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks_num_combine, block_dim_combine, nbytes_shared_combine, main_stream);
|
||||
ggml_cuda_kernel_launch(flash_attn_combine_results<DV>, launch_params,
|
||||
dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data, parallel_blocks);
|
||||
}
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
@@ -1724,6 +1724,7 @@ static __global__ void flash_attn_ext_f16(
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
ggml_cuda_pdl_sync(); // TODO optimize placement
|
||||
#if defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE))
|
||||
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
|
||||
@@ -894,6 +894,8 @@ static __global__ void flash_attn_tile(
|
||||
}
|
||||
float KQ_sum[cpw] = {0.0f};
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
|
||||
// Load Q data, convert to FP16 if fast:
|
||||
#pragma unroll
|
||||
for (int jc0 = 0; jc0 < cpw; ++jc0) {
|
||||
|
||||
@@ -40,6 +40,7 @@ static __global__ void flash_attn_ext_vec(
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
ggml_cuda_pdl_lc();
|
||||
#ifdef FLASH_ATTN_AVAILABLE
|
||||
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
@@ -136,6 +137,8 @@ static __global__ void flash_attn_ext_vec(
|
||||
#endif // V_DOT2_F32_F16_AVAILABLE
|
||||
int Q_i32[ncols][1 > D/(sizeof(int)*nthreads_KQ) ? 1 : D/(sizeof(int)*nthreads_KQ)];
|
||||
float2 Q_ds[ncols][1 > D/(sizeof(int)*nthreads_KQ) ? 1 : D/(sizeof(int)*nthreads_KQ)];
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
if constexpr (Q_q8_1) {
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
|
||||
@@ -86,6 +86,7 @@ static __global__ void flash_attn_ext_f16(
|
||||
constexpr int kqs_padded = FATTN_KQ_STRIDE + 8;
|
||||
constexpr int kqar = sizeof(KQ_acc_t)/sizeof(half);
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
const int sequence = blockIdx.z / ne02;
|
||||
const int head = blockIdx.z - sequence*ne02;
|
||||
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#include "gated_delta_net.cuh"
|
||||
#include "ggml-cuda/common.cuh"
|
||||
|
||||
template <int S_v, bool KDA, bool keep_rs_t>
|
||||
__global__ void __launch_bounds__((ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v) * 4, 2)
|
||||
@@ -53,6 +54,7 @@ gated_delta_net_cuda(const float * q,
|
||||
float s_shard[rows_per_lane];
|
||||
// state is stored transposed: M[col][i] = S[i][col], row col is contiguous
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
#pragma unroll
|
||||
for (int r = 0; r < rows_per_lane; r++) {
|
||||
const int i = r * warp_size + lane;
|
||||
@@ -189,28 +191,29 @@ static void launch_gated_delta_net(
|
||||
|
||||
int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(grid_dims, block_dims, 0, stream);
|
||||
switch (S_v) {
|
||||
case 16:
|
||||
gated_delta_net_cuda<16, KDA, keep_rs_t><<<grid_dims, block_dims, 0, stream>>>(
|
||||
ggml_cuda_kernel_launch(gated_delta_net_cuda<16, KDA, keep_rs_t>, launch_params,
|
||||
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
||||
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K);
|
||||
break;
|
||||
case 32:
|
||||
gated_delta_net_cuda<32, KDA, keep_rs_t><<<grid_dims, block_dims, 0, stream>>>(
|
||||
ggml_cuda_kernel_launch(gated_delta_net_cuda<32, KDA, keep_rs_t>, launch_params,
|
||||
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
||||
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K);
|
||||
break;
|
||||
case 64: {
|
||||
gated_delta_net_cuda<64, KDA, keep_rs_t><<<grid_dims, block_dims, 0, stream>>>(
|
||||
ggml_cuda_kernel_launch(gated_delta_net_cuda<64, KDA, keep_rs_t>, launch_params,
|
||||
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
||||
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K);
|
||||
break;
|
||||
}
|
||||
case 128: {
|
||||
gated_delta_net_cuda<128, KDA, keep_rs_t><<<grid_dims, block_dims, 0, stream>>>(
|
||||
ggml_cuda_kernel_launch(gated_delta_net_cuda<128, KDA, keep_rs_t>, launch_params,
|
||||
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
||||
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K);
|
||||
|
||||
@@ -11,6 +11,7 @@ static __global__ void k_get_rows(
|
||||
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
|
||||
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) {
|
||||
for (int64_t i00 = 2*(blockIdx.y*blockDim.x + threadIdx.x); i00 < ne00; i00 += gridDim.y*blockDim.x) {
|
||||
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
|
||||
@@ -48,6 +49,8 @@ static __global__ void k_get_rows_float(
|
||||
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
|
||||
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
|
||||
|
||||
ggml_cuda_pdl_lc();
|
||||
ggml_cuda_pdl_sync();
|
||||
for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) {
|
||||
for (int64_t i00 = blockIdx.y*blockDim.x + threadIdx.x; i00 < ne00; i00 += gridDim.y*blockDim.x) {
|
||||
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
|
||||
@@ -83,6 +86,7 @@ static __global__ void k_get_rows_back_float(
|
||||
|
||||
float sum = 0.0f;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
for (int64_t i = 0; i < nrows_grad; ++i) {
|
||||
if (rows[i] != dst_row) {
|
||||
continue;
|
||||
@@ -156,7 +160,8 @@ static void get_rows_cuda_float(
|
||||
GGML_ASSERT(ne11 <= std::numeric_limits<uint32_t>::max() / ne12);
|
||||
const uint3 ne12_fdv = init_fastdiv_values(ne12);
|
||||
|
||||
k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{block_nums, block_dims, 0, stream};
|
||||
ggml_cuda_kernel_launch(k_get_rows_float<src0_t, dst_t>, launch_params,
|
||||
src0_d, src1_d, dst_d,
|
||||
ne00, /*ne01, ne02, ne03,*/
|
||||
/*ne10,*/ ne11, ne12_fdv, /*ne13,*/
|
||||
|
||||
@@ -67,9 +67,11 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
// See discussion in: https://github.com/ggml-org/llama.cpp/pull/15132
|
||||
if ((nrows / nsm) < 2) {
|
||||
const dim3 block_dims(512, 1, 1);
|
||||
reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
|
||||
ggml_cuda_kernel_launch(reduce_rows_f32</*norm=*/true>, launch_params, src0_d, dst_d, ncols);
|
||||
} else {
|
||||
const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
|
||||
reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
|
||||
ggml_cuda_kernel_launch(reduce_rows_f32</*norm=*/true>, launch_params, src0_d, dst_d, ncols);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,6 +21,7 @@ static __global__ void mul_mat_vec_f(
|
||||
int channel_y;
|
||||
int sample_dst;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
if constexpr (is_multi_token_id) {
|
||||
// Multi-token MUL_MAT_ID path, adding these in the normal path causes a perf regression for n_tokens=1 case
|
||||
token_idx = blockIdx.z;
|
||||
@@ -298,6 +299,7 @@ static __global__ void mul_mat_vec_f(
|
||||
static_assert(std::is_same_v<T, void>, "unsupported type");
|
||||
}
|
||||
|
||||
ggml_cuda_pdl_lc();
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols_dst; ++j) {
|
||||
sumf[j] = warp_reduce_sum<warp_size>(sumf[j]);
|
||||
@@ -382,11 +384,13 @@ static void mul_mat_vec_f_switch_fusion(
|
||||
const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
|
||||
const dim3 & block_dims, const dim3 & block_nums, const int nbytes_shared, const int ids_stride, const cudaStream_t stream) {
|
||||
|
||||
const ggml_cuda_kernel_launch_params launch_params = {block_nums, block_dims, nbytes_shared, stream};
|
||||
|
||||
const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
|
||||
if constexpr (ncols_dst == 1) {
|
||||
if (has_fusion) {
|
||||
mul_mat_vec_f<T, type_acc, ncols_dst, block_size, true, is_multi_token_id><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst,
|
||||
ggml_cuda_kernel_launch(mul_mat_vec_f<T, type_acc, ncols_dst, block_size, true, is_multi_token_id>, launch_params,
|
||||
x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
|
||||
return;
|
||||
@@ -395,8 +399,8 @@ static void mul_mat_vec_f_switch_fusion(
|
||||
|
||||
GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1");
|
||||
|
||||
mul_mat_vec_f<T, type_acc, ncols_dst, block_size, false, is_multi_token_id><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst,
|
||||
ggml_cuda_kernel_launch(mul_mat_vec_f<T, type_acc, ncols_dst, block_size, false, is_multi_token_id>, launch_params,
|
||||
x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
|
||||
|
||||
|
||||
@@ -359,7 +359,9 @@ static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_d
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_K:
|
||||
return 8;
|
||||
case GGML_TYPE_Q6_K:
|
||||
return 2;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
return 8;
|
||||
default:
|
||||
@@ -422,6 +424,7 @@ static __global__ void mul_mat_vec_q(
|
||||
uint32_t channel_y;
|
||||
uint32_t sample_dst;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : fastdiv(channel_dst, channel_ratio);
|
||||
channel_y = ncols_dst == 1 && ids ? fastmodulo(channel_dst, nchannels_y) : channel_dst;
|
||||
sample_dst = blockIdx.z;
|
||||
@@ -681,8 +684,9 @@ static void mul_mat_vec_q_switch_fusion(
|
||||
const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
|
||||
if constexpr (c_ncols_dst == 1) {
|
||||
if (has_fusion) {
|
||||
mul_mat_vec_q<type, c_ncols_dst, true, small_k><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, nbytes_shared, stream);
|
||||
ggml_cuda_kernel_launch(mul_mat_vec_q<type, c_ncols_dst, true, small_k>, launch_params,
|
||||
vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
|
||||
return;
|
||||
@@ -691,8 +695,9 @@ static void mul_mat_vec_q_switch_fusion(
|
||||
|
||||
GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1");
|
||||
|
||||
mul_mat_vec_q<type, c_ncols_dst, false, small_k><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, nbytes_shared, stream);
|
||||
ggml_cuda_kernel_launch(mul_mat_vec_q<type, c_ncols_dst, false, small_k>, launch_params,
|
||||
vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
|
||||
}
|
||||
|
||||
@@ -18,6 +18,7 @@ static __global__ void norm_f32(
|
||||
|
||||
float2 mean_var = make_float2(0.0f, 0.0f);
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const float xi = x[col];
|
||||
mean_var.x += xi;
|
||||
@@ -46,6 +47,7 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
|
||||
|
||||
float tmp = 0.0f; // partial sum for thread in warp
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
for (int j = start; j < end; j += block_size) {
|
||||
tmp += x[j];
|
||||
}
|
||||
@@ -95,6 +97,7 @@ static __global__ void rms_norm_f32(const float * x,
|
||||
const uint3 add_nrows_packed = make_uint3(0, 0, 0),
|
||||
const uint3 add_nchannels_packed = make_uint3(0, 0, 0),
|
||||
const uint3 add_nsamples_packed = make_uint3(0, 0, 0)) {
|
||||
ggml_cuda_pdl_lc();
|
||||
const int nrows = gridDim.x;
|
||||
const int nchannels = gridDim.y;
|
||||
|
||||
@@ -124,6 +127,7 @@ static __global__ void rms_norm_f32(const float * x,
|
||||
|
||||
float tmp = 0.0f; // partial sum for thread in warp
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const float xi = x[col];
|
||||
tmp += xi * xi;
|
||||
@@ -163,6 +167,7 @@ static __global__ void rms_norm_back_f32(
|
||||
float sum_xx = 0.0f; // sum for squares of x, equivalent to forward pass
|
||||
float sum_xg = 0.0f; // sum for x * gradient, needed because RMS norm mixes inputs
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const float xfi = xf[col];
|
||||
sum_xx += xfi * xfi;
|
||||
@@ -253,6 +258,7 @@ static __global__ void l2_norm_f32(
|
||||
|
||||
float tmp = 0.0f; // partial sum for thread in warp
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const float xi = x[col];
|
||||
tmp += xi * xi;
|
||||
@@ -261,6 +267,7 @@ static __global__ void l2_norm_f32(
|
||||
// sum up partial sums
|
||||
extern __shared__ float s_sum[];
|
||||
tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
|
||||
ggml_cuda_pdl_lc();
|
||||
|
||||
// from https://pytorch.org/docs/stable/generated/torch.nn.functional.normalize.html
|
||||
const float scale = rsqrtf(fmaxf(tmp, eps * eps));
|
||||
@@ -300,10 +307,19 @@ static void rms_norm_f32_cuda(
|
||||
const dim3 blocks_num(nrows, nchannels, nsamples);
|
||||
if (ncols < 1024) {
|
||||
const dim3 block_dims(256, 1, 1);
|
||||
rms_norm_f32<256, false><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
|
||||
const ggml_cuda_kernel_launch_params launch_params = {blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream};
|
||||
ggml_cuda_kernel_launch(rms_norm_f32<256, false>, launch_params,
|
||||
x, dst, ncols, stride_row, stride_channel, stride_sample, eps,
|
||||
// underlying cudaLaunchKernelEx does not support default params
|
||||
nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0),
|
||||
nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0));
|
||||
} else {
|
||||
const dim3 block_dims(1024, 1, 1);
|
||||
rms_norm_f32<1024, false><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream};
|
||||
ggml_cuda_kernel_launch(rms_norm_f32<1024, false>, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps,
|
||||
// underlying cudaLaunchKernelEx does not support default params
|
||||
nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0),
|
||||
nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -346,14 +362,20 @@ static void rms_norm_mul_f32_cuda(const float * x,
|
||||
const uint3 mul_nsamples_packed = init_fastdiv_values(mul_nsamples);
|
||||
if (ncols < 1024) {
|
||||
const dim3 block_dims(256, 1, 1);
|
||||
rms_norm_f32<256, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream};
|
||||
ggml_cuda_kernel_launch(rms_norm_f32<256, true>, launch_params,
|
||||
x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
|
||||
mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed);
|
||||
mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed,
|
||||
// underlying cudaLaunchKernelEx does not support default params
|
||||
nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0));
|
||||
} else {
|
||||
const dim3 block_dims(1024, 1, 1);
|
||||
rms_norm_f32<1024, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream};
|
||||
ggml_cuda_kernel_launch(rms_norm_f32<1024, true>, launch_params,
|
||||
x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
|
||||
mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed);
|
||||
mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed,
|
||||
// underlying cudaLaunchKernelEx does not support default params
|
||||
nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0));
|
||||
}
|
||||
} else {
|
||||
const uint3 mul_ncols_packed = init_fastdiv_values(mul_ncols);
|
||||
@@ -367,14 +389,16 @@ static void rms_norm_mul_f32_cuda(const float * x,
|
||||
const uint3 add_nsamples_packed = init_fastdiv_values(add_nsamples);
|
||||
if (ncols < 1024) {
|
||||
const dim3 block_dims(256, 1, 1);
|
||||
rms_norm_f32<256, true, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims,block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream};
|
||||
ggml_cuda_kernel_launch(rms_norm_f32<256, true, true>, launch_params,
|
||||
x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
|
||||
mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, add,
|
||||
add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed,
|
||||
add_nchannels_packed, add_nsamples_packed);
|
||||
} else {
|
||||
const dim3 block_dims(1024, 1, 1);
|
||||
rms_norm_f32<1024, true, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream};
|
||||
ggml_cuda_kernel_launch(rms_norm_f32<1024, true, true>, launch_params,
|
||||
x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
|
||||
mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, add,
|
||||
add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed,
|
||||
@@ -399,10 +423,12 @@ static void l2_norm_f32_cuda(
|
||||
const dim3 blocks_num(nrows, nchannels, nsamples);
|
||||
if (ncols < 1024) {
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
l2_norm_f32<WARP_SIZE><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, 0, stream};
|
||||
ggml_cuda_kernel_launch(l2_norm_f32<WARP_SIZE>, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
|
||||
} else {
|
||||
const dim3 block_dims(1024, 1, 1);
|
||||
l2_norm_f32<1024><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream};
|
||||
ggml_cuda_kernel_launch(l2_norm_f32<1024>, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ static __global__ void quantize_q8_1(
|
||||
const float * __restrict__ x, void * __restrict__ vy,
|
||||
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
|
||||
const int64_t ne0, const uint32_t ne1, const uint3 ne2) {
|
||||
ggml_cuda_pdl_lc();
|
||||
const int64_t i0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i0 >= ne0) {
|
||||
@@ -28,6 +29,7 @@ static __global__ void quantize_q8_1(
|
||||
const int64_t ib = i_cont / QK8_1; // block index
|
||||
const int64_t iqs = i_cont % QK8_1; // quant index
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
const float xi = i0 < ne00 ? x[i03*s03 + i02*s02 + i01*s01 + i00] : 0.0f;
|
||||
float amax = fabsf(xi);
|
||||
float sum = xi;
|
||||
@@ -196,6 +198,7 @@ static __global__ void quantize_mmq_mxfp4(const float * __restrict__ x,
|
||||
const int64_t i2 = blockIdx.z % ne2;
|
||||
const int64_t i3 = blockIdx.z / ne2;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
const int64_t i01 = ids ? ids[i1] : i1;
|
||||
const int64_t i02 = i2;
|
||||
const int64_t i03 = i3;
|
||||
@@ -288,6 +291,7 @@ static __global__ void quantize_mmq_q8_1(
|
||||
const int64_t i3 = blockIdx.z / ne2;
|
||||
|
||||
const int64_t i00 = i0;
|
||||
ggml_cuda_pdl_sync();
|
||||
const int64_t i01 = ids ? ids[i1] : i1;
|
||||
const int64_t i02 = i2;
|
||||
const int64_t i03 = i3;
|
||||
@@ -378,7 +382,8 @@ void quantize_row_q8_1_cuda(
|
||||
const int64_t block_num_x = (ne0 + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
|
||||
const dim3 num_blocks(block_num_x, ne1, ne2*ne3);
|
||||
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
|
||||
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, ne00, s01, s02, s03, ne0, ne1, ne2_fastdiv);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(num_blocks, block_size, 0, stream);
|
||||
ggml_cuda_kernel_launch(quantize_q8_1, launch_params, x, vy, ne00, s01, s02, s03, ne0, ne1, ne2_fastdiv);
|
||||
GGML_UNUSED(type_src0);
|
||||
}
|
||||
|
||||
|
||||
@@ -10,6 +10,8 @@ static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __r
|
||||
const int num_unroll = 8;
|
||||
float temp[num_unroll];
|
||||
float sum_temp[num_unroll] = { 0.0f };
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
for (int i = col; i < ncols;) {
|
||||
for (int j = 0; j < num_unroll; ++j) {
|
||||
if (i < ncols) {
|
||||
|
||||
@@ -134,6 +134,7 @@ static __global__ void rope_neox(const T * x,
|
||||
const float * freq_factors,
|
||||
const int64_t * row_indices,
|
||||
const int set_rows_stride) {
|
||||
ggml_cuda_pdl_lc();
|
||||
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
||||
|
||||
if (i0 >= ne00) {
|
||||
@@ -148,6 +149,7 @@ static __global__ void rope_neox(const T * x,
|
||||
|
||||
int idst = i0 / 2 + i1 * s1 + i2 * s2 + i3 * s3;
|
||||
const int ix = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03;
|
||||
ggml_cuda_pdl_sync();
|
||||
|
||||
// Fusion optimization: ROPE + VIEW + SET_ROWS.
|
||||
// The rope output is viewed as a 1D tensor and offset based on a row index in row_indices.
|
||||
@@ -216,6 +218,7 @@ static __global__ void rope_multi(const T * x,
|
||||
int idst = i0 / 2 + i1 * s1 + i2 * s2 + i3 * s3;
|
||||
const int ix = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
if (i0 >= n_dims) {
|
||||
dst[idst + i0/2 + 0] = x[ix + i0/2 + 0];
|
||||
dst[idst + i0/2 + 1] = x[ix + i0/2 + 1];
|
||||
@@ -300,6 +303,7 @@ static __global__ void rope_vision(const T * x,
|
||||
int idst = i0 / 2 + i1 * s1 + i2 * s2 + i3 * s3;
|
||||
const int ix = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
const int sect_dims = sections.v[0] + sections.v[1];
|
||||
const int sec_w = sections.v[1] + sections.v[0];
|
||||
const int sector = (i0 / 2) % sect_dims;
|
||||
@@ -399,13 +403,14 @@ static void rope_neox_cuda(const T * x,
|
||||
const dim3 block_nums(nr, n_blocks_x, 1);
|
||||
|
||||
const float theta_scale = powf(freq_base, -2.0f / n_dims);
|
||||
const ggml_cuda_kernel_launch_params launch_params = {block_nums, block_dims, 0, stream};
|
||||
|
||||
if (freq_factors == nullptr) {
|
||||
rope_neox<forward, false><<<block_nums, block_dims, 0, stream>>>(
|
||||
ggml_cuda_kernel_launch(rope_neox<forward, false, T, D>, launch_params,
|
||||
x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
|
||||
attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride);
|
||||
} else {
|
||||
rope_neox<forward, true><<<block_nums, block_dims, 0, stream>>>(
|
||||
ggml_cuda_kernel_launch(rope_neox<forward, true, T, D>, launch_params,
|
||||
x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
|
||||
attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride);
|
||||
}
|
||||
@@ -443,11 +448,13 @@ static void rope_multi_cuda(const T * x,
|
||||
const float theta_scale = powf(freq_base, -2.0f / n_dims);
|
||||
|
||||
if (freq_factors == nullptr) {
|
||||
rope_multi<forward, false, T><<<block_nums, block_dims, 0, stream>>>(
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
|
||||
ggml_cuda_kernel_launch(rope_multi<forward, false, T>, launch_params,
|
||||
x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
|
||||
attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope);
|
||||
} else {
|
||||
rope_multi<forward, true, T><<<block_nums, block_dims, 0, stream>>>(
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
|
||||
ggml_cuda_kernel_launch(rope_multi<forward, true, T>, launch_params,
|
||||
x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor,
|
||||
attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope);
|
||||
}
|
||||
|
||||
@@ -3,9 +3,11 @@
|
||||
#define MAX_GRIDDIM_X 0x7FFFFFFF
|
||||
|
||||
static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int64_t nelements) {
|
||||
ggml_cuda_pdl_lc();
|
||||
int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
|
||||
int64_t stride = (int64_t)blockDim.x * (int64_t)gridDim.x;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
for (int64_t i = tid; i < nelements; i += stride) {
|
||||
dst[i] = scale * x[i] + bias;
|
||||
}
|
||||
@@ -13,7 +15,8 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
|
||||
|
||||
static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int64_t nelements, cudaStream_t stream) {
|
||||
const int64_t num_blocks = (nelements + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
|
||||
scale_f32<<<MIN(MAX_GRIDDIM_X, num_blocks), CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, nelements);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(MIN(MAX_GRIDDIM_X, num_blocks), CUDA_SCALE_BLOCK_SIZE, 0, stream);
|
||||
ggml_cuda_kernel_launch(scale_f32, launch_params, x, dst, scale, bias, nelements);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
@@ -53,6 +53,7 @@ static __global__ void k_set_rows_quant(const float * __restrict__ src0,
|
||||
const int64_t i11 = fastmodulo((uint32_t) i02, ne11_fd);
|
||||
const int64_t i10 = i01;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
|
||||
|
||||
const float * src0_row = src0 + i01*s01 + i02*s02 + i03*s03;
|
||||
@@ -157,7 +158,9 @@ static __global__ void k_set_rows(const src_t * __restrict__ src0,
|
||||
const int64_t i11 = fastmodulo((uint32_t) i02, ne11_fd);
|
||||
const int64_t i10 = i01;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
|
||||
ggml_cuda_pdl_lc();
|
||||
|
||||
const src_t * src0_row = src0 + i01*s01 + i02*s02 + i03*s03;
|
||||
dst_t * dst_row_ptr = dst + dst_row*s1 + i02*s2 + i03*s3;
|
||||
@@ -203,9 +206,11 @@ static void set_rows_cuda(
|
||||
const uint3 ne11_fd = init_fastdiv_values((uint32_t) ne11);
|
||||
const uint3 ne12_fd = init_fastdiv_values((uint32_t) ne12);
|
||||
|
||||
k_set_rows<<<grid_size, block_size, 0, stream>>>(src0_d, src1_d, dst_d, ne_total, ne10, ne11, ne12, ne13, s01,
|
||||
s02, s03, s10, s11, s12, s1, s2, s3, ne00_fd, ne01_fd, ne02_fd,
|
||||
ne11_fd, ne12_fd);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(grid_size, block_size, 0, stream);
|
||||
ggml_cuda_kernel_launch(k_set_rows<src_t, idx_t, dst_t>, launch_params,
|
||||
src0_d, src1_d, dst_d, ne_total, ne10, ne11, ne12, ne13, s01,
|
||||
s02, s03, s10, s11, s12, s1, s2, s3, ne00_fd, ne01_fd, ne02_fd,
|
||||
ne11_fd, ne12_fd);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,18 +1,21 @@
|
||||
#include "softcap.cuh"
|
||||
|
||||
static __global__ void softcap_f32(const float * x, float * dst, const float scale, const float softcap, const int k) {
|
||||
ggml_cuda_pdl_lc();
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
dst[i] = tanhf(scale * x[i]) * softcap;
|
||||
}
|
||||
|
||||
static void softcap_f32_cuda(const float * x, float * dst, const float scale, const float softcap, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_SOFTCAP_BLOCK_SIZE - 1) / CUDA_SOFTCAP_BLOCK_SIZE;
|
||||
softcap_f32<<<num_blocks, CUDA_SOFTCAP_BLOCK_SIZE, 0, stream>>>(x, dst, scale, softcap, k);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(num_blocks, CUDA_SOFTCAP_BLOCK_SIZE, 0, stream);
|
||||
ggml_cuda_kernel_launch(softcap_f32, launch_params, x, dst, scale, softcap, k);
|
||||
}
|
||||
|
||||
// fused GGML_OP_SCALE + GGML_UNARY_OP_TANH + GGML_OP_SCALE
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "common.cuh"
|
||||
#include "ssm-conv.cuh"
|
||||
#include "unary.cuh"
|
||||
|
||||
@@ -7,6 +8,7 @@ static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float
|
||||
const int src0_nb0, const int src0_nb1, const int src0_nb2, const int src1_nb1,
|
||||
float * __restrict__ dst, const int dst_nb0, const int dst_nb1, const int dst_nb2,
|
||||
const int64_t n_t) {
|
||||
ggml_cuda_pdl_lc();
|
||||
GGML_UNUSED(src0_nb0);
|
||||
const int tid = threadIdx.x;
|
||||
const int bidx = blockIdx.x;
|
||||
@@ -23,6 +25,7 @@ static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float
|
||||
float x[d_conv] = { 0.0f };
|
||||
float w[d_conv] = { 0.0f };
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
#pragma unroll
|
||||
for (size_t j = 0; j < d_conv; j++) {
|
||||
w[j] = w_block[tid * stride_w + j];
|
||||
@@ -128,8 +131,9 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const floa
|
||||
constexpr int kNC = decltype(NC)::value;
|
||||
if (n_t <= 32) {
|
||||
const dim3 blocks(n_s, (nr + threads - 1) / threads, 1);
|
||||
ssm_conv_f32<apply_silu, threads, kNC><<<blocks, threads, 0, stream>>>(src0, src1, bias, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
|
||||
dst, dst_nb0, dst_nb1, dst_nb2, n_t);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks, threads, 0, stream);
|
||||
ggml_cuda_kernel_launch(ssm_conv_f32<apply_silu, threads, kNC>, launch_params, src0, src1, bias, src0_nb0, src0_nb1,
|
||||
src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
|
||||
} else {
|
||||
const int64_t split_n_t = 32;
|
||||
dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
|
||||
|
||||
@@ -26,6 +26,7 @@ __global__ void __launch_bounds__(splitD, 1)
|
||||
const int64_t s_off, const int64_t d_inner, const int64_t L_param)
|
||||
{
|
||||
const size_t L = L_template == 0 ? L_param : L_template;
|
||||
ggml_cuda_pdl_sync();
|
||||
const float *s0_block = (const float *)((const char *)src0 + src6[blockIdx.x] * src0_nb3 + blockIdx.y * splitD * src0_nb2);
|
||||
const float *x_block = (const float *)((const char *)src1 + (blockIdx.x * src1_nb3) + blockIdx.y * splitD * sizeof(float));
|
||||
const float *dt_block = (const float *)((const char *)src2 + (blockIdx.x * src2_nb2) + blockIdx.y * splitD * sizeof(float));
|
||||
@@ -135,6 +136,7 @@ __global__ void __launch_bounds__(d_state, 1)
|
||||
|
||||
const int group_off = (head_idx / (n_head / n_group)) * d_state * sizeof(float);
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
// TODO: refactor strides to be in elements/floats instead of bytes to be cleaner and consistent with the rest of the codebase
|
||||
const float * s0_warp = (const float *) ((const char *) src0 + src6[seq_idx] * src0_nb3 + head_idx * src0_nb2 + head_off * d_state);
|
||||
const float * x_warp = (const float *) ((const char *) src1 + (seq_idx * src1_nb3) + (warp_idx * sizeof(float)));
|
||||
@@ -206,7 +208,8 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
|
||||
constexpr int num_warps = threads/WARP_SIZE;
|
||||
|
||||
const dim3 blocks((n_head * head_dim + (num_warps - 1)) / num_warps, n_seq, 1);
|
||||
ssm_scan_f32_group<128/WARP_SIZE, 128><<<blocks, threads, 0, stream>>>(
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks, threads, 0, stream);
|
||||
ggml_cuda_kernel_launch(ssm_scan_f32_group<128/WARP_SIZE, 128>, launch_params,
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
|
||||
src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok);
|
||||
@@ -215,7 +218,8 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
|
||||
constexpr int num_warps = threads/WARP_SIZE;
|
||||
|
||||
const dim3 blocks((n_head * head_dim + (num_warps - 1)) / num_warps, n_seq, 1);
|
||||
ssm_scan_f32_group<256/WARP_SIZE, 256><<<blocks, threads, 0, stream>>>(
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks, threads, 0, stream);
|
||||
ggml_cuda_kernel_launch(ssm_scan_f32_group<256/WARP_SIZE, 256>, launch_params,
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
|
||||
src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok);
|
||||
@@ -231,58 +235,59 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
|
||||
const dim3 blocks(n_seq, (n_head + threads - 1) / threads, 1);
|
||||
const int smem_size = (threads * (d_state + 1) * 2) * sizeof(float);
|
||||
if (d_state == 16) {
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks, threads, smem_size, stream);
|
||||
switch (n_tok)
|
||||
{
|
||||
case 1:
|
||||
ssm_scan_f32<threads, 16, 1><<<blocks, threads, smem_size, stream>>>(
|
||||
ggml_cuda_kernel_launch(ssm_scan_f32<threads, 16, 1>, launch_params,
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
|
||||
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
|
||||
break;
|
||||
case 2:
|
||||
ssm_scan_f32<threads, 16, 2><<<blocks, threads, smem_size, stream>>>(
|
||||
ggml_cuda_kernel_launch(ssm_scan_f32<threads, 16, 2>, launch_params,
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
|
||||
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
|
||||
break;
|
||||
case 3:
|
||||
ssm_scan_f32<threads, 16, 3><<<blocks, threads, smem_size, stream>>>(
|
||||
ggml_cuda_kernel_launch(ssm_scan_f32<threads, 16, 3>, launch_params,
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
|
||||
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
|
||||
break;
|
||||
case 4:
|
||||
ssm_scan_f32<threads, 16, 4><<<blocks, threads, smem_size, stream>>>(
|
||||
ggml_cuda_kernel_launch(ssm_scan_f32<threads, 16, 4>, launch_params,
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
|
||||
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
|
||||
break;
|
||||
case 5:
|
||||
ssm_scan_f32<threads, 16, 5><<<blocks, threads, smem_size, stream>>>(
|
||||
ggml_cuda_kernel_launch(ssm_scan_f32<threads, 16, 5>, launch_params,
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
|
||||
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
|
||||
break;
|
||||
case 6:
|
||||
ssm_scan_f32<threads, 16, 6><<<blocks, threads, smem_size, stream>>>(
|
||||
ggml_cuda_kernel_launch(ssm_scan_f32<threads, 16, 6>, launch_params,
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
|
||||
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
|
||||
break;
|
||||
case 7:
|
||||
ssm_scan_f32<threads, 16, 7><<<blocks, threads, smem_size, stream>>>(
|
||||
ggml_cuda_kernel_launch(ssm_scan_f32<threads, 16, 7>, launch_params,
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
|
||||
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
|
||||
break;
|
||||
case 8:
|
||||
ssm_scan_f32<threads, 16, 8><<<blocks, threads, smem_size, stream>>>(
|
||||
ggml_cuda_kernel_launch(ssm_scan_f32<threads, 16, 8>, launch_params,
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
|
||||
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
|
||||
break;
|
||||
default:
|
||||
ssm_scan_f32<threads, 16, 0><<<blocks, threads, smem_size, stream>>>(
|
||||
ggml_cuda_kernel_launch(ssm_scan_f32<threads, 16, 0>, launch_params,
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
|
||||
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
|
||||
|
||||
@@ -7,10 +7,12 @@ void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int
|
||||
const dim3 block_nums(nrows, 1, 1);
|
||||
if ((nrows / nsm) < 2) {
|
||||
const dim3 block_dims(512, 1, 1);
|
||||
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
|
||||
ggml_cuda_kernel_launch(reduce_rows_f32</*norm=*/false>, launch_params, x, dst, ncols);
|
||||
} else {
|
||||
const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
|
||||
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
|
||||
ggml_cuda_kernel_launch(reduce_rows_f32</*norm=*/false>, launch_params, x, dst, ncols);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -34,10 +36,12 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
if ((nrows / nsm) < 2) {
|
||||
// Increase num threads to 512 for small nrows to better hide the latency
|
||||
const dim3 block_dims(512, 1, 1);
|
||||
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
|
||||
ggml_cuda_kernel_launch(reduce_rows_f32</*norm=*/false>, launch_params, src0_d, dst_d, ncols);
|
||||
} else {
|
||||
// Enough active SMs to hide latency, use smaller blocks to allow better scheduling
|
||||
const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
|
||||
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
|
||||
ggml_cuda_kernel_launch(reduce_rows_f32</*norm=*/false>, launch_params, src0_d, dst_d, ncols);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -105,6 +105,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
|
||||
wt[i] = -INFINITY;
|
||||
}
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
#pragma unroll
|
||||
for (int i = 0; i < n_experts; i += WARP_SIZE) {
|
||||
const int expert = i + threadIdx.x;
|
||||
@@ -161,6 +162,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
|
||||
output_weights[i] = 0.f;
|
||||
}
|
||||
|
||||
ggml_cuda_pdl_lc();
|
||||
for (int k = 0; k < n_expert_used; k++) {
|
||||
float max_val = wt[0];
|
||||
int max_expert = threadIdx.x;
|
||||
@@ -271,51 +273,52 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
|
||||
dim3 grid_dims((n_rows + rows_per_block - 1) / rows_per_block, 1, 1);
|
||||
dim3 block_dims(WARP_SIZE, rows_per_block, 1);
|
||||
cudaStream_t stream = ctx.stream();
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(grid_dims, block_dims, 0, stream);
|
||||
|
||||
switch (n_expert) {
|
||||
case 1:
|
||||
topk_moe_cuda<1, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
|
||||
clamp_val, scale_val, config);
|
||||
ggml_cuda_kernel_launch(topk_moe_cuda<1, has_bias>, launch_params,
|
||||
logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
|
||||
break;
|
||||
case 2:
|
||||
topk_moe_cuda<2, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
|
||||
clamp_val, scale_val, config);
|
||||
ggml_cuda_kernel_launch(topk_moe_cuda<2, has_bias>, launch_params,
|
||||
logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
|
||||
break;
|
||||
case 4:
|
||||
topk_moe_cuda<4, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
|
||||
clamp_val, scale_val, config);
|
||||
ggml_cuda_kernel_launch(topk_moe_cuda<4, has_bias>, launch_params,
|
||||
logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
|
||||
break;
|
||||
case 8:
|
||||
topk_moe_cuda<8, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
|
||||
clamp_val, scale_val, config);
|
||||
ggml_cuda_kernel_launch(topk_moe_cuda<8, has_bias>, launch_params,
|
||||
logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
|
||||
break;
|
||||
case 16:
|
||||
topk_moe_cuda<16, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
|
||||
clamp_val, scale_val, config);
|
||||
ggml_cuda_kernel_launch(topk_moe_cuda<16, has_bias>, launch_params,
|
||||
logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
|
||||
break;
|
||||
case 32:
|
||||
topk_moe_cuda<32, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
|
||||
clamp_val, scale_val, config);
|
||||
ggml_cuda_kernel_launch(topk_moe_cuda<32, has_bias>, launch_params,
|
||||
logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
|
||||
break;
|
||||
case 64:
|
||||
topk_moe_cuda<64, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
|
||||
clamp_val, scale_val, config);
|
||||
ggml_cuda_kernel_launch(topk_moe_cuda<64, has_bias>, launch_params,
|
||||
logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
|
||||
break;
|
||||
case 128:
|
||||
topk_moe_cuda<128, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
|
||||
clamp_val, scale_val, config);
|
||||
ggml_cuda_kernel_launch(topk_moe_cuda<128, has_bias>, launch_params,
|
||||
logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
|
||||
break;
|
||||
case 256:
|
||||
topk_moe_cuda<256, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
|
||||
clamp_val, scale_val, config);
|
||||
ggml_cuda_kernel_launch(topk_moe_cuda<256, has_bias>, launch_params,
|
||||
logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
|
||||
break;
|
||||
case 512:
|
||||
topk_moe_cuda<512, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
|
||||
clamp_val, scale_val, config);
|
||||
ggml_cuda_kernel_launch(topk_moe_cuda<512, has_bias>, launch_params,
|
||||
logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
|
||||
break;
|
||||
case 576:
|
||||
topk_moe_cuda<576, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
|
||||
clamp_val, scale_val, config);
|
||||
ggml_cuda_kernel_launch(topk_moe_cuda<576, has_bias>, launch_params,
|
||||
logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "fatal error");
|
||||
|
||||
@@ -116,19 +116,22 @@ static __device__ __forceinline__ float op_trunc(float x) {
|
||||
|
||||
template <float (*op)(float), typename T>
|
||||
static __global__ void unary_op_kernel(const T * x, T * dst, const int k) {
|
||||
ggml_cuda_pdl_lc();
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
dst[i] = (T)op((float)x[i]);
|
||||
}
|
||||
|
||||
template <float (*op)(float), typename T>
|
||||
static void unary_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
|
||||
unary_op_kernel<op><<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream);
|
||||
ggml_cuda_kernel_launch(unary_op_kernel<op, T>, launch_params, x, dst, k);
|
||||
}
|
||||
|
||||
template <float (*op)(float)>
|
||||
@@ -258,6 +261,7 @@ void ggml_cuda_op_softplus(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
template <float (*op)(float), typename T>
|
||||
static __global__ void unary_gated_op_kernel(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o0, const int64_t o1) {
|
||||
ggml_cuda_pdl_lc();
|
||||
const int64_t i = int64_t(blockDim.x)*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
@@ -268,13 +272,15 @@ static __global__ void unary_gated_op_kernel(const T * x, const T * g, T * dst,
|
||||
const int64_t j0 = (i / n) * o0 + (i % n);
|
||||
const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
dst[i] = (T)(op((float)x[j0]) * (float)g[j1]);
|
||||
}
|
||||
|
||||
template <float (*op)(float), typename T>
|
||||
static void unary_gated_cuda(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o0, const int64_t o1, cudaStream_t stream) {
|
||||
const int64_t num_blocks = (k + CUDA_GLU_BLOCK_SIZE - 1) / CUDA_GLU_BLOCK_SIZE;
|
||||
unary_gated_op_kernel<op><<<num_blocks, CUDA_GLU_BLOCK_SIZE, 0, stream>>>(x, g, dst, k, n, o0, o1);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_GLU_BLOCK_SIZE, 0, stream);
|
||||
ggml_cuda_kernel_launch(unary_gated_op_kernel<op, T>, launch_params, x, g, dst, k, n, o0, o1);
|
||||
}
|
||||
|
||||
template <float (*op)(float)>
|
||||
|
||||
@@ -2661,7 +2661,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
|
||||
|
||||
int mode = op_params[2];
|
||||
|
||||
if ((mode & GGML_ROPE_TYPE_MROPE) || (mode & GGML_ROPE_TYPE_VISION)) {
|
||||
if (mode == GGML_ROPE_TYPE_VISION) {
|
||||
return false;
|
||||
}
|
||||
if (mode & 1) {
|
||||
@@ -2735,9 +2735,10 @@ static bool ggml_hexagon_supported_ssm_conv(const struct ggml_hexagon_session *
|
||||
if (dst->ne[0] != d_inner || dst->ne[1] != n_t || dst->ne[2] != n_s) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// TODO: add support for non-contiguous tensors
|
||||
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
|
||||
if (src0->nb[0] != sizeof(float) || src1->nb[0] != sizeof(float) || dst->nb[0] != sizeof(float)) {
|
||||
return false;
|
||||
}
|
||||
if (src0->nb[1] != src0->ne[0] * sizeof(float) || src1->nb[1] != src1->ne[0] * sizeof(float)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user